In [1]:
import html
import json
import os
import random
import time

import pandas as pd
from dotenv import load_dotenv
from serpapi.google_scholar_search import GoogleScholarSearch
from tqdm import tqdm
# Read serpapi.env into the process environment, then pick up the API key
# from there (API_KEY is None when the variable is not set).
_ = load_dotenv(dotenv_path="serpapi.env")
API_KEY = os.getenv("API_KEY")

In [2]:
soced_issns = ["1939-8573", "0038-0407"]

# Load the ERIC API results for query "Sociology of Education"
with open("./eric/soced1.csv", "r") as infile:
    sE = pd.read_csv(infile)

# First drop rows without an ISSN, then keep only rows whose ISSN field
# contains one of the ISSNs listed in soced_issns (matched as a regex
# alternation).
issn_pattern = "|".join(soced_issns)
sE = sE.loc[sE["issn"].notnull()]
issn_matches = sE["issn"].str.contains(issn_pattern, regex=True)
sE = sE.loc[issn_matches]

print(f"Loaded {len(sE)} ERIC records")

Loaded 605 ERIC records


In [4]:
# Query Google Scholar through SerpApi once per ERIC record and save the raw
# response as data/serpapi/<record id>/record.json.
# iterrows() is a generator with no length, so tqdm needs an explicit total
# to draw a real progress bar instead of a bare iteration counter.
for _label, row in tqdm(sE.iterrows(), total=len(sE)):
    # create a directory for each record; makedirs also creates missing
    # parent directories and does not fail if the directory already exists
    record_dir = "data/serpapi/" + row.id
    os.makedirs(record_dir, exist_ok=True)
    # Fields missing from the CSV come back from pandas as NaN (a float),
    # which would make plain string concatenation raise TypeError. Join only
    # the fields that are actual strings; when all three are present this
    # yields exactly "title description author".
    query = " ".join(
        field
        for field in (row.title, row.description, row.author)
        if isinstance(field, str)
    )
    search = GoogleScholarSearch({"q": query, "api_key": API_KEY})
    results = search.get_dict()
    with open(record_dir + "/record.json", "w") as outfile:
        json.dump(results, outfile)
    # random sleep pattern on each iteration
    time.sleep(random.randint(1, 3))

0it [00:00, ?it/s]


In [5]:
# Collect the positional index (row order in sE) of every record whose saved
# response reports an organic-results state of "Fully empty".
error_index = []
for position, (label, record) in enumerate(sE.iterrows()):
    record_path = "data/serpapi/" + record.id + "/record.json"
    with open(record_path, "r") as infile:
        payload = json.load(infile)
    state = payload["search_information"]["organic_results_state"]
    if state == "Fully empty":
        error_index.append(position)
print(f"Found {len(error_index)} errors")

Found 51 errors


In [18]:
# Collect the positional index (row order in sE) of every record whose saved
# response has no "organic_results" key.
error_index = []
for position, (label, record) in enumerate(sE.iterrows()):
    record_path = "data/serpapi/" + record.id + "/record.json"
    with open(record_path, "r") as infile:
        payload = json.load(infile)
    has_results = "organic_results" in payload.keys()
    if not has_results:
        error_index.append(position)
print(f"Found {len(error_index)} errors")

Found 51 errors


In [None]:
# Re-run the search for the records flagged in error_index, this time with a
# query built from the title and author only, and overwrite each record's
# saved response.
for _label, row in sE.iloc[error_index].iterrows():
    # A field missing from the CSV comes back from pandas as NaN (a float),
    # which would make string concatenation raise TypeError; join only the
    # fields that are actual strings. With both present this is exactly
    # "title, author".
    query = ", ".join(
        field for field in (row.title, row.author) if isinstance(field, str)
    )
    search = GoogleScholarSearch({"q": query, "api_key": API_KEY})
    results = search.get_dict()
    with open("data/serpapi/" + row.id + "/record.json", "w") as outfile:
        json.dump(results, outfile)
    # random sleep pattern on each iteration
    time.sleep(random.randint(1, 3))

In [20]:
# Re-scan every saved response and record the positional index of each one
# that still has no "organic_results" key.
error_index = []
for position, (label, record) in enumerate(sE.iterrows()):
    with open("data/serpapi/" + record.id + "/record.json", "r") as infile:
        payload = json.load(infile)
    if "organic_results" in payload.keys():
        continue
    error_index.append(position)
print(f"Found {len(error_index)} errors")

Found 10 errors


In [25]:
# Display the rows of sE at the positions collected in error_index.
sE.iloc[error_index, :]

Unnamed: 0,id,title,description,author,subject,publicationtype,publicationdateyear,issn,isbn,publisher,peerreviewed,language
12,EJ1172898,Providing a &apos;&apos;Leg Up&apos;&apos;: Pa...,Although higher education scholars are increas...,"Hamilton\, Laura,Roksa\, Josipa,Nielsen\, Kelly","Parent Participation,Social Differences,Studen...","Journal Articles,Reports - Research",2018,ISSN-0038-0407,,"SAGE Publications. 2455 Teller Road, Thousand ...",T,English
195,EJ1202300,&apos;&apos;I Can Turn It on When I Need To&ap...,Drawing on interviews with 38 black and Latino...,"Johnson\, Anthony M.","Hispanic American Students,African American St...","Journal Articles,Reports - Research",2019,ISSN-0038-0407,,"SAGE Publications. 2455 Teller Road, Thousand ...",T,English
397,EJ889480,School Strategies and the &quot;College-Linkin...,This study reconsidered school effects on coll...,"Hill\, Lori Diane","School Effectiveness,Clearinghouses,Enrollment...","Journal Articles,Reports - Research",2008,ISSN-0038-0407,,"SAGE Publications. 2455 Teller Road, Thousand ...",T,English
409,EJ889587,Are &quot;Failing&quot; Schools Really Failing...,"To many, it seems obvious which schools are fa...","Downey\, Douglas B.,von Hippel\, Paul T.,Hughe...","Low Achievement,Disadvantaged,Academic Achieve...","Journal Articles,Reports - Evaluative",2008,ISSN-0038-0407,,"SAGE Publications. 2455 Teller Road, Thousand ...",T,English
462,EJ697169,From &quot;Middle Class&quot; to &quot;Trailer...,This article explores how teachers perceived a...,"Morris\, Edward W.","Middle Class,White Students,Minority Group Chi...","Journal Articles,Reports - Research",2005,ISSN-0038-0407,,"American Sociological Association, 1307 New Yo...",T,English
536,EJ679898,Are All &quot;Adolescent Econometricians&quot;...,Examines whether economic conditions similarly...,"Beattie\, Irenee R.","Adolescents,College Attendance,Educational Sta...","Journal Articles,Reports - Research",2002,ISSN-0038-0407,,,T,English
566,EJ679928,Reassessing the &quot;Burden of 'Acting White'...,Discusses the idea that black students underpe...,"Horvat\, Erin McNamara,Lewis\, Kristine S.","Academic Achievement,Black Achievement,Black C...","Journal Articles,Reports - Descriptive",2003,ISSN-0038-0407,,,T,English
927,EJ1351618,Diffusing &quot;Destandardization&quot; Reform...,The education sector in low- and middle-income...,"Hossain\, Mobarak","Foreign Countries,Developing Nations,Neolibera...","Journal Articles,Reports - Research",2022,ISSN-0038-0407,,"SAGE Publications. 2455 Teller Road, Thousand ...",T,English
933,EJ1373195,The Relationship between Ninth Graders&apos; P...,"Using data on ninth graders, math teachers, an...","Shifrer\, Dara,Phillippo\, Kate,Tilbrook\, Ned...","Grade 9,Student Teachers,Mathematics Teachers,...","Journal Articles,Reports - Research",2023,ISSN-0038-0407,,"SAGE Publications. 2455 Teller Road, Thousand ...",T,English
934,EJ1373204,Equalization or Reproduction? &quot;Some Colle...,What are the economic consequences of college ...,"Payne\, Sarah S. C.","Socialization,Higher Education,Role of Educati...","Journal Articles,Reports - Research",2023,ISSN-0038-0407,,"SAGE Publications. 2455 Teller Road, Thousand ...",T,English


In [None]:
# Re-run the search for the records still flagged in error_index, decoding
# HTML character references (e.g. &quot;, &apos;) in the title/author query
# before sending it, and overwrite each record's saved response.
for _label, row in sE.iloc[error_index].iterrows():
    # A field missing from the CSV comes back from pandas as NaN (a float),
    # which would make string concatenation raise TypeError; join only the
    # fields that are actual strings. With both present this is exactly
    # "title, author".
    raw_query = ", ".join(
        field for field in (row.title, row.author) if isinstance(field, str)
    )
    query = html.unescape(raw_query)
    search = GoogleScholarSearch({"q": query, "api_key": API_KEY})
    results = search.get_dict()
    with open("data/serpapi/" + row.id + "/record.json", "w") as outfile:
        json.dump(results, outfile)
    # random sleep pattern on each iteration
    time.sleep(random.randint(1, 3))

In [27]:
# Final pass over the saved responses: record the positional index of every
# record whose response lacks an "organic_results" key.
error_index = []
for position, record in enumerate(sE.itertuples(index=False)):
    record_path = "data/serpapi/" + record.id + "/record.json"
    with open(record_path, "r") as infile:
        payload = json.load(infile)
    if "organic_results" not in payload.keys():
        error_index += [position]
print(f"Found {len(error_index)} errors")

Found 0 errors
