In [49]:
from serpapi.google_scholar_search import GoogleScholarSearch
from numpy.random import default_rng
from dotenv import load_dotenv
from time import sleep
from tqdm import tqdm
from collections import defaultdict
import html
import json
import os
# Read the variables defined in serpapi.env into the process environment, then
# pick up the key that is later handed to GoogleScholarSearch.
_ = load_dotenv("serpapi.env")
API_KEY = os.getenv("API_KEY")

In [3]:
# Load the Sociology of Education export; the article records sit under
# response -> docs in the JSON payload.
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, which is not UTF-8 on every system.
# NOTE(review): assumes the export was saved as UTF-8 - confirm.
with open("../data/eric/se.json", "r", encoding="utf-8") as infile:
    se = json.load(infile)
records = se["response"]["docs"]
print(f"Soc of Ed articles: {len(records)}")

Soc of Ed articles: 1035


In [4]:
# Collect every record's id and report how many distinct values there are,
# so duplicates show up as a count lower than the number of records.
ids = [record["id"] for record in records]
unique_id_count = len(set(ids))
print(f"Unique IDs: {unique_id_count}")

Unique IDs: 1035


In [5]:
# Directory that receives one saved search response per article.
output_dir = "../data/eric/serpapi"
# makedirs with exist_ok=True replaces the exists()/mkdir() pair: it also
# creates missing parent directories and has no check-then-create race.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Peek at the first record to see which fields an entry carries.
print(records[0])

{'id': 'EJ1158833', 'title': 'Gender Differences in Context: The Impact of Track Position on Study Involvement in Flemish Secondary Education', 'author': ['Van Houtte, Mieke'], 'description': "This study examines whether the influence of track position on study involvement is gendered and whether gender differences in study involvement according to track position are associated with school misconduct and rather poor future perspectives. Three-level analyses (HLM 6) of data gathered in 2004-2005 from 11,872 third- and fifth-grade students in 146 tracks in a representative sample of 85 secondary schools in Flanders (Belgium) confirmed the impact of tracking on boys' as well as girls' study involvement. Boys are, generally, less involved in studying than girls, and boys are more affected by track position than girls are, enlarging the gender gap in the lower tracks. In these tracks, boys are more prone to misconduct and rather poor future perspectives. Finally, girls in arts tracks are, o

In [None]:
# Query Google Scholar (through SerpApi) once per record and save each raw
# response as <output_dir>/<id>.json.
rng = default_rng()  # one generator for the whole loop, not a new one per iteration
for rec in tqdm(records):
    rec_id = html.unescape(rec["id"])
    out_path = f"{output_dir}/{rec_id}.json"
    # Responses already on disk are kept, so re-running the cell resumes
    # where it stopped instead of re-spending API calls. Delete a file to
    # force that record to be fetched again.
    if os.path.exists(out_path):
        continue
    title = html.unescape(rec["title"])
    # Tolerate records that carry no author field.
    authors = html.unescape(" ".join(rec.get("author", [])))
    query = " ".join([title, authors, "Sociology of Education"])
    search = GoogleScholarSearch({"q": query, "api_key": API_KEY})
    results = search.get_dict()
    # Write under a temporary name and rename afterwards, so an interrupted
    # run never leaves a truncated file that the skip above would trust.
    tmp_path = f"{out_path}.tmp"
    with open(tmp_path, "w") as outfile:
        json.dump(results, outfile)
    os.replace(tmp_path, out_path)
    # random sleep pattern on each iteration
    sleep(1 + rng.uniform(0, 1))

In [34]:
def _top_hit(result):
    """Return the first organic hit of a saved response, or {} when there is none.

    Covers both a missing "organic_results" key and an empty list, so callers
    never index into nothing.
    """
    hits = result.get("organic_results") or []
    return hits[0] if hits else {}


# Citation count and title of the first hit for every article, gathered in a
# single pass over the saved responses.
citeD = {}
titleD = {}
for article_id in ids:
    with open(f"{output_dir}/{article_id}.json", "r") as infile:
        hit = _top_hit(json.load(infile))
    # A hit without a cited-by link (or no hit at all) counts as 0 citations.
    citeD[article_id] = hit.get("inline_links", {}).get("cited_by", {}).get("total", 0)
    titleD[article_id] = hit.get("title", "")
# order the ids by citation count
ids = sorted(ids, key=lambda x: citeD[x], reverse=True)
# Titles come from the first pass, so the files are not read a second time.
for article_id in ids[:100]:
    print(citeD[article_id], " - ", article_id, " - ", titleD[article_id])

3820  -  EJ353123  -  Social class differences in family-school relationships: The importance of cultural capital
3482  -  EJ560204  -  Effects of college transition and perceptions of the campus racial climate on Latino college students' sense of belonging
2810  -  EJ590423  -  Moments of social inclusion and exclusion race, class, and cultural capital in family-school relationships
2265  -  EJ533315  -  Effects of parental involvement on eighth-grade achievement
1939  -  EJ406199  -  The attitude-achievement paradox among Black adolescents
1905  -  EJ551245  -  From first grade forward: Early foundations of high school dropout
1771  -  EJ679899  -  Cultural capital, gender, and school success: The role of habitus
1648  -  EJ455101  -  World expansion of mass education, 1870-1980
1638  -  EJ502246  -  Social capital and the reproduction of inequality: Information networks among Mexican-origin high school students
1625  -  EJ473710  -  Maximally maintained inequality: Expansion, reform

ED296041 ("Dropping out of high school and drug involvement") is a duplicate record, so it is left out of the citation counts below.

In [40]:
# Number of citation entries that remain once the duplicate ED296041 is left out.
print(sum(1 for key in citeD if key != "ED296041"))

1034


In [41]:
# Save the citation counts for every article except the duplicate ED296041.
citations_to_save = dict(citeD)
citations_to_save.pop("ED296041", None)
with open(f"{output_dir}/SE_citations.json", "w") as outfile:
    json.dump(citations_to_save, outfile)

In [47]:
# List the first ten ids: citation count, id, and the title of the first
# organic hit stored in that article's saved response.
top_ten = ids[:10]
for article_id in top_ten:
    response_path = f"{output_dir}/{article_id}.json"
    with open(response_path, "r") as infile:
        first_hit = json.load(infile)["organic_results"][0]
    print(citeD[article_id], " - ", article_id, " - ", first_hit["title"])

3820  -  EJ353123  -  Social class differences in family-school relationships: The importance of cultural capital
3482  -  EJ560204  -  Effects of college transition and perceptions of the campus racial climate on Latino college students' sense of belonging
2810  -  EJ590423  -  Moments of social inclusion and exclusion race, class, and cultural capital in family-school relationships
2265  -  EJ533315  -  Effects of parental involvement on eighth-grade achievement
1939  -  EJ406199  -  The attitude-achievement paradox among Black adolescents
1905  -  EJ551245  -  From first grade forward: Early foundations of high school dropout
1771  -  EJ679899  -  Cultural capital, gender, and school success: The role of habitus
1648  -  EJ455101  -  World expansion of mass education, 1870-1980
1638  -  EJ502246  -  Social capital and the reproduction of inequality: Information networks among Mexican-origin high school students
1625  -  EJ473710  -  Maximally maintained inequality: Expansion, reform

In [51]:
# Pair each of the first ten ids with its full record as "input" and an
# empty "output" placeholder. Entries keep the order of `records`.
top_ids = set(ids[:10])
exampleD = defaultdict(
    dict,
    {
        rec["id"]: {"input": rec, "output": ""}
        for rec in records
        if rec["id"] in top_ids
    },
)

In [52]:
# Display the collected examples for inspection.
exampleD

defaultdict(dict,
            {'EJ353123': {'input': {'id': 'EJ353123',
               'title': 'Social Class Differences in Family-School Relationships: The Importance of Cultural Capital.',
               'author': ['Lareau, Annette'],
               'description': "Summarizes a qualitative study of family/school relationships in White working class and middle class areas. Concludes that schools have standardized views of the proper role of parents in schooling. Suggests that the concept of cultural capital is useful to understand social class differences in children's school experiences. (Author/RKM)",
               'subject': ['Cultural Differences',
                'Educational Research',
                'Elementary Education',
                'Family (Sociological Unit)',
                'Family School Relationship',
                'Middle Class',
                'Parent School Relationship',
                'Social Attitudes',
                'Social Class',
                'S

In [53]:
# Every example gets the same twelve boolean labels, in this order.
LABELS = (
    "quantitative",
    "qualitative",
    "primary/secondary",
    "tertiary",
    "inequality",
    "nonstructural",
    "culture",
    "school",
    "state",
    "labor",
    "comparative",
    "methods",
)

# Hand-assigned labels: for each article, the labels that are True.
# Anything not listed is False.
manual_labels = {
    "EJ353123": {"qualitative", "primary/secondary", "culture", "school"},
    "EJ679899": {"qualitative", "primary/secondary", "culture", "school"},
    "EJ533315": {"quantitative", "primary/secondary", "culture", "school"},
    "EJ551245": {"quantitative", "primary/secondary", "school"},
    "EJ560204": {"quantitative", "tertiary", "inequality"},
    "EJ590423": {"qualitative", "primary/secondary", "inequality", "culture", "school"},
    "EJ502246": {"quantitative", "primary/secondary", "inequality"},
    "EJ473710": {"quantitative", "primary/secondary", "inequality", "culture", "school"},
    "EJ455101": {"quantitative", "primary/secondary", "state"},
    "EJ406199": {"quantitative", "primary/secondary", "inequality"},
}

for example_id, positives in manual_labels.items():
    exampleD[example_id]["output"] = {label: label in positives for label in LABELS}

In [56]:
# Serialize the labelled examples and write them to the prompts folder.
manual_examples_json = json.dumps(exampleD)
with open("../prompts/examples/manual.json", "w") as outfile:
    outfile.write(manual_examples_json)