In [1]:
import requests
import json
import pandas as pd
from pprint import pprint
import pickle

base_url = "https://api.openalex.org/"
email = "katz.562@osu.edu"


def pickle_dump(data, filename):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)


author_list = []
for j in range(10):
    try:
        author_list.extend(pickle.load(
            open(f"data/author_list_{j}.pkl", 'rb')))
        print(f"Author list {j} loaded from file.")
    except FileNotFoundError:
        sample = 10000
        per_page = 200
        while j < 10:
            new_author_list = []
            print(f"{j}th sample")
            seed = 3142+j
            i = 1
            while i <= sample/per_page:
                print(f"Page {i}")
                response = requests.get(base_url + "authors",
                                        params={'sample': sample, 'seed': seed, 'per-page': per_page, 'page': i})
                data = response.json()
                if len(data['results']) == 0:
                    break
                new_author_list.extend(data['results'])
                i += 1
            filename = f"data/author_list_{j}.json"
            print(f'Saving {len(new_author_list)} authors to {filename}')
            pickle_dump(new_author_list, f"data/author_list_{j}.pkl")
            j += 1

dropcolumns = ['orcid', 'display_name_alternatives', 'relevance_score', 'summary_stats', 'ids',
               'affiliations', 'last_known_institutions', 'topics', 'topic_share', 'updated_date', 'created_date']
df = pd.DataFrame(author_list).drop(columns=dropcolumns)

Author list 0 loaded from file.
Author list 1 loaded from file.
Author list 2 loaded from file.
Author list 3 loaded from file.
Author list 4 loaded from file.
Author list 5 loaded from file.
Author list 6 loaded from file.
Author list 7 loaded from file.
Author list 8 loaded from file.
Author list 9 loaded from file.


In [None]:
# def list_columns(df):
#     return [col for col in df.columns if isinstance(df[col].iloc[0], list)]


# def dict_columns(df):
#     return [col for col in df.columns if isinstance(df[col].iloc[0], dict)]


# lcols = list_columns(df)
# dcols = dict_columns(df)

# serialized = {}
# for col in dcols:
#     serialized[col] = pd.json_normalize(df[col])
#     serialized[col]['id'] = df['id']

# exploded = {}
# for col in lcols:
#     exploded[col] = df[['id', col]].explode(col)

In [2]:
df['x_concepts'] = df['x_concepts'].apply(
    lambda x: [concept['display_name'] for concept in x])
x_concepts_df = df[['id', 'x_concepts']].explode('x_concepts')
x_concepts_df

Unnamed: 0,id,x_concepts
0,https://openalex.org/A5092356489,Humanities
0,https://openalex.org/A5092356489,Gynecology
0,https://openalex.org/A5092356489,Medicine
0,https://openalex.org/A5092356489,Forestry
0,https://openalex.org/A5092356489,Physics
...,...,...
99999,https://openalex.org/A5070158048,Agronomy
99999,https://openalex.org/A5070158048,Medicine
99999,https://openalex.org/A5070158048,Chemistry
99999,https://openalex.org/A5070158048,Genetics


In [35]:
counts_by_year_df = df[['id', 'counts_by_year']].explode('counts_by_year')
counts_by_year_df = counts_by_year_df[counts_by_year_df['counts_by_year'].notna(
)]
counts_by_year_df['year'] = counts_by_year_df['counts_by_year'].apply(
    lambda x: x['year'])
counts_by_year_df['works_count'] = counts_by_year_df['counts_by_year'].apply(
    lambda x: x['works_count'])
counts_by_year_df['cited_by_count'] = counts_by_year_df['counts_by_year'].apply(
    lambda x: x['cited_by_count'])

counts_by_year_df.drop(columns='counts_by_year', inplace=True)
counts_by_year_df

Unnamed: 0,id,year,works_count,cited_by_count
0,https://openalex.org/A5092356489,2024,0,1
0,https://openalex.org/A5092356489,2023,2,0
1,https://openalex.org/A5087968922,2022,0,2
1,https://openalex.org/A5087968922,2018,0,1
2,https://openalex.org/A5114128731,2024,2,0
...,...,...,...,...
99999,https://openalex.org/A5070158048,2021,0,3
99999,https://openalex.org/A5070158048,2020,1,2
99999,https://openalex.org/A5070158048,2018,3,3
99999,https://openalex.org/A5070158048,2016,1,1


In [26]:
import os
import logging
from openai import OpenAI
from pprint import pprint
# Initialize the OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_gender_predictions(names_list):
    """
    Takes a list of names and returns a list of gender predictions.

    :param names_list: List of names (strings) to predict genders for.
    :return: List of predicted genders corresponding to each name.
    """
    # Join the names into a single string, separated by commas
    input_string = ", ".join(names_list)

    # Create a single response for the list of names
    response = client.responses.create(
        model='gpt-3.5-turbo',

        instructions=(
            'You are a helpful assistant that determines the most likely gender of a list of persons with specified names. '
            'Respond with a comma-separated list of genders corresponding to each name: "Male", "Female", or "Unknown".'
        ),
        input=f"Names: {input_string}."
    )

    # Log the response
    logging.info(f"Response: {response}")
    # Extract the response text and split it into a list of genders
    genders = response.output_text.strip().split(", ")

    return genders


# # Example usage
names = ["Alice", "Bob", "Charlie"]
genders = get_gender_predictions(names)

In [27]:
import re
import glob
df['strip_display_name'] = df['display_name'].str.replace(
    '\.\s?', ' ', regex=True)

# Filter the DataFrame
filtered_df = df[~df['strip_display_name'].str.match(
    r'^\b\w{1,2}\b ')]

name_to_gender = json.load(open('data/gender_predictions_10000.json', 'r'))

unique_names = filtered_df['strip_display_name'].unique()


chunk_size = 500
for i in range(len(name_to_gender), len(unique_names), chunk_size):
    chunk = unique_names[i:i + chunk_size]
    print(
        f"Processing chunk {i // chunk_size + 1} of {len(unique_names) // chunk_size + 1}")
    predicted_genders = get_gender_predictions(chunk)
    name_to_gender.update(dict(zip(chunk, predicted_genders)))
    if (i // chunk_size) % 10 == 0:
        print(f"Saving progress after processing {i} names.")
        with open(f'data/gender_predictions_{i}.json', 'w') as f:
            json.dump(name_to_gender, f)

Processing chunk 21 of 143
Saving progress after processing 10472 names.
Processing chunk 22 of 143
Processing chunk 23 of 143
Processing chunk 24 of 143
Processing chunk 25 of 143
Processing chunk 26 of 143
Processing chunk 27 of 143
Processing chunk 28 of 143
Processing chunk 29 of 143
Processing chunk 30 of 143
Processing chunk 31 of 143
Saving progress after processing 15472 names.
Processing chunk 32 of 143
Processing chunk 33 of 143
Processing chunk 34 of 143
Processing chunk 35 of 143
Processing chunk 36 of 143
Processing chunk 37 of 143
Processing chunk 38 of 143
Processing chunk 39 of 143
Processing chunk 40 of 143
Processing chunk 41 of 143
Saving progress after processing 20472 names.
Processing chunk 42 of 143
Processing chunk 43 of 143
Processing chunk 44 of 143
Processing chunk 45 of 143
Processing chunk 46 of 143
Processing chunk 47 of 143
Processing chunk 48 of 143
Processing chunk 49 of 143
Processing chunk 50 of 143
Processing chunk 51 of 143
Saving progress after pro

In [28]:
json.dump(name_to_gender, open('data/gender_predictions.json', 'w'))

In [29]:
filtered_df['predicted_gender'] = filtered_df['strip_display_name'].map(
    name_to_gender)
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['predicted_gender'] = filtered_df['strip_display_name'].map(


Unnamed: 0,id,display_name,works_count,cited_by_count,x_concepts,counts_by_year,works_api_url,strip_display_name,predicted_gender
0,https://openalex.org/A5092356489,Resty Fastabikul Khaerat,2,1,"[Humanities, Gynecology, Medicine, Forestry, P...","[{'year': 2024, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,Resty Fastabikul Khaerat,Male
2,https://openalex.org/A5114128731,Xinshun Li,2,0,"[Agronomy, Pyrolysis, Chemical engineering, Bi...","[{'year': 2024, 'works_count': 2, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,Xinshun Li,Male
8,https://openalex.org/A5004616759,堀越さな恵,1,0,[Computer science],"[{'year': 2013, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,堀越さな恵,Female
9,https://openalex.org/A5068802092,Debbie Northin,1,3,"[Physical therapy, Neurosurgery, Psychology, S...",[],https://api.openalex.org/works?filter=author.i...,Debbie Northin,Female
10,https://openalex.org/A5095940583,Ambrish,1,4,"[Structural geology, Geochemistry, Ecology, En...","[{'year': 2025, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,Ambrish,Male
...,...,...,...,...,...,...,...,...,...
99994,https://openalex.org/A5104421392,Wes Romanello,1,0,"[Psychology, Genetics, Biochemistry, Medicine,...","[{'year': 2024, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,Wes Romanello,Male
99995,https://openalex.org/A5084612149,Felix Valdés García,26,12,"[Art, Philosophy, Humanities, Political scienc...","[{'year': 2023, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,Felix Valdés García,Unknown
99996,https://openalex.org/A5052322343,أحمد محمد أحمد سلیمان,1,0,[Computer science],"[{'year': 2017, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,أحمد محمد أحمد سلیمان,Female.
99997,https://openalex.org/A5018515218,Esther Eshel,78,278,"[History, Philosophy, Art, Geography, Archaeol...","[{'year': 2025, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,Esther Eshel,


In [31]:
gender_df = filtered_df[filtered_df['predicted_gender']
                        != 'Unknown'][['id', 'predicted_gender']]
gender_df

Unnamed: 0,id,predicted_gender
0,https://openalex.org/A5092356489,Male
2,https://openalex.org/A5114128731,Male
8,https://openalex.org/A5004616759,Female
9,https://openalex.org/A5068802092,Female
10,https://openalex.org/A5095940583,Male
...,...,...
99993,https://openalex.org/A5025150732,Male
99994,https://openalex.org/A5104421392,Male
99996,https://openalex.org/A5052322343,Female.
99997,https://openalex.org/A5018515218,


In [41]:
cited_by_count_df = df[['id', 'cited_by_count']]
works_api_url_df = df[['id', 'works_api_url']]
display_name_df = df[['id', 'display_name']]

In [43]:
from re import X

dfs = [counts_by_year_df, x_concepts_df, gender_df,
       cited_by_count_df, works_api_url_df, display_name_df]
for _ in dfs:
    if not os.path.exists(f"data/{_.columns[1]}.jsonl"):
        _.to_json(f"data/{_.columns[1]}.jsonl", orient='records', lines=True)

In [None]:
import json


def convert_json_to_jsonl(input_file, output_file):
    # Open the JSON file and load the data
    with open(input_file, 'r') as f:
        data = json.load(f)

    # Ensure the data is a list of dictionaries
    if isinstance(data, dict):
        data = [data]

    # Open the output file in write mode
    with open(output_file, 'w') as f_out:
        for entry in data:
            # Convert each dictionary to a JSON string and write it to the file
            json_line = json.dumps(entry)
            f_out.write(json_line + '\n')


# Usage
input_json_file = 'data.json'
output_jsonl_file = 'data.jsonl'
convert_json_to_jsonl(input_json_file, output_jsonl_file)