In [1]:
import requests
import json
import pandas as pd
from pprint import pprint
import pickle

# Root URL of the OpenAlex REST API; endpoint names such as "authors" are appended to it.
base_url = "https://api.openalex.org/"
# Contact e-mail for API requests (presumably meant for OpenAlex's `mailto`
# query parameter -- confirm that it is actually sent with the requests).
email = "katz.562@osu.edu"


def pickle_dump(data, filename):
    """Serialize ``data`` with pickle and write it to ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.

    :param data: Any picklable Python object.
    :param filename: Path of the file to write.
    """
    with open(filename, mode='wb') as handle:
        pickle.Pickler(handle).dump(data)


# Build `author_list` from ten seeded random samples of OpenAlex authors.
# Each sample is cached in data/author_list_{j}.pkl; a sample is downloaded
# only when its own cache file is missing, and is then both saved and added
# to `author_list`, so a first run and a cached run yield the same list.
sample = 10000   # number of authors requested per seeded sample
per_page = 200   # page size used when paging through one sample

author_list = []
for j in range(10):
    filename = f"data/author_list_{j}.pkl"
    try:
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(filename, 'rb') as f:
            author_list.extend(pickle.load(f))
        print(f"Author list {j} loaded from file.")
    except FileNotFoundError:
        new_author_list = []
        print(f"{j}th sample")
        seed = 3142+j  # a distinct, reproducible seed per sample
        i = 1
        while i <= sample/per_page:
            print(f"Page {i}")
            response = requests.get(
                base_url + "authors",
                params={'sample': sample, 'seed': seed,
                        'per-page': per_page, 'page': i,
                        # identifies the client to OpenAlex ("polite pool")
                        'mailto': email},
                timeout=60)
            # Fail loudly on HTTP errors instead of hitting a KeyError on
            # 'results' when the body is an error payload.
            response.raise_for_status()
            data = response.json()
            if len(data['results']) == 0:
                break
            new_author_list.extend(data['results'])
            i += 1
        print(f'Saving {len(new_author_list)} authors to {filename}')
        pickle_dump(new_author_list, filename)
        # Keep the freshly downloaded sample in this run's result as well.
        author_list.extend(new_author_list)

# Columns removed from the author table before any further processing.
dropcolumns = [
    'orcid',
    'display_name_alternatives',
    'relevance_score',
    'summary_stats',
    'ids',
    'affiliations',
    'last_known_institutions',
    'topics',
    'topic_share',
    'updated_date',
    'created_date',
]
# One row per record in author_list, minus the columns listed above.
df = pd.DataFrame(author_list)
df = df.drop(columns=dropcolumns)

Author list 0 loaded from file.
Author list 1 loaded from file.
Author list 2 loaded from file.
Author list 3 loaded from file.
Author list 4 loaded from file.
Author list 5 loaded from file.
Author list 6 loaded from file.
Author list 7 loaded from file.
Author list 8 loaded from file.
Author list 9 loaded from file.


In [None]:
# def list_columns(df):
#     return [col for col in df.columns if isinstance(df[col].iloc[0], list)]


# def dict_columns(df):
#     return [col for col in df.columns if isinstance(df[col].iloc[0], dict)]


# lcols = list_columns(df)
# dcols = dict_columns(df)

# serialized = {}
# for col in dcols:
#     serialized[col] = pd.json_normalize(df[col])
#     serialized[col]['id'] = df['id']

# exploded = {}
# for col in lcols:
#     exploded[col] = df[['id', col]].explode(col)

In [2]:
# Replace each author's list of concept records with the list of their
# display names. The isinstance guard keeps this cell idempotent: after the
# first run the column already holds plain names, and indexing a string with
# 'display_name' would raise TypeError on a re-run.
df['x_concepts'] = df['x_concepts'].apply(
    lambda x: [concept['display_name'] if isinstance(concept, dict) else concept
               for concept in x])
# Long format: one row per (author id, concept name); the original row index
# is repeated for every concept of the same author.
x_concepts_df = df[['id', 'x_concepts']].explode('x_concepts')
x_concepts_df

Unnamed: 0,id,x_concepts
0,https://openalex.org/A5092356489,Humanities
0,https://openalex.org/A5092356489,Gynecology
0,https://openalex.org/A5092356489,Medicine
0,https://openalex.org/A5092356489,Forestry
0,https://openalex.org/A5092356489,Physics
...,...,...
99999,https://openalex.org/A5070158048,Agronomy
99999,https://openalex.org/A5070158048,Medicine
99999,https://openalex.org/A5070158048,Chemistry
99999,https://openalex.org/A5070158048,Genetics


In [3]:
# Long format: one row per (author id, yearly record). Authors whose
# counts_by_year list is empty explode to a missing value and are dropped.
counts_by_year_df = (
    df[['id', 'counts_by_year']]
    .explode('counts_by_year')
    .dropna(subset=['counts_by_year'])
)
# Pull the three fields out of each yearly record. `assign` builds a new
# frame, which avoids the SettingWithCopyWarning raised when columns are set
# on a boolean-filtered slice, and makes the in-place drop unnecessary.
counts_by_year_df = counts_by_year_df.assign(
    year=counts_by_year_df['counts_by_year'].map(lambda rec: rec['year']),
    works_count=counts_by_year_df['counts_by_year'].map(
        lambda rec: rec['works_count']),
    cited_by_count=counts_by_year_df['counts_by_year'].map(
        lambda rec: rec['cited_by_count']),
).drop(columns='counts_by_year')
counts_by_year_df

Unnamed: 0,id,year,works_count,cited_by_count
0,https://openalex.org/A5092356489,2024,0,1
0,https://openalex.org/A5092356489,2023,2,0
1,https://openalex.org/A5087968922,2022,0,2
1,https://openalex.org/A5087968922,2018,0,1
2,https://openalex.org/A5114128731,2024,2,0
...,...,...,...,...
99999,https://openalex.org/A5070158048,2021,0,3
99999,https://openalex.org/A5070158048,2020,1,2
99999,https://openalex.org/A5070158048,2018,3,3
99999,https://openalex.org/A5070158048,2016,1,1


In [None]:
import os
import logging
from openai import OpenAI

# Build the OpenAI client used by the cells below. The API key is read from
# the OPENAI_API_KEY environment variable; None is passed when it is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def get_gender_predictions(names_list):
    """
    Takes a list of names and returns a list of gender predictions.

    All names are sent to the model in a single request as one comma-separated
    string, and the model's comma-separated reply is split back into a list.
    Labels are matched to names by position only, so a name that itself
    contains a comma, or a reply with the wrong number of labels, breaks the
    alignment; a count mismatch is logged as a warning and the parsed labels
    are returned as they are, so callers should compare lengths.

    :param names_list: Iterable of names (strings) to predict genders for.
    :return: List of predicted genders, intended to correspond to each name;
        an empty list (and no API call) when no names are given.
    """
    # Materialize first: callers may pass a numpy array, whose truth value is
    # ambiguous and cannot be tested with `not`.
    names = list(names_list)
    if not names:
        return []

    # Join the names into a single string, separated by commas
    input_string = ", ".join(names)

    # Create a single response for the list of names
    response = client.responses.create(
        model='gpt-4',
        instructions=(
            'You are a helpful assistant that determines the most likely gender of a list of persons with specified names. '
            'Respond with a comma-separated list of genders corresponding to each name: "Male", "Female", or "Unknown".'
        ),
        input=f"Names: {input_string}."
    )

    # Log the response (lazy formatting: only rendered if INFO is enabled)
    logging.info("Response: %s", response)

    # Split on bare commas and trim each token, so replies formatted as
    # "Male,Female" or ending in "." or "," still parse into clean labels.
    tokens = (token.strip().rstrip(".").strip()
              for token in response.output_text.split(","))
    genders = [token for token in tokens if token]

    if len(genders) != len(names):
        logging.warning(
            "Received %d gender labels for %d names; labels may be misaligned.",
            len(genders), len(names))

    return genders


# Example usage: a three-name smoke test of get_gender_predictions.
# NOTE: this is live code, so running the cell performs a real call to the function.
names = ["Alice", "Bob", "Charlie"]
genders = get_gender_predictions(names)

In [None]:
import re
# Normalize initials: every period, plus one optional following whitespace
# character, becomes a single space. The pattern is a raw string so that
# `\.` and `\s` are not treated as (invalid) Python string escapes.
df['strip_display_name'] = df['display_name'].str.replace(
    r'\.\s?', ' ', regex=True)

# Filter the DataFrame: drop names whose first token is only one or two word
# characters followed by a space. na=False makes missing names compare as
# "no match" so that `~` does not fail on NaN.
filtered_df = df[~df['strip_display_name'].str.match(
    r'^\b\w{1,2}\b ', na=False)]
name_to_gender = {}
# Missing names cannot be joined into the prompt, so they are excluded here.
unique_names = filtered_df['strip_display_name'].dropna().unique()

chunk_size = 500
for i in range(0, len(unique_names), chunk_size):
    chunk = unique_names[i:i + chunk_size]
    predicted_genders = get_gender_predictions(chunk)
    if len(predicted_genders) == len(chunk):
        name_to_gender.update(dict(zip(chunk, predicted_genders)))
    else:
        # zip() would silently pair names with the wrong labels; skip the
        # chunk rather than store misaligned predictions.
        print(f"Skipping chunk at {i}: got {len(predicted_genders)} "
              f"predictions for {len(chunk)} names")
    # Checkpoint the cumulative mapping every 10th chunk, and always after
    # the last chunk so the final results are persisted.
    is_last_chunk = i + chunk_size >= len(unique_names)
    if (i // chunk_size) % 10 == 0 or is_last_chunk:
        with open(f'data/gender_predictions_{i}.json', 'w') as f:
            json.dump(name_to_gender, f)