In [None]:
def get_attributes(object):
    """Return the names reported by ``dir(object)`` that do not start with ``'__'``.

    Names with a single leading underscore are kept; the order is the one
    produced by ``dir``.
    """
    visible_names = []
    for name in dir(object):
        if name.startswith('__'):
            continue
        visible_names.append(name)
    return visible_names


def get_methods(object):
    """Return the non-dunder names of ``object`` whose attribute value is callable.

    Each candidate name from ``dir(object)`` is resolved with ``getattr`` and
    kept only when ``callable`` accepts the resulting value.
    """
    def _is_public_callable(name):
        # The prefix test runs first so dunder attributes are never resolved.
        return not name.startswith('__') and callable(getattr(object, name))

    return list(filter(_is_public_callable, dir(object)))


def get_properties(object):
    """Return the non-dunder names of ``object`` whose attribute value is not callable.

    This is the complement of ``get_methods`` over the same set of candidate
    names taken from ``dir(object)``.
    """
    data_names = []
    for name in dir(object):
        # Skip dunders without resolving them; skip anything callable.
        if name.startswith('__') or callable(getattr(object, name)):
            continue
        data_names.append(name)
    return data_names

In [3]:
import requests
import json
import pandas as pd
from pprint import pprint
import pickle

# Root URL of the OpenAlex REST API; endpoint names such as "authors" are appended to it.
base_url = "https://api.openalex.org/"
# Contact address. NOTE(review): no request visible in this notebook sends it —
# presumably intended for OpenAlex's `mailto` parameter; confirm before relying on it.
email = "katz.562@osu.edu"


def pickle_dump(data, filename):
    """Serialize ``data`` with pickle into ``filename``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, 'wb') as out_file:
        pickle.dump(data, out_file)


# Build `author_list` from ten random author samples. Each sample is cached as
# data/author_list_<j>.pkl; a sample is downloaded only when its cache file is missing.
author_list = []
sample = 10000  # value sent as the API's `sample` parameter
per_page = 200  # value sent as the API's `per-page` parameter
for j in range(10):
    cache_path = f"data/author_list_{j}.pkl"
    try:
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(cache_path, 'rb') as cache_file:
            sample_authors = pickle.load(cache_file)
        print(f"Author list {j} loaded from file.")
    except FileNotFoundError:
        sample_authors = []
        print(f"{j}th sample")
        seed = 3142 + j  # a distinct, reproducible seed per sample
        for page in range(1, sample // per_page + 1):
            print(f"Page {page}")
            response = requests.get(base_url + "authors",
                                    params={'sample': sample, 'seed': seed,
                                            'per-page': per_page, 'page': page})
            # Surface HTTP errors here instead of failing later on a missing 'results' key.
            response.raise_for_status()
            results = response.json()['results']
            if not results:
                break
            sample_authors.extend(results)
        print(f'Saving {len(sample_authors)} authors to {cache_path}')
        pickle_dump(sample_authors, cache_path)
    # Freshly downloaded samples are added too, so the list is complete
    # whether or not the cache existed.
    author_list.extend(sample_authors)

Author list 0 loaded from file.
Author list 1 loaded from file.
Author list 2 loaded from file.
Author list 3 loaded from file.
Author list 4 loaded from file.
Author list 5 loaded from file.
Author list 6 loaded from file.
Author list 7 loaded from file.
Author list 8 loaded from file.
Author list 9 loaded from file.


In [None]:
# One row per author record collected in `author_list`; columns come from the record keys.
authors_df = pd.DataFrame(author_list)

In [None]:

# unnecessary_columns = ['orcid', 'ids', 'affiliations', 'last_known_institutions',
#    'summary_stats', 'created_date', 'updated_date', 'topics', 'topic_share', 'relevance_score']
# Keep `works_api_url` alongside the names: a later cell reads df['works_api_url'],
# which raises KeyError when only `display_name` is carried over.
# `.copy()` makes `df` independent of `authors_df`, so adding columns never touches the source frame.
df = authors_df[['display_name', 'works_api_url']].copy()

In [None]:
def fetch_works(api_url):
    """Request ``api_url`` and return the ``id`` of every work in the response.

    The request carries ``per_page=200`` as its query parameters. Only the
    single response page is read.

    Returns:
        A list of work ids when the status code is 200, otherwise ``None``.
    """
    response = requests.get(api_url, {'per_page': 200})
    if response.status_code != 200:
        return None
    work_ids = []
    for work in response.json()['results']:
        work_ids.append(work['id'])
    return work_ids


n = 10  # Number of rows to process
# .loc slices by label and includes the end label, so `:n-1` selects labels up to
# and including n-1 (the first n rows under the default 0..N-1 index).
head_rows = slice(None, n - 1)
fetched_work_ids = df.loc[head_rows, 'works_api_url'].apply(fetch_works)
df.loc[head_rows, 'works'] = fetched_work_ids

In [None]:
import regex as re


def get_first_name(display_name):
    """Return the first capitalised ASCII word found in ``display_name``.

    A word here is one ``A-Z`` letter followed by one or more ``a-z`` letters,
    matched anywhere in the string. Returns ``None`` when nothing matches
    (for example names written in a non-Latin script, or bare initials).
    """
    found = re.search(r'([A-Z][a-z]+)', display_name)
    return found.group(1) if found else None


def get_surname(display_name):
    """Return the trailing capitalised ASCII word of ``display_name``.

    The word must be preceded by whitespace and run to the end of the string,
    so single-word names and names ending in a hyphenated or non-ASCII word
    yield ``None``.
    """
    found = re.search(r'\s([A-Z][a-z]+)$', display_name)
    if found is None:
        return None
    return found.group(1)


# Derive the name-part columns from `display_name`, surname first so the
# resulting column order is unchanged.
for column, extractor in (('surname', get_surname), ('first_name', get_first_name)):
    df[column] = df['display_name'].apply(extractor)

In [None]:
# import regex as re

# # Define the regex pattern
# pattern = r'^(?!.*\b(?:[A-Z]\.?\s?){1,3}(?:[a-z]+\s)*[A-Z][\p{L}]*\b).+$'


# # Filter the DataFrame using regex with a lambda function
# filtered_df = df[df['display_name'].apply(
#     lambda x: re.match(pattern, x) is not None)]

# # Display the filtered DataFrame
# filtered_df.head(100)

In [None]:
import re
# Replace every period (plus one optional following whitespace character) with a
# single space. The pattern is a raw string because `\.` and `\s` are regex
# escapes, not valid Python string escapes.
df['strip_display_name'] = df['display_name'].str.replace(
    r'\.\s?', ' ', regex=True)

# Filter the DataFrame: drop names whose first token is only 1-2 word characters
# followed by a space (initial-like; this also drops genuine two-letter first tokens).
starts_with_short_token = df['strip_display_name'].str.match(r'^\b\w{1,2}\b ')
# Take an explicit copy: assigning columns to a boolean-mask slice of `df`
# raises SettingWithCopyWarning even when `.loc` is used.
filtered_df = df[~starts_with_short_token].copy()

# Split 'strip_display_name' once and reuse the tokens for 'first' and 'last'
name_tokens = filtered_df['strip_display_name'].str.split()
filtered_df['first'] = name_tokens.str[0]
filtered_df['last'] = name_tokens.str[-1]
filtered_df.head(100)

In [None]:
# from chicksexer import predict_gender
import gender_ai as g
import gender_guesser.detector as gender_guesser
# Detector instance from the gender_guesser package, used for the 'gender_guesser' column.
d = gender_guesser.Detector()


# Take the first 100 rows and copy only those: copying first and slicing afterwards
# duplicates the whole frame and leaves `gdf` as a slice, so the column
# assignments below could raise SettingWithCopyWarning.
gdf = filtered_df.head(100).copy()
# Each predictor receives the value of the 'first' column for its row.
gdf['gender_guesser'] = gdf['first'].apply(d.get_gender)
gdf['gender_ai'] = gdf['first'].apply(g.predict)
# gdf['chicksexer'] = gdf['first'].apply(lambda x: predict_gender(x))

In [None]:
from openai import OpenAI
import os

# Shared API client; the key comes from the OPENAI_API_KEY environment variable
# (os.getenv returns None when it is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_gender_prediction(input_string):
    """Ask the model for the most likely gender associated with a name.

    ``input_string`` is sent with a trailing period as the request input. The
    instructions ask for exactly "Male", "Female", or "Unknown", but the reply
    text is returned as-is without validation.
    """
    request = {
        'model': 'gpt-3.5-turbo',
        'instructions': 'You are a helpful assistant that determines the most likely gender of a person with specified name: responding only with "Male", "Female", or "Unknown".',
        'input': f"{input_string}.",
    }
    # The reply's text is the prediction.
    return client.responses.create(**request).output_text

In [None]:
# One API request per row: each full display name is sent to the model.
gdf['gpt2'] = gdf['display_name'].apply(get_gender_prediction)

In [None]:
# Rebind `gdf` to the predictions stored on disk. The context manager closes the
# file handle, which a bare open() inside pickle.load leaves open.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data/gender_predictions_1000_2000.pkl', 'rb') as predictions_file:
    gdf = pickle.load(predictions_file)

In [7]:
# Display up to the first 100 rows of `gdf` as the cell output.
gdf.head(100)

Unnamed: 0,id,orcid,display_name,display_name_alternatives,relevance_score,works_count,cited_by_count,summary_stats,ids,affiliations,last_known_institutions,topics,topic_share,x_concepts,counts_by_year,works_api_url,updated_date,created_date,strip_display_name,probable_gender
0,https://openalex.org/A5092356489,,Resty Fastabikul Khaerat,[Resty Fastabikul Khaerat],1.000000,2,1,"{'2yr_mean_citedness': 0.5, 'h_index': 1, 'i10...",{'openalex': 'https://openalex.org/A5092356489'},[{'institution': {'id': 'https://openalex.org/...,"[{'id': 'https://openalex.org/I98301867', 'ror...","[{'id': 'https://openalex.org/T11555', 'displa...","[{'id': 'https://openalex.org/T14179', 'displa...","[{'id': 'https://openalex.org/C15708023', 'wik...","[{'year': 2024, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2025-02-25T04:01:28.507472,2023-09-07,Resty Fastabikul Khaerat,Male
2,https://openalex.org/A5114128731,,Xinshun Li,[Xinshun Li],1.000000,2,0,"{'2yr_mean_citedness': 0.0, 'h_index': 0, 'i10...",{'openalex': 'https://openalex.org/A5114128731'},[{'institution': {'id': 'https://openalex.org/...,"[{'id': 'https://openalex.org/I140221134', 'ro...","[{'id': 'https://openalex.org/T10016', 'displa...","[{'id': 'https://openalex.org/T10016', 'displa...","[{'id': 'https://openalex.org/C6557445', 'wiki...","[{'year': 2024, 'works_count': 2, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2025-02-16T12:33:43.136221,2024-10-15,Xinshun Li,Male
8,https://openalex.org/A5004616759,,堀越さな恵,[堀越さな恵],1.000000,1,0,"{'2yr_mean_citedness': 0.0, 'h_index': 0, 'i10...",{'openalex': 'https://openalex.org/A5004616759'},[],[],[],[],"[{'id': 'https://openalex.org/C41008148', 'wik...","[{'year': 2013, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2024-05-09T20:27:16.820700,2023-07-21,堀越さな恵,Female
9,https://openalex.org/A5068802092,,Debbie Northin,[Debbie Northin],1.000000,1,3,"{'2yr_mean_citedness': 0.0, 'h_index': 1, 'i10...",{'openalex': 'https://openalex.org/A5068802092'},[],[],"[{'id': 'https://openalex.org/T10776', 'displa...","[{'id': 'https://openalex.org/T10416', 'displa...","[{'id': 'https://openalex.org/C1862650', 'wiki...",[],https://api.openalex.org/works?filter=author.i...,2025-02-16T02:00:32.515759,2023-07-21,Debbie Northin,Female
10,https://openalex.org/A5095940583,,Ambrish,[Ambrish],1.000000,1,4,"{'2yr_mean_citedness': 0.0, 'h_index': 1, 'i10...",{'openalex': 'https://openalex.org/A5095940583'},[],[],"[{'id': 'https://openalex.org/T10930', 'displa...","[{'id': 'https://openalex.org/T11119', 'displa...","[{'id': 'https://openalex.org/C7251660', 'wiki...","[{'year': 2025, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2025-03-05T09:24:56.362811,2024-04-30,Ambrish,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,https://openalex.org/A5004771407,,付权茂,[付权茂],0.999999,1,0,"{'2yr_mean_citedness': 0.0, 'h_index': 0, 'i10...",{'openalex': 'https://openalex.org/A5004771407'},[],[],[],[],"[{'id': 'https://openalex.org/C41008148', 'wik...",[],https://api.openalex.org/works?filter=author.i...,2024-05-13T09:36:04.646154,2023-07-21,付权茂,Unknown
139,https://openalex.org/A5107510409,,Jawharah Fahad Abdul Aziz Al-Fehaid,[Jawharah Fahad Abdul Aziz Al‐Fehaid],0.999999,1,0,"{'2yr_mean_citedness': 0.0, 'h_index': 0, 'i10...",{'openalex': 'https://openalex.org/A5107510409'},[],[],"[{'id': 'https://openalex.org/T14449', 'displa...","[{'id': 'https://openalex.org/T14060', 'displa...","[{'id': 'https://openalex.org/C2524010', 'wiki...","[{'year': 2023, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2025-02-23T10:08:12.171910,2024-09-28,Jawharah Fahad Abdul Aziz Al-Fehaid,Female
140,https://openalex.org/A5027963859,,Steven Tschantzk,[Steven Tschantzk],0.999999,1,0,"{'2yr_mean_citedness': 0.0, 'h_index': 0, 'i10...",{'openalex': 'https://openalex.org/A5027963859'},[{'institution': {'id': 'https://openalex.org/...,"[{'id': 'https://openalex.org/I200719446', 'ro...","[{'id': 'https://openalex.org/T11567', 'displa...","[{'id': 'https://openalex.org/T10948', 'displa...","[{'id': 'https://openalex.org/C2524010', 'wiki...",[],https://api.openalex.org/works?filter=author.i...,2025-02-16T22:51:48.309917,2023-07-21,Steven Tschantzk,Male
143,https://openalex.org/A5093963728,,Ερμής Γκίνης,[Ερμής Γκίνης],0.999999,1,0,"{'2yr_mean_citedness': 0.0, 'h_index': 0, 'i10...",{'openalex': 'https://openalex.org/A5093963728'},[],[],"[{'id': 'https://openalex.org/T13068', 'displa...","[{'id': 'https://openalex.org/T13068', 'displa...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2024, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2025-01-28T01:18:56.697319,2024-02-21,Ερμής Γκίνης,Male


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests

# API key read from the OPEN_ALEX_API_KEY environment variable (None when unset).
open_alex_api_key = os.getenv("OPEN_ALEX_API_KEY")
# Root URL of the OpenAlex REST API; endpoint paths are appended to it.
base_url = "https://api.openalex.org/"
# JSON cache for the author list: read if present, otherwise written by fetch_authors().
authors_path = "data/author_list.json"
# JSON cache for the per-author publication records.
publications_path = "data/author_publications.json"


def fetch_authors():
    """Fetch one page of authors from OpenAlex and cache it as JSON.

    Sends a single request to the ``authors`` endpoint with ``per_page=200``
    and writes the returned ``results`` list to ``authors_path``.

    Returns:
        The list of author records, or ``[]`` (after printing the status
        code) when the response status is not 200. Nothing is written to
        disk in the error case.
    """
    params = {"per_page": 200}
    # Send the key only when one is configured; formatting an unset (None) key
    # into the URL would transmit the literal text "api_key=None".
    if open_alex_api_key:
        params["api_key"] = open_alex_api_key
    response = requests.get(f"{base_url}authors", params=params)
    if response.status_code != 200:
        print(f"Error fetching authors: {response.status_code}")
        return []
    authors = response.json().get("results", [])
    # Written as UTF-8 to match the encoding used when the cache is read back.
    with open(authors_path, "w", encoding="utf-8") as f:
        json.dump(authors, f, indent=4)
    return authors


# Use the cached author list when the file exists; otherwise download it.
try:
    authors_file = open(authors_path, 'r', encoding='utf-8')
except FileNotFoundError:
    print("Fetching authors from OpenAlex...")
    authors = fetch_authors()
else:
    with authors_file:
        authors = json.load(authors_file)
    print("Author list loaded from file.")
# One row per author record.
df_authors = pd.DataFrame(authors)


def fetch_publications(author_id):
    """Fetch one page of works for ``author_id`` and flatten each to a small record.

    Sends a single request to the ``works`` endpoint filtered by
    ``author.id:<author_id>`` with ``per_page=200``.

    Returns:
        A list of dicts with keys ``author_id``, ``work_id``, ``is_oa`` and
        ``concepts``; ``[]`` (after printing the status code) when the
        response status is not 200. ``is_oa`` falls back to ``False`` and
        ``concepts`` to ``[]`` for works that lack those fields.
    """
    params = {"filter": f"author.id:{author_id}", "per_page": 200}
    # Send the key only when one is configured; formatting an unset (None) key
    # into the URL would transmit the literal text "api_key=None".
    if open_alex_api_key:
        params["api_key"] = open_alex_api_key
    response = requests.get(f"{base_url}works", params=params)
    if response.status_code != 200:
        print(f"Error fetching works for {author_id}: {response.status_code}")
        return []
    records = []
    for work in response.json().get("results", []):
        # Tolerate a missing or null open_access block instead of raising.
        open_access = work.get("open_access") or {}
        records.append({
            "author_id": author_id,
            "work_id": work["id"],
            "is_oa": open_access.get("is_oa", False),
            "concepts": work.get("concepts", []),
        })
    return records


# Use the cached publications when the file exists; otherwise fetch them author
# by author, rewriting the cache after every chunk of authors.
# NOTE: any existing cache file is loaded as-is, so a file left behind by an
# interrupted fetch is treated as complete on the next run.
try:
    publications_file = open(publications_path, 'r', encoding='utf-8')
except FileNotFoundError:
    print("Fetching publications for all authors...")
    publications = []
    chunk_size = 100  # Define the chunk size
    for chunk_start in range(0, len(authors), chunk_size):
        for author in authors[chunk_start:chunk_start + chunk_size]:
            # The last path segment of the author's id is what gets passed on.
            short_id = author["id"].split("/")[-1]
            publications.extend(fetch_publications(short_id))
        # Save progress periodically
        with open(publications_path, "w") as out_file:
            json.dump(publications, out_file, indent=4)
        print(f"Saved {len(publications)} publications so far.")
else:
    with publications_file:
        publications = json.load(publications_file)
    print(" Publications loaded from file.")
df_works = pd.DataFrame(publications)
print(f" Authors: {len(df_authors)} records")
print(f" Publications: {len(df_works)} records")
# The records built by fetch_publications are already flat (author_id, work_id,
# is_oa, concepts) and have no 'authorships' / 'open_access' columns, so the
# derivations below run only when the source column is present. This avoids a
# KeyError on flat records (and on an empty frame) while still handling
# full work objects.
if 'authorships' in df_works.columns:
    # First listed author's id, or "Unknown" when the list is missing or empty.
    df_works['author_id'] = df_works['authorships'].apply(
        lambda x: x[0]['author']['id'] if isinstance(x, list) and x else "Unknown")
if 'open_access' in df_works.columns:
    df_works['is_oa'] = df_works['open_access'].apply(
        lambda x: x.get('is_oa', False) if isinstance(x, dict) else False)
if 'concepts' in df_works.columns:
    # Collapse the concept list to the display name of its first entry.
    df_works['concepts'] = df_works['concepts'].apply(
        lambda x: x[0]['display_name'] if isinstance(x, list) and x else "Unknown")

Author list loaded from file.
Fetching publications for all authors...
Saved 905 publications so far.
Saved 1575 publications so far.
Saved 2139 publications so far.
Saved 2733 publications so far.
Saved 3473 publications so far.
Saved 4453 publications so far.
Saved 5018 publications so far.
Saved 5658 publications so far.
Saved 6522 publications so far.
Saved 7628 publications so far.
Saved 8304 publications so far.
Saved 9046 publications so far.
Saved 9745 publications so far.
Saved 10506 publications so far.
Saved 11626 publications so far.
Saved 12068 publications so far.
Saved 13158 publications so far.
Saved 13617 publications so far.
Saved 14705 publications so far.
Saved 15236 publications so far.
Saved 16002 publications so far.
Saved 17080 publications so far.
Saved 18451 publications so far.
Saved 19749 publications so far.
Saved 20366 publications so far.
Saved 20941 publications so far.
Saved 21707 publications so far.
Saved 22201 publications so far.
Saved 23100 publica