# Gender Gap in Computational Biology: Gender Inference
## Notebook 3 - Inferring Gender from Author First Names

This notebook applies gender inference to author first names using a layered strategy:

**Layer 1:** gender-guesser (offline, ~45k names)  
**Layer 2:** genderize.io API (fallback for unknowns)

All lookups are cached to avoid redundant API calls.

**Output:** `p_female` (the estimated probability that an author is female), together with the inferred `gender` label and the lookup `source`, for each author in each position

In [None]:
import os
import sys
sys.path.insert(0, '..')

from dotenv import load_dotenv
from src.gender_utils import GenderInference, assign_positions
import pandas as pd
import numpy as np
from tqdm import tqdm
import json

# Load environment variables from a .env file into the process environment.
# NOTE(review): presumably this supplies credentials for the genderize.io
# fallback used by GenderInference — confirm against src/gender_utils.
load_dotenv()

## 1. Initialize Gender Inference Engine

In [None]:
# Create the inference engine, pointing it at the lookup cache file, and
# report how many lookups that cache already holds.
gi = GenderInference(cache_path='../data/gender_cache.json')
n_cached = len(gi.cache)
print(f"Loaded {n_cached} cached gender lookups")

## 2. Load PubMed Data

In [None]:
# Read the two paper-level PubMed corpora from their processed CSV files.
bio_df = pd.read_csv('../data/processed/pubmed_biology_2015_2024.csv')
comp_df = pd.read_csv('../data/processed/pubmed_compbio_2015_2024.csv')

print(f"Loaded {len(bio_df)} Biology papers")
print(f"Loaded {len(comp_df)} Computational Biology papers")

# Tag every row with its corpus so the origin survives the concatenation.
for corpus_df, corpus_label in ((bio_df, 'Biology'), (comp_df, 'Computational Biology')):
    corpus_df['dataset'] = corpus_label

# Stack both corpora into one frame with a fresh, gap-free index.
pubmed_df = pd.concat(objs=[bio_df, comp_df], ignore_index=True)
print(f"\nCombined: {len(pubmed_df)} total papers")

## 3. Expand Author-Position Data

Convert from paper-level data to author-position-level data.

In [None]:
def expand_author_positions(df):
    """
    Convert from paper-level to author-position-level.

    Each input row must provide 'pmid', 'year', 'dataset' and 'positions'.
    'positions' is either an iterable of (author, position) pairs or the
    string form of one (presumably what a list looks like after being read
    back from CSV).

    Returns a DataFrame with one row per (paper, author, position) and the
    columns 'pmid', 'year', 'dataset', 'author', 'position'. The columns are
    present even when the input yields no rows, so downstream column lookups
    keep working.

    Raises ValueError or SyntaxError if a 'positions' string is not a plain
    Python literal.
    """
    # Function-scope import so this cell does not depend on the import cell.
    import ast

    columns = ['pmid', 'year', 'dataset', 'author', 'position']
    rows = []

    for pmid, year, dataset, positions in zip(
        df['pmid'], df['year'], df['dataset'], df['positions']
    ):
        if isinstance(positions, str):
            # literal_eval only accepts Python literals, so a crafted or
            # corrupted cell cannot execute code the way eval() would.
            positions = ast.literal_eval(positions)

        rows.extend(
            {
                'pmid': pmid,
                'year': year,
                'dataset': dataset,
                'author': author,
                'position': position,
            }
            for author, position in positions
        )

    return pd.DataFrame(rows, columns=columns)

# Flatten the combined PubMed papers into one row per (paper, author, position).
print("Expanding to author-position level...")
pubmed_author_df = expand_author_positions(pubmed_df)
print(f"Total author-position records: {pubmed_author_df.shape[0]}")

## 4. Infer Gender for All Authors

In [None]:
print("Inferring gender for all PubMed authors...")
print("This may take a while and will use genderize.io API for unknown names.")

# Infer once per distinct author string rather than once per row; the
# author -> result mapping is kept so later cells can reuse it.
unique_authors = pubmed_author_df['author'].unique()
author_to_gender = {}

for author in tqdm(unique_authors, desc="Inferring gender"):
    # Extract first name (assuming "FirstName LastName" format).
    # Whitespace-only or non-string values have no first token, so fall back
    # to '' instead of letting the extraction raise mid-run.
    name_parts = author.split() if isinstance(author, str) else []
    first_name = name_parts[0] if name_parts else ''

    author_to_gender[author] = gi.infer_gender(first_name)

# Map results back to dataframe: each output column is one key of the
# per-author result dict.
for column, result_key in (
    ('p_female', 'probability'),
    ('gender', 'gender'),
    ('source', 'source'),
):
    # result_key is bound as a default so each lambda keeps its own key.
    pubmed_author_df[column] = pubmed_author_df['author'].map(
        lambda name, result_key=result_key: author_to_gender[name][result_key]
    )

print("\nGender inference complete!")

## 5. Load and Process arXiv Data

In [None]:
# Read the two paper-level arXiv corpora from their processed CSV files.
qbio_df = pd.read_csv('../data/processed/arxiv_qbio_2015_2024.csv')
cs_df = pd.read_csv('../data/processed/arxiv_cs_2015_2024.csv')

print(f"Loaded {len(qbio_df)} q-bio preprints")
print(f"Loaded {len(cs_df)} cs preprints")

# Tag every row with its corpus so the origin survives the concatenation.
for corpus_df, corpus_label in ((qbio_df, 'q-bio'), (cs_df, 'cs')):
    corpus_df['dataset'] = corpus_label

# Stack both corpora into one frame with a fresh, gap-free index.
arxiv_df = pd.concat(objs=[qbio_df, cs_df], ignore_index=True)
print(f"\nCombined: {len(arxiv_df)} total preprints")

In [None]:
print("Expanding arXiv data to author-position level...")
arxiv_author_df = expand_author_positions(arxiv_df)
print(f"Total author-position records: {len(arxiv_author_df)}")

print("\nInferring gender for all arXiv authors...")
unique_authors = arxiv_author_df['author'].unique()

for author in tqdm(unique_authors, desc="Inferring gender"):
    # Authors already present in author_to_gender are not looked up again.
    if author in author_to_gender:
        continue

    # Extract first name (assuming "FirstName LastName" format).
    # Whitespace-only or non-string values have no first token, so fall back
    # to '' instead of letting the extraction raise mid-run.
    name_parts = author.split() if isinstance(author, str) else []
    first_name = name_parts[0] if name_parts else ''

    author_to_gender[author] = gi.infer_gender(first_name)

# Map results back to dataframe: each output column is one key of the
# per-author result dict.
for column, result_key in (
    ('p_female', 'probability'),
    ('gender', 'gender'),
    ('source', 'source'),
):
    # result_key is bound as a default so each lambda keeps its own key.
    arxiv_author_df[column] = arxiv_author_df['author'].map(
        lambda name, result_key=result_key: author_to_gender[name][result_key]
    )

print("\narXiv gender inference complete!")

## 6. Save Gender Cache

In [None]:
# Persist every lookup gathered so far, announcing the entry count first and
# the destination path afterwards.
n_cache_entries = len(gi.cache)
print(f"Saving gender cache ({n_cache_entries} entries)...")
gi.save_cache()
print(f"Cache saved to {gi.cache_path}")

## 7. Save Processed Datasets

In [None]:
# Destination files for the two author-level tables.
pubmed_output = '../data/processed/pubmed_authors_with_gender.csv'
arxiv_output = '../data/processed/arxiv_authors_with_gender.csv'

# Write the PubMed table without its row index, then report where it went.
pubmed_author_df.to_csv(pubmed_output, index=False)
print(f"Saved PubMed authors to {pubmed_output}")

# Write the arXiv table the same way.
arxiv_author_df.to_csv(arxiv_output, index=False)
print(f"Saved arXiv authors to {arxiv_output}")

## 8. Summary Statistics

In [None]:
def print_gender_summary(df, name):
    """
    Print headline gender-inference statistics for an author-level frame.

    Reports the number of rows, how many rows carry a resolved versus an
    'unknown' gender label (count and percentage), the number of rows per
    inference source, and the count/mean/std of p_female per author position.

    df must provide the columns 'gender', 'source', 'position' and
    'p_female'. name is the heading printed above the statistics.
    Nothing is returned; all output goes to stdout.
    """
    # Boolean masks: .sum() counts the matching rows, .mean() gives their share.
    is_resolved = df['gender'] != 'unknown'
    is_unknown = df['gender'] == 'unknown'

    print(f"\n{name}:")
    # One row per author-position record, so this is a record count.
    print(f"  Total authors: {len(df)}")
    print(f"  Resolved: {is_resolved.sum()} ({100 * is_resolved.mean():.1f}%)")
    print(f"  Unknown: {is_unknown.sum()} ({100 * is_unknown.mean():.1f}%)")
    print("\n  By Gender Inference Source:")
    print(df['source'].value_counts())
    print("\n  P_female by Position:")
    print(df.groupby('position')['p_female'].agg(['count', 'mean', 'std']).round(3))

# Report the same statistics for each author-level table, PubMed first.
for summary_df, summary_title in (
    (pubmed_author_df, "PubMed Authors"),
    (arxiv_author_df, "arXiv Authors"),
):
    print_gender_summary(summary_df, summary_title)