# Gender Gap in Computational Biology: Statistical Analysis
## Notebook 4 - Bootstrap Analysis and Key Analyses

This notebook performs bootstrap resampling to estimate P_female with confidence intervals for key groupings:

1. By author position (first, second, other, penultimate, last)
2. Over time (year-by-year trends)
3. Female PI effect (male vs. female last authors)
4. arXiv comparison (q-bio vs. cs)
5. COVID-19 impact analysis

In [None]:
import ast
import os
import sys

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# The project-local `src` package lives one directory up; extend the path
# before importing from it.
sys.path.insert(0, '..')

from src.bootstrap import (
    bootstrap_pfemale,
    bootstrap_by_group,
    bootstrap_by_multiple_groups,
    calculate_trend
)

# Load environment variables from .env file
# NOTE(review): no cell visible in this notebook reads os.environ directly;
# presumably the values are consumed inside src.* - confirm the call is needed.
load_dotenv()

## 1. Load Processed Data

In [None]:
# Author-level tables, one CSV per source (presumably written by an earlier
# notebook in this series - confirm).
pubmed_df = pd.read_csv('../data/processed/pubmed_authors_with_gender.csv')
arxiv_df = pd.read_csv('../data/processed/arxiv_authors_with_gender.csv')

for source_label, source_frame in (("PubMed", pubmed_df), ("arXiv", arxiv_df)):
    print(f"Loaded {source_frame.shape[0]} {source_label} author records")

# Stack both sources into one frame with a fresh 0..n-1 index for overall stats.
all_df = pd.concat([pubmed_df, arxiv_df], ignore_index=True)
print(f"\nTotal author records: {all_df.shape[0]}")

## 2. Analysis 1: P_female by Author Position (Biology vs. Comp Bio, 2015-2024)

In [None]:
print("\n=== ANALYSIS 1: P_female by Author Position ===")

# Restrict PubMed author records to publication years 2015-2024, both
# endpoints included. Later cells reuse `pubmed_2015_2024`.
in_study_window = pubmed_df['year'].between(2015, 2024)
pubmed_2015_2024 = pubmed_df[in_study_window]

# Bootstrap the `p_female` column, grouped by dataset and author position.
position_results = bootstrap_by_multiple_groups(
    pubmed_2015_2024,
    group_cols=['dataset', 'position'],
    prob_col='p_female',
    n_iterations=1000,
)

print("\nP_female by Position (2015-2024):")
print(position_results.to_string())

# Write the table to disk without the DataFrame index.
position_results.to_csv(
    '../data/processed/analysis_position_breakdown.csv',
    index=False,
)
print("\nSaved to analysis_position_breakdown.csv")

## 3. Analysis 2: Temporal Trends (P_female Over Time)

In [None]:
print("\n=== ANALYSIS 2: Temporal Trends ===")

# Bootstrap by year and dataset for overall P_female.
# `pubmed_2015_2024` is the year-filtered PubMed frame built in the Analysis 1
# cell, so that cell has to run before this one.
temporal_results = bootstrap_by_multiple_groups(
    pubmed_2015_2024,
    group_cols=['dataset', 'year'],
    prob_col='p_female',
    n_iterations=1000
)

print("\nP_female by Year:")
print(temporal_results.sort_values(['dataset', 'year']).to_string())

# Calculate trends: fit the per-year 'mean' column against 'year', once per
# dataset label.
for dataset in ['Biology', 'Computational Biology']:
    dataset_df = temporal_results[temporal_results['dataset'] == dataset]
    if dataset_df.empty:
        # A label that matches no rows would otherwise vanish from the output
        # without any hint that the 'dataset' values differ from these names.
        print(f"\n{dataset}: no rows in temporal_results; trend skipped")
        continue
    slope, intercept = calculate_trend(dataset_df, 'year', 'mean')
    # Compare against None instead of relying on truthiness: a perfectly flat
    # trend has slope == 0.0, which is falsy and used to be dropped silently.
    # (Presumably calculate_trend signals "no fit" with None - confirm.)
    if slope is not None:
        # NOTE(review): if 'mean' is a probability in [0, 1], this slope is in
        # probability units per year, not percentage points - confirm the unit.
        print(f"\n{dataset}: {slope:.4f} percentage points per year")

# Save results
temporal_results.to_csv('../data/processed/analysis_temporal_trend.csv', index=False)
print("\nSaved to analysis_temporal_trend.csv")

## 4. Analysis 3: Female PI Effect

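Compare P_female by position for papers with male vs. female last authors.

**Status:** the cell below only derives each paper's last-author gender; the per-position comparison itself is still a TODO.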

In [None]:
print("\n=== ANALYSIS 3: Female PI Effect ===")

# Paper-level tables are needed here so that a paper's last author can be
# identified; read the Biology file first, then the Computational Biology one.
bio_papers, comp_papers = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/pubmed_biology_2015_2024.csv',
        '../data/processed/pubmed_compbio_2015_2024.csv',
    )
)

# Stack both tables into a single frame with a fresh 0..n-1 index.
pubmed_papers = pd.concat([bio_papers, comp_papers], ignore_index=True)

# Extract last author gender for each paper
def get_last_author_gender(row):
    """Return the gender recorded for a paper's last author, or None.

    Args:
        row: One record of the paper-level table. ``row['positions']`` is
            either an already-parsed sequence or the string form of one as
            read back from CSV. The first element of the last entry is used
            as the author name (assumes entries look like
            ``(name, position_label)`` - TODO confirm against the writer).

    Returns:
        The 'gender' value of the first ``pubmed_df`` row whose 'author'
        equals the last author's name. None when 'positions' is missing,
        unparseable or empty, or when the author is not in ``pubmed_df``.

    Note:
        Every call scans the whole ``pubmed_df`` frame, so applying this
        row by row costs (number of papers) x (number of author records).
    """
    positions = row['positions']

    if isinstance(positions, str):
        # literal_eval accepts Python literals only, so a corrupted or crafted
        # CSV cell cannot execute code the way eval() would.
        try:
            positions = ast.literal_eval(positions)
        except (ValueError, SyntaxError, TypeError):
            return None

    # A blank CSV cell is read as float NaN, which is truthy but cannot be
    # indexed; anything that is not a non-empty list/tuple means "unknown".
    if not isinstance(positions, (list, tuple)) or len(positions) == 0:
        return None

    last_author = positions[-1][0]  # Last author name

    # Look up in pubmed_df; the first matching record wins.
    matching_genders = pubmed_df.loc[pubmed_df['author'] == last_author, 'gender']
    if matching_genders.empty:
        return None
    return matching_genders.iloc[0]

# Resolve the last author's gender once per paper (row-wise apply), then
# attach the result as a new column.
last_author_gender = pubmed_papers.apply(get_last_author_gender, axis=1)
pubmed_papers['last_author_gender'] = last_author_gender

# Remaining work: expand to author level and merge last author gender.
# TODO: Implement full female PI effect analysis
print("Female PI effect analysis implementation needed")

## 5. Analysis 4: arXiv Comparison (q-bio vs. cs)

In [None]:
print("\n=== ANALYSIS 4: arXiv Comparison ===")

# Bootstrap `p_female` on the arXiv records, grouped by dataset and position.
# NOTE(review): the heading printed below says 2015-2024, but this cell applies
# no year filter to arxiv_df - confirm the CSV is already limited to that range.
arxiv_group_cols = ['dataset', 'position']
arxiv_position_results = bootstrap_by_multiple_groups(
    arxiv_df,
    group_cols=arxiv_group_cols,
    prob_col='p_female',
    n_iterations=1000,
)

print("\nP_female by Position (arXiv, 2015-2024):")
print(arxiv_position_results.to_string())

# Write the table to disk without the DataFrame index.
arxiv_position_results.to_csv(
    '../data/processed/analysis_arxiv_position.csv',
    index=False,
)
print("\nSaved to analysis_arxiv_position.csv")

## 6. Analysis 5: COVID-19 Impact

In [None]:
print("\n=== ANALYSIS 5: COVID-19 Impact ===")

# Period label -> (first_year, last_year); both years are included in the window.
periods = {
    'Pre-COVID (2018-2019)': (2018, 2019),
    'Pandemic (2020-2021)': (2020, 2021),
    'Recovery (2022-2023)': (2022, 2023)
}

# One summary record per period, collected first and turned into a frame once.
covid_results = []
for label, (first_year, last_year) in periods.items():
    in_window = pubmed_2015_2024['year'].between(first_year, last_year)
    window_df = pubmed_2015_2024[in_window]

    # Overall P_female for the period: all positions and datasets pooled.
    estimate, lower, upper = bootstrap_pfemale(window_df['p_female'].tolist())

    covid_results.append(
        dict(
            period=label,
            mean=estimate,
            ci_lower=lower,
            ci_upper=upper,
            n=len(window_df),
        )
    )

covid_df = pd.DataFrame(covid_results)
print("\nP_female by Period:")
print(covid_df.to_string())

# Write the table to disk without the DataFrame index.
covid_df.to_csv('../data/processed/analysis_covid_impact.csv', index=False)
print("\nSaved to analysis_covid_impact.csv")

## 7. Summary of All Analyses

In [None]:
# Re-print the tables computed in the position, arXiv and COVID cells,
# framed by a 60-character rule.
banner = "=" * 60

print("\n" + banner)
print("SUMMARY OF ANALYSES")
print(banner)

# Heading and table go out as two lines each, exactly as with separate prints.
print("\n1. Position Breakdown (PubMed 2015-2024):", position_results.to_string(), sep="\n")
print("\n2. arXiv Position Breakdown (2015-2024):", arxiv_position_results.to_string(), sep="\n")
print("\n3. COVID-19 Impact Analysis:", covid_df.to_string(), sep="\n")

print("\n" + banner)
print("All analysis results saved to data/processed/")
print(banner)