# Gender Gap in Computational Biology: arXiv Data Collection
## Notebook 2 - Fetching arXiv Data

This notebook fetches quantitative biology (q-bio) and computer science (cs) preprints from arXiv for 2015-2024.

**Key Steps:**
1. Search arXiv for q-bio and cs preprints
2. Extract author information from arXiv metadata
3. Parse author names and extract first names for gender inference
4. Assign author positions
5. Save results to CSV for downstream processing

In [None]:
import os
import sys
# Put the parent directory at the front of the module search path before the
# project imports below run (presumably that is where the `src` package
# lives relative to this notebook -- confirm against the repo layout).
sys.path.insert(0, '..')

from dotenv import load_dotenv
from src.arxiv_fetcher import arXivFetcher
from src.gender_utils import assign_positions
import pandas as pd
# NOTE(review): tqdm is not referenced in the cells visible here; keep or
# drop after checking the rest of the notebook.
from tqdm import tqdm

# Load environment variables from .env file so later os.getenv() lookups
# (e.g. ARXIV_DELAY_SECONDS) can see values defined there.
load_dotenv()

## 1. Initialize arXiv Fetcher

In [None]:
# Delay between arXiv requests, in seconds. Taken from the
# ARXIV_DELAY_SECONDS environment variable; "3.0" is used when it is unset.
delay_seconds = float(
    os.environ.get("ARXIV_DELAY_SECONDS", "3.0")
)

# One fetcher instance is shared by every download cell below.
fetcher = arXivFetcher(
    delay_seconds=delay_seconds,
)
print(f"Initialized arXiv fetcher with {delay_seconds}s delay between requests")

## 2. Fetch Quantitative Biology (q-bio) Preprints (2015-2024)

In [None]:
# Tell the reader what is about to happen: the download is throttled, so this
# cell can sit for a while before the count appears.
print(
    "Fetching quantitative biology (q-bio) preprints (2015-2024)...",
    "This may take several minutes due to API rate limiting.",
    sep="\n",
)

# Download the q-bio records for the requested year span.
qbio_preprints = fetcher.fetch_quantitative_biology(
    start_year=2015,
    end_year=2024,
)
print(f"Fetched {len(qbio_preprints)} q-bio preprints")

## 3. Fetch Computer Science (cs) Preprints (2015-2024)

In [None]:
# Tell the reader what is about to happen: the download is throttled, so this
# cell can sit for a while before the count appears.
print(
    "Fetching computer science (cs) preprints (2015-2024)...",
    "This may take several minutes due to API rate limiting.",
    sep="\n",
)

# Download the cs records for the requested year span.
cs_preprints = fetcher.fetch_computer_science(
    start_year=2015,
    end_year=2024,
)
print(f"Fetched {len(cs_preprints)} cs preprints")

## 4. Assign Author Positions

In [None]:
def add_author_positions(preprints: list) -> list:
    """
    Attach a 'positions' entry to every preprint record.

    For each record, the value stored under 'authors' (an empty list when the
    key is absent) is passed to ``assign_positions`` and the result is written
    back under 'positions'. According to the original notes this is a list of
    (author_name, position) tuples -- the exact shape is decided by
    ``assign_positions``.

    The records are modified in place; the same list object that was passed
    in is returned for convenience.
    """
    for record in preprints:
        # Records without an 'authors' key are treated as having no authors.
        record['positions'] = assign_positions(record.get('authors', []))
    return preprints

# Annotate the q-bio corpus first, then the cs corpus. The helper returns the
# list it received, so each name is simply rebound to its annotated list.
print("Adding author positions to q-bio preprints...")
qbio_preprints = add_author_positions(preprints=qbio_preprints)

print("Adding author positions to cs preprints...")
cs_preprints = add_author_positions(preprints=cs_preprints)

print("Done!")

## 5. Save to CSV

In [None]:
# Output locations for the two corpora (read back by the summary cell).
qbio_csv_path = '../data/processed/arxiv_qbio_2015_2024.csv'
cs_csv_path = '../data/processed/arxiv_cs_2015_2024.csv'

# Make sure the target directory is present; a pre-existing one is fine.
os.makedirs('../data/processed', exist_ok=True)

# Write q-bio first, then cs, confirming each file as it is saved.
fetcher.save_to_csv(qbio_preprints, qbio_csv_path)
print(f"Saved q-bio preprints to {qbio_csv_path}")

fetcher.save_to_csv(cs_preprints, cs_csv_path)
print(f"Saved cs preprints to {cs_csv_path}")

## 6. Summary Statistics

In [None]:
def _print_dataset_summary(heading: str, frame: pd.DataFrame) -> None:
    """
    Print a short report for one saved dataset.

    Emits the heading, the number of rows, the smallest and largest value of
    the 'year' column, and the mean of the 'author_count' column.
    """
    print(heading)
    print(f"  Total preprints: {len(frame)}")
    print(f"  Year range: {frame['year'].min():.0f} - {frame['year'].max():.0f}")
    print(f"  Average authors per preprint: {frame['author_count'].mean():.1f}")


# Re-read the files written by the save step so the numbers describe what is
# actually on disk.
qbio_df = pd.read_csv(qbio_csv_path)
cs_df = pd.read_csv(cs_csv_path)

_print_dataset_summary("Quantitative Biology (q-bio) Dataset:", qbio_df)
_print_dataset_summary("\nComputer Science (cs) Dataset:", cs_df)