# Gender Gap in Computational Biology: PubMed Data Collection
## Notebook 1 - Fetching PubMed Data

This notebook fetches Biology and Computational Biology papers from PubMed for 2015-2024 (extending the original 1997-2014 dataset).

**Key Steps:**
1. Search PubMed for Biology and Computational Biology papers
2. Extract author information (first names, last names)
3. Assign author positions (first, second, other, penultimate, last)
4. Save results to CSV for downstream processing

In [None]:
import os
import sys
sys.path.insert(0, '..')

from dotenv import load_dotenv
from src.pubmed_fetcher import PubMedFetcher
from src.gender_utils import assign_positions
import pandas as pd
from tqdm import tqdm

# Load environment variables from .env file so the os.getenv() lookups in
# later cells (e.g. NCBI_EMAIL) can find them
load_dotenv()

## 1. Initialize PubMed Fetcher

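Set your contact email (`NCBI_EMAIL`) and, optionally, your NCBI API key (`NCBI_API_KEY`) in the `.env` file. The email is required; without an API key, requests are rate-limited more strictly.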

In [None]:
# Load the contact email from the environment (populated from the .env file);
# fail fast with setup instructions when it is missing or empty.
email = os.getenv("NCBI_EMAIL")
if not email:
    raise ValueError("NCBI_EMAIL not found in .env file. Please copy .env.example to .env and add your email.")

fetcher = PubMedFetcher(email=email)

# An empty-string key is falsy but still `is not None`. Using one truthiness
# check for both the status line and the warning keeps the two messages from
# contradicting each other (e.g. when .env contains a blank NCBI_API_KEY= line
# and the fetcher exposes that value unchanged).
has_api_key = bool(fetcher.api_key)

print(f"Initialized PubMedFetcher with email: {email}")
print(f"API key loaded: {has_api_key}")

if not has_api_key:
    print("⚠️  Warning: NCBI_API_KEY not set. Requests will be limited to 3/second.")
    print("    To increase to 10/second, add NCBI_API_KEY to .env file.")

## 2. Search for Biology Papers (2015-2024)

In [None]:
# Run the Biology search for the 2015-2024 window; the result is reused by the
# detail-fetch step later in the notebook.
# NOTE(review): bio_pmids is presumably a list of PubMed IDs, and whether
# end_year is inclusive is decided inside PubMedFetcher.search_biology — confirm.
print("Searching for Biology papers (2015-2024)...")
bio_pmids = fetcher.search_biology(start_year=2015, end_year=2024)
print(f"Found {len(bio_pmids)} Biology papers")

## 3. Search for Computational Biology Papers (2015-2024)

In [None]:
# Run the Computational Biology search over the same 2015-2024 window; the
# result is reused by the detail-fetch step later in the notebook.
# NOTE(review): comp_pmids is presumably a list of PubMed IDs, and the query
# that defines "computational biology" lives in
# PubMedFetcher.search_computational_biology — confirm there.
print("Searching for Computational Biology papers (2015-2024)...")
comp_pmids = fetcher.search_computational_biology(start_year=2015, end_year=2024)
print(f"Found {len(comp_pmids)} Computational Biology papers")

## 4. Fetch Detailed Paper Information

In [None]:
# Fetch full records for the Biology results returned by the search step.
# NOTE(review): batch_size=500 presumably caps how many IDs are sent per
# request — confirm in PubMedFetcher.fetch_paper_details.
print("Fetching Biology paper details...")
bio_papers = fetcher.fetch_paper_details(bio_pmids, batch_size=500)
print(f"Fetched {len(bio_papers)} Biology papers")
print("\nSample Biology paper:")
# Only index the first record when something was actually fetched, so an
# empty result does not raise here.
if bio_papers:
    print(bio_papers[0])

In [None]:
# Fetch full records for the Computational Biology results returned by the
# search step, using the same batch size as the Biology fetch.
# NOTE(review): batch_size=500 presumably caps how many IDs are sent per
# request — confirm in PubMedFetcher.fetch_paper_details.
print("Fetching Computational Biology paper details...")
comp_papers = fetcher.fetch_paper_details(comp_pmids, batch_size=500)
print(f"Fetched {len(comp_papers)} Computational Biology papers")

## 5. Assign Author Positions

In [None]:
def add_author_positions(papers: list) -> list:
    """
    Annotate every paper with the positions of its authors.

    For each paper, the value under 'authors' (an empty list when the key is
    absent) is passed to assign_positions, and the result is stored on the
    paper under 'positions'. The papers are modified in place.

    Args:
        papers: Paper records supporting .get() and item assignment.

    Returns:
        The same list object that was passed in.

    NOTE(review): the stored value is whatever assign_positions returns —
    presumably a list of (author_name, position) tuples; confirm against
    src.gender_utils.
    """
    for record in papers:
        record['positions'] = assign_positions(record.get('authors', []))
    return papers

# add_author_positions writes a 'positions' entry onto each paper and returns
# the same list it was given, so these reassignments leave each name bound to
# the same (now annotated) list.
print("Adding author positions to Biology papers...")
bio_papers = add_author_positions(bio_papers)

print("Adding author positions to Computational Biology papers...")
comp_papers = add_author_positions(comp_papers)

print("Done!")

## 6. Convert to DataFrames and Save to CSV

In [None]:
# Output locations for the two datasets
bio_csv_path = '../data/processed/pubmed_biology_2015_2024.csv'
comp_csv_path = '../data/processed/pubmed_compbio_2015_2024.csv'

# Make sure the target directory exists before anything is written
os.makedirs('../data/processed', exist_ok=True)

# Write the Biology papers
fetcher.save_to_csv(bio_papers, bio_csv_path)
print(f"Saved Biology papers to {bio_csv_path}")

# Write the Computational Biology papers
fetcher.save_to_csv(comp_papers, comp_csv_path)
print(f"Saved Computational Biology papers to {comp_csv_path}")

## 7. Summary Statistics

In [None]:
def print_dataset_summary(label: str, df: pd.DataFrame) -> None:
    """Print paper count, year range, and mean author count for one dataset.

    Args:
        label: Dataset name used in the heading line ("<label> Dataset:").
        df: Frame read back from a saved CSV; its 'year' and 'author_count'
            columns are read here.
    """
    print(f"{label} Dataset:")
    print(f"  Total papers: {len(df)}")
    print(f"  Year range: {df['year'].min():.0f} - {df['year'].max():.0f}")
    print(f"  Average authors per paper: {df['author_count'].mean():.1f}")


# Read back the files written by the save step, so the summary reflects what
# is actually on disk rather than the in-memory lists.
bio_df = pd.read_csv(bio_csv_path)
comp_df = pd.read_csv(comp_csv_path)

print_dataset_summary("Biology", bio_df)
print()  # blank line between the two reports
print_dataset_summary("Computational Biology", comp_df)