# Gender Gap in Computational Biology: Publication Figures
## Notebook 5 - Generate Publication-Ready Figures

This notebook generates all figures for the BWIB Deep Dive blog post and LinkedIn article.

**Figures to generate:**
1. P_female by author position (Bar chart)
2. P_female over time (Line chart with confidence intervals)
3. Female PI effect (Male vs. Female last authors)
4. arXiv comparison (q-bio vs. cs)
5. COVID-19 impact (Year-by-year comparison)
6. Subfield comparison (only if MeSH subfield data is available; not yet generated by this notebook)

Each static figure is exported twice: as a 300 DPI PNG for print and as an SVG for web. The interactive temporal trend is exported as a standalone HTML file.

In [None]:
import glob
import os
import sys

# Put the parent directory on sys.path so the `src` package import below resolves.
sys.path.insert(0, '..')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from src.plotting import (
    plot_pfemale_by_position,
    plot_pfemale_over_time,
    plot_female_pi_effect,
    plot_interactive_temporal_trend
)

# Load environment variables from the .env file.
# NOTE(review): no cell in this notebook reads an environment variable directly;
# presumably `src.plotting` or a later edit relies on them - confirm.
load_dotenv()

# Global seaborn look for the figures created below: "whitegrid" axes style and
# the "husl" palette as the default colour palette. Figure 5 passes its own
# explicit bar colours, so the palette does not apply to those bars.
sns.set_style("whitegrid")
sns.set_palette("husl")

## 1. Load Analysis Results

In [None]:
# Read the pre-computed analysis tables from CSV files under ../data/processed.
# The dict keys double as the labels printed in the row-count report below.
analysis_csv_paths = {
    "Position breakdown": '../data/processed/analysis_position_breakdown.csv',
    "Temporal trends": '../data/processed/analysis_temporal_trend.csv',
    "arXiv position": '../data/processed/analysis_arxiv_position.csv',
    "COVID impact": '../data/processed/analysis_covid_impact.csv',
}
loaded_tables = {
    label: pd.read_csv(csv_path) for label, csv_path in analysis_csv_paths.items()
}

# Bind the names the figure cells further down expect.
position_results = loaded_tables["Position breakdown"]
temporal_results = loaded_tables["Temporal trends"]
arxiv_position_results = loaded_tables["arXiv position"]
covid_results = loaded_tables["COVID impact"]

# Quick sanity report: one row count per table, in load order.
print("Loaded analysis results")
for label, table in loaded_tables.items():
    print(f"{label}: {len(table)} rows")

## 2. Create Output Directory

In [None]:
# Directory that receives every exported figure. The path is relative, so it is
# resolved against the working directory the notebook is run from.
output_dir = '../outputs/figures'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory: {output_dir}")

## 3. Figure 1: P_female by Author Position

In [None]:
print("\nGenerating Figure 1: P_female by Author Position")

# The helper receives the PNG path through `output_path` (presumably it writes
# the PNG itself - confirm in src/plotting) and returns the figure and axes.
fig, ax = plot_pfemale_by_position(
    position_results,
    group_col='dataset',
    output_path=f'{output_dir}/fig1_position_breakdown.png',
    figsize=(10, 6)
)
# Write the SVG through the returned Figure instead of pyplot's implicit
# "current figure": if the helper closed its figure or another figure became
# current, plt.savefig would export the wrong (possibly blank) canvas.
fig.savefig(f'{output_dir}/fig1_position_breakdown.svg', dpi=300, bbox_inches='tight', format='svg')
# Close exactly this figure so it is released without touching any other.
plt.close(fig)

print(f"Saved Figure 1 to {output_dir}/fig1_position_breakdown.*")

## 4. Figure 2: P_female Over Time (Temporal Trend)

In [None]:
print("\nGenerating Figure 2: Temporal Trend")

# The helper receives the PNG path through `output_path` (presumably it writes
# the PNG itself - confirm in src/plotting) and returns the figure and axes.
fig, ax = plot_pfemale_over_time(
    temporal_results,
    group_col='dataset',
    output_path=f'{output_dir}/fig2_temporal_trend.png',
    figsize=(12, 6)
)
# Write the SVG through the returned Figure instead of pyplot's implicit
# "current figure": if the helper closed its figure or another figure became
# current, plt.savefig would export the wrong (possibly blank) canvas.
fig.savefig(f'{output_dir}/fig2_temporal_trend.svg', dpi=300, bbox_inches='tight', format='svg')
# Close exactly this figure so it is released without touching any other.
plt.close(fig)

print(f"Saved Figure 2 to {output_dir}/fig2_temporal_trend.*")

## 5. Figure 3: Female PI Effect

In [None]:
print("\nGenerating Figure 3: Female PI Effect")

# Load female PI effect results (generated in notebook 04).
# Only the read is guarded: a FileNotFoundError raised later (for example by
# saving into a missing output directory) must surface as a real error instead
# of being reported as "results not found".
try:
    pi_effect_results = pd.read_csv('../data/processed/analysis_female_pi_effect.csv')
except FileNotFoundError:
    print("Note: Female PI effect results not found. Run notebook 04 to generate.")
else:
    # The helper receives the PNG path through `output_path` (presumably it
    # writes the PNG itself - confirm in src/plotting).
    fig, ax = plot_female_pi_effect(
        pi_effect_results,
        output_path=f'{output_dir}/fig3_female_pi_effect.png',
        figsize=(10, 6)
    )
    # Save and close via the returned Figure rather than pyplot's implicit
    # "current figure", which may no longer be this one after the helper ran.
    fig.savefig(f'{output_dir}/fig3_female_pi_effect.svg', dpi=300, bbox_inches='tight', format='svg')
    plt.close(fig)

    print(f"Saved Figure 3 to {output_dir}/fig3_female_pi_effect.*")

## 6. Figure 4: arXiv Comparison (q-bio vs. cs)

In [None]:
print("\nGenerating Figure 4: arXiv Comparison")

# Same position-breakdown helper as Figure 1, fed with the arXiv table. The PNG
# path goes in through `output_path` (presumably the helper writes the PNG
# itself - confirm in src/plotting).
fig, ax = plot_pfemale_by_position(
    arxiv_position_results,
    group_col='dataset',
    output_path=f'{output_dir}/fig4_arxiv_comparison.png',
    figsize=(10, 6)
)
# Write the SVG through the returned Figure instead of pyplot's implicit
# "current figure": if the helper closed its figure or another figure became
# current, plt.savefig would export the wrong (possibly blank) canvas.
fig.savefig(f'{output_dir}/fig4_arxiv_comparison.svg', dpi=300, bbox_inches='tight', format='svg')
# Close exactly this figure so it is released without touching any other.
plt.close(fig)

print(f"Saved Figure 4 to {output_dir}/fig4_arxiv_comparison.*")

## 7. Figure 5: COVID-19 Impact

In [None]:
print("\nGenerating Figure 5: COVID-19 Impact")

# Hand-built bar chart: one bar per row of `covid_results`, with asymmetric
# error bars taken from the ci_lower / ci_upper columns.
period_labels = covid_results['period'].tolist()
mean_column = covid_results['mean']
bar_heights = mean_column.tolist()
bar_positions = range(len(period_labels))

# Matplotlib expects error-bar lengths measured from the bar top, not absolute
# bounds: first row is the distance down to ci_lower, second row the distance
# up to ci_upper.
distance_below = (mean_column - covid_results['ci_lower']).tolist()
distance_above = (covid_results['ci_upper'] - mean_column).tolist()

bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # Blue, Orange, Green

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    bar_positions,
    bar_heights,
    yerr=[distance_below, distance_above],
    capsize=5,
    color=bar_colors,
    alpha=0.8,
)

# Axis cosmetics: period names on the x axis, y axis fixed to the 0-1 range.
ax.set_title('Female Authorship: COVID-19 Impact', fontsize=13, fontweight='bold')
ax.set_ylabel('P(Female)', fontsize=12, fontweight='bold')
ax.set_xticks(bar_positions)
ax.set_xticklabels(period_labels, fontsize=11)
ax.set_ylim([0, 1])
ax.grid(axis='y', alpha=0.3)

# Export the PNG and SVG pair, then release the figure.
fig.tight_layout()
fig.savefig(f'{output_dir}/fig5_covid_impact.png', dpi=300, bbox_inches='tight')
fig.savefig(f'{output_dir}/fig5_covid_impact.svg', dpi=300, bbox_inches='tight', format='svg')
plt.close(fig)

print(f"Saved Figure 5 to {output_dir}/fig5_covid_impact.*")

## 8. Interactive Figure: Temporal Trend (Plotly)

In [None]:
print("\nGenerating Interactive Figure: Temporal Trend")

# Interactive counterpart of Figure 2, built from the same temporal table and
# grouped by dataset. The HTML destination is handed to the helper via
# `output_path`.
interactive_html_path = f'{output_dir}/interactive_temporal_trend.html'
fig = plot_interactive_temporal_trend(
    temporal_results,
    group_col='dataset',
    output_path=interactive_html_path,
)

print(f"Saved interactive figure to {interactive_html_path}")

## 9. Summary of Generated Figures

In [None]:
# Final check: list what is in the output directory, with file sizes.
# Note that this reports every file present, including leftovers from earlier
# runs, not only the files written during this session.
print("\n" + "="*60)
print("SUMMARY OF GENERATED FIGURES")
print("="*60)

# Keep regular files only, so a subdirectory is not counted as a figure.
# `glob` is imported in the notebook's import cell.
figure_paths = sorted(
    path for path in glob.glob(f'{output_dir}/*') if os.path.isfile(path)
)

print(f"\nGenerated {len(figure_paths)} figure files:")
# A dedicated loop name avoids rebinding the notebook-level `fig` (the last
# figure object) to a path string.
for figure_path in figure_paths:
    size_kb = os.path.getsize(figure_path) / 1024  # bytes -> KB
    print(f"  - {os.path.basename(figure_path)} ({size_kb:.1f} KB)")

print(f"\nAll figures saved to: {os.path.abspath(output_dir)}")
print("\nReady for blog post and social media!")
print("="*60)