# 1.5 Promiscuity-Controlled Random Gene Sampling

Generate random gene samples controlling for gene promiscuity (number of GO term annotations).

## Inputs
- `output/intermediate/hetio_bppg_dataset2_filtered.csv` (2016 filtered)
- `output/intermediate/hetio_bppg_dataset2_2024_filtered.csv` (2024 filtered)
- `input/BPpG.csv` (all 2016 BP-Gene associations)
- Remote: 2024 human (taxid 9606) GO annotations, read from the `gh-pages` branch of the `NegarJanani/gene-ontology` GitHub repository
- `input/hetionet_neo4j_go_ids_nr.csv`
- `input/hetionet_neo4j_genes_ids_nr.csv`
- `input/Gene.tsv`

## Outputs
- `output/random_samples/dataset2_2016/random_001.csv` through `random_005.csv`
- `output/random_samples/dataset2_2024/random_001.csv` through `random_005.csv`

## Description
This notebook generates 5 random gene samples per year that control for gene promiscuity.
For each real gene in a GO term, we sample a random gene from OTHER GO terms
that has a similar number of GO annotations (promiscuity).

This approach ensures:
1. Same number of genes per GO term
2. Random genes are drawn from genes annotated to other GO terms (never from unannotated genes)
3. Promiscuity distribution is matched (within tolerance)
4. Multiple samples for robust statistical testing (mirroring the 5 permutations per year generated in notebook 1.4)

In [None]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Resolve the repository root so that every path below is the same whether
# the kernel was started in the repo root or in the notebooks/ subdirectory.
repo_root = (
    Path("..").resolve() if Path.cwd().name == "notebooks" else Path.cwd()
)

# The project package lives under the repo root; put it first on sys.path
# so the import below resolves to this checkout.
sys.path.insert(0, str(repo_root))
from src.random_sampling import (
    generate_promiscuity_controlled_samples,
    calculate_gene_promiscuity
)

# Create one output directory per dataset year (no-op when already present).
for sample_dir in (
    'output/random_samples/dataset2_2016',
    'output/random_samples/dataset2_2024',
):
    (repo_root / sample_dir).mkdir(parents=True, exist_ok=True)

print(f'Repo root: {repo_root}')
print('Environment setup complete')

## Load Data

Load filtered datasets and original GO annotations for promiscuity calculation.

In [None]:
# Read the filtered GO-gene association tables written by notebook 1.3,
# one per annotation year.
real_2016 = pd.read_csv(
    repo_root / 'output/intermediate/hetio_bppg_dataset2_filtered.csv'
)
real_2024 = pd.read_csv(
    repo_root / 'output/intermediate/hetio_bppg_dataset2_2024_filtered.csv'
)

print('Filtered GO-Gene Associations Loaded')
print('=' * 80)

# Report size, GO-term count and gene count for each year; the 2024 heading
# carries the blank line that separates the two summaries.
for heading, pairs in (('2016', real_2016), ('\n2024', real_2024)):
    n_go_terms = pairs["go_id"].nunique()
    n_genes = pairs["neo4j_target_id"].nunique()
    print(f'{heading}: {len(pairs):,} GO-gene pairs')
    print(f'      {n_go_terms} unique GO terms')
    print(f'      {n_genes} unique genes')

In [None]:
# Load ALL BP-Gene associations for promiscuity calculation.
# Promiscuity is counted over the full annotation set rather than the
# filtered GO terms, so that each gene's term count is accurate.

# For 2016: load the original Hetionet BP-Gene edges from BPpG.csv
all_bpg_2016 = pd.read_csv(repo_root / 'input/BPpG.csv')

# Neo4j ID lookup tables for GO terms and genes (the 2024 cell reuses both)
neo4j_go = pd.read_csv(repo_root / 'input/hetionet_neo4j_go_ids_nr.csv')
neo4j_genes = pd.read_csv(repo_root / 'input/hetionet_neo4j_genes_ids_nr.csv')

# BPpG.csv target_id is Hetionet's internal node index; it is mapped to an
# Entrez Gene ID first and then to the Neo4j API node ID.
# NOTE(review): using the DataFrame index as that internal ID assumes the row
# order of Gene.tsv matches Hetionet's node numbering - confirm against the
# export that produced BPpG.csv.
gene_mapping = pd.read_csv(repo_root / 'input/Gene.tsv', sep='\t')
gene_mapping['hetionet_internal_id'] = gene_mapping.index

# Map BPpG target_id (Hetionet index) to Entrez Gene ID.
# Inner joins are used throughout, so edges without a mapping are dropped.
all_bpg_2016 = all_bpg_2016.merge(
    gene_mapping[['hetionet_internal_id', 'identifier']],
    left_on='target_id',
    right_on='hetionet_internal_id',
    how='inner'
)
all_bpg_2016 = all_bpg_2016.rename(columns={'identifier': 'entrez_gene_id'})
all_bpg_2016['entrez_gene_id'] = all_bpg_2016['entrez_gene_id'].astype(int)

# Map Entrez Gene ID to Neo4j API node ID
all_bpg_2016 = all_bpg_2016.merge(
    neo4j_genes[['entrez_gene_id', 'neo4j_target_id']],
    on='entrez_gene_id',
    how='inner'
)

# Map GO ID to Neo4j source ID
all_bpg_2016 = all_bpg_2016.merge(
    neo4j_go[['go_id', 'neo4j_source_id']],
    left_on='source_id',
    right_on='go_id',
    how='inner'
)

# Keep only the needed columns and drop repeated (GO term, gene) rows so a
# duplicated edge or lookup key cannot inflate a gene's promiscuity count.
# This mirrors the de-duplication applied when all_bpg_2024 is built.
all_bpg_2016 = all_bpg_2016[
    ['go_id', 'neo4j_source_id', 'neo4j_target_id']
].drop_duplicates()

print(f'\nAll 2016 BP-Gene associations: {len(all_bpg_2016):,}')
print(f'  {all_bpg_2016["go_id"].nunique()} GO terms')
print(f'  {all_bpg_2016["neo4j_target_id"].nunique()} genes')

In [None]:
# For 2024: load the updated GO annotations from the remote source.
# NOTE(review): the URL points at a branch head (refs/heads/gh-pages), so the
# downloaded content can change between runs - consider pinning a commit if
# exact reproducibility matters.
upd_go_2024_raw = pd.read_csv(
    'https://raw.githubusercontent.com/NegarJanani/gene-ontology/refs/heads/gh-pages/annotations/taxid_9606/GO_annotations-9606-inferred-allev.tsv',
    sep='\t'
)

# Expand the pipe-separated gene IDs and symbols into one row per gene
# (same as notebook 1.1). Both columns are exploded together so each ID
# stays aligned with its symbol.
exp_df = upd_go_2024_raw.assign(
    gene_id=upd_go_2024_raw['gene_ids'].str.split('|'),
    gene_symbol=upd_go_2024_raw['gene_symbols'].str.split('|')
)
upd_go_2024_df = exp_df.explode(['gene_id', 'gene_symbol'])

# Clean data: trim whitespace and drop the literal '...' placeholder entries
# before converting IDs to integers.
upd_go_2024_df['gene_id'] = upd_go_2024_df['gene_id'].str.strip()
upd_go_2024_df['gene_symbol'] = upd_go_2024_df['gene_symbol'].str.strip()
# .copy() detaches the filtered rows from the parent frame; assigning a
# column on the bare slice would raise SettingWithCopyWarning.
upd_go_2024_df = upd_go_2024_df[upd_go_2024_df['gene_id'] != '...'].copy()
upd_go_2024_df['gene_id'] = upd_go_2024_df['gene_id'].astype(int)

# Filter to Biological Process
upd_go_bp_2024_df = upd_go_2024_df[
    upd_go_2024_df['go_domain'] == 'biological_process'
].copy()
# Keep the columns used downstream and rename gene_id to the Entrez column
# name shared with the Neo4j gene lookup table.
upd_go_bp_2024_df = upd_go_bp_2024_df[
    ['go_id', 'go_name', 'gene_id', 'gene_symbol']
].rename(columns={'gene_id': 'entrez_gene_id'})

# Filter to Hetionet genes only
hetio_genes_df = pd.read_csv(repo_root / 'input/Gene.tsv', sep='\t')
hetio_genes = hetio_genes_df['identifier'].unique()
upd_go_bp_2024_df = upd_go_bp_2024_df[
    upd_go_bp_2024_df['entrez_gene_id'].isin(hetio_genes)
]

# Map to Neo4j IDs (inner joins: rows without a mapping are dropped)
upd_go_bp_2024_df = upd_go_bp_2024_df.merge(
    neo4j_genes[['entrez_gene_id', 'neo4j_target_id']],
    on='entrez_gene_id',
    how='inner'
)

upd_go_bp_2024_df = upd_go_bp_2024_df.merge(
    neo4j_go[['go_id', 'neo4j_source_id']],
    on='go_id',
    how='inner'
)

# One row per distinct (GO term, gene) association
all_bpg_2024 = upd_go_bp_2024_df[['go_id', 'neo4j_source_id', 'neo4j_target_id']]
all_bpg_2024 = all_bpg_2024.drop_duplicates()

print(f'\nAll 2024 BP-Gene associations: {len(all_bpg_2024):,}')
print(f'  {all_bpg_2024["go_id"].nunique()} GO terms')
print(f'  {all_bpg_2024["neo4j_target_id"].nunique()} genes')

## Calculate Gene Promiscuity

Count how many GO terms each gene belongs to across the full dataset.

In [None]:
# Per-gene promiscuity for 2016, computed over the full 2016 annotation set.
promiscuity_2016 = calculate_gene_promiscuity(
    all_bpg_2016, go_id_col='go_id', gene_id_col='neo4j_target_id'
)

print('2016 Gene Promiscuity Statistics')
print('=' * 80)
print(promiscuity_2016['promiscuity'].describe())

# List the ten genes with the highest promiscuity values.
print('\nMost promiscuous genes (2016):')
top_2016 = promiscuity_2016.nlargest(10, 'promiscuity')
for _, gene_row in top_2016.iterrows():
    gene_label = gene_row['neo4j_target_id']
    term_count = gene_row['promiscuity']
    print(f"  Gene {gene_label}: {term_count} GO terms")

In [None]:
# Per-gene promiscuity for 2024, computed over the full 2024 annotation set.
promiscuity_2024 = calculate_gene_promiscuity(
    all_bpg_2024, go_id_col='go_id', gene_id_col='neo4j_target_id'
)

print('2024 Gene Promiscuity Statistics')
print('=' * 80)
print(promiscuity_2024['promiscuity'].describe())

# List the ten genes with the highest promiscuity values.
print('\nMost promiscuous genes (2024):')
top_2024 = promiscuity_2024.nlargest(10, 'promiscuity')
for _, gene_row in top_2024.iterrows():
    gene_label = gene_row['neo4j_target_id']
    term_count = gene_row['promiscuity']
    print(f"  Gene {gene_label}: {term_count} GO terms")

## Generate Random Samples for 2016

Generate 5 independent random samples with different random seeds.

In [None]:
print('Generating 5 promiscuity-controlled random samples for 2016...')
print('=' * 80)

# Arguments shared by every 2016 draw; only the random seed varies.
sampling_kwargs_2016 = dict(
    go_gene_df=real_2016,
    all_go_annotations=all_bpg_2016,
    go_id_col='go_id',
    gene_id_col='neo4j_target_id',
    source_id_col='neo4j_source_id',
    promiscuity_tolerance=2,
)

random_samples_2016 = []

for i in range(1, 6):
    print(f'\nGenerating random sample {i}/5...')

    # Seeds 43..47: a distinct seed per sample.
    random_sample = generate_promiscuity_controlled_samples(
        **sampling_kwargs_2016, random_state=42 + i
    )
    random_samples_2016.append(random_sample)

    # Summarise the draw: size, coverage and mean promiscuity of the real
    # genes versus the sampled replacements.
    pair_count = len(random_sample)
    go_term_count = random_sample["go_id"].nunique()
    gene_count = random_sample["neo4j_pseudo_target_id"].nunique()
    print(f'  Generated {pair_count:,} pairs, '
          f'{go_term_count} GO terms, '
          f'{gene_count} unique genes')
    print(f'  Real promiscuity: mean={random_sample["real_promiscuity"].mean():.2f}')
    print(f'  Random promiscuity: mean={random_sample["sampled_promiscuity"].mean():.2f}')

print('\n2016 random samples complete')

## Generate Random Samples for 2024

Generate 5 independent random samples for 2024 data.

In [None]:
print('Generating 5 promiscuity-controlled random samples for 2024...')
print('=' * 80)

# Arguments shared by every 2024 draw; only the random seed varies.
sampling_kwargs_2024 = dict(
    go_gene_df=real_2024,
    all_go_annotations=all_bpg_2024,
    go_id_col='go_id',
    gene_id_col='neo4j_target_id',
    source_id_col='neo4j_source_id',
    promiscuity_tolerance=2,
)

random_samples_2024 = []

for i in range(1, 6):
    print(f'\nGenerating random sample {i}/5...')

    # Seeds 43..47: a distinct seed per sample.
    random_sample = generate_promiscuity_controlled_samples(
        **sampling_kwargs_2024, random_state=42 + i
    )
    random_samples_2024.append(random_sample)

    # Summarise the draw: size, coverage and mean promiscuity of the real
    # genes versus the sampled replacements.
    pair_count = len(random_sample)
    go_term_count = random_sample["go_id"].nunique()
    gene_count = random_sample["neo4j_pseudo_target_id"].nunique()
    print(f'  Generated {pair_count:,} pairs, '
          f'{go_term_count} GO terms, '
          f'{gene_count} unique genes')
    print(f'  Real promiscuity: mean={random_sample["real_promiscuity"].mean():.2f}')
    print(f'  Random promiscuity: mean={random_sample["sampled_promiscuity"].mean():.2f}')

print('\n2024 random samples complete')

## Validation

Verify that random samples have expected properties.

In [None]:
print('\n' + '=' * 80)
print('VALIDATION: Sample Sizes')
print('=' * 80)

# Each random sample must contain exactly as many genes per GO term as the
# real data. The per-term sizes are compared as {go_id: count} mappings:
# an element-wise `==` between two Series raises ValueError when their
# indexes differ (e.g. a GO term missing from the sample), which would abort
# the check instead of reporting FAIL.

# Validate all 2016 samples (real sizes are loop-invariant, so compute once)
real_2016_sizes = real_2016.groupby('go_id').size()
expected_2016 = real_2016_sizes.to_dict()
for i, random_2016 in enumerate(random_samples_2016, start=1):
    random_2016_sizes = random_2016.groupby('go_id').size()

    if random_2016_sizes.to_dict() == expected_2016:
        print(f'2016 Sample {i}: Sample sizes match (PASS)')
    else:
        print(f'2016 Sample {i}: Sample sizes differ (FAIL)')

# Validate all 2024 samples
real_2024_sizes = real_2024.groupby('go_id').size()
expected_2024 = real_2024_sizes.to_dict()
for i, random_2024 in enumerate(random_samples_2024, start=1):
    random_2024_sizes = random_2024.groupby('go_id').size()

    if random_2024_sizes.to_dict() == expected_2024:
        print(f'2024 Sample {i}: Sample sizes match (PASS)')
    else:
        print(f'2024 Sample {i}: Sample sizes differ (FAIL)')

In [None]:
print('\n' + '=' * 80)
print('VALIDATION: No Overlap with Real Genes per GO Term')
print('=' * 80)

# A sampled gene must never be a real member of the GO term it was drawn
# for. The overlap is the number of (go_id, gene) pairs present in both the
# real data and the sample. Intersecting pair sets gives the same count as
# summing per-term set intersections, without re-filtering both frames once
# per GO term for every sample.

# Check all 2016 samples (the real pair set is shared by every sample)
real_pairs_2016 = set(zip(real_2016['go_id'], real_2016['neo4j_target_id']))
for i, random_2016 in enumerate(random_samples_2016, start=1):
    random_pairs = set(
        zip(random_2016['go_id'], random_2016['neo4j_pseudo_target_id'])
    )
    overlap_count = len(real_pairs_2016 & random_pairs)

    if overlap_count == 0:
        print(f'2016 Sample {i}: No overlap (PASS)')
    else:
        print(f'2016 Sample {i}: Found {overlap_count} overlaps (FAIL)')

# Check all 2024 samples
real_pairs_2024 = set(zip(real_2024['go_id'], real_2024['neo4j_target_id']))
for i, random_2024 in enumerate(random_samples_2024, start=1):
    random_pairs = set(
        zip(random_2024['go_id'], random_2024['neo4j_pseudo_target_id'])
    )
    overlap_count = len(real_pairs_2024 & random_pairs)

    if overlap_count == 0:
        print(f'2024 Sample {i}: No overlap (PASS)')
    else:
        print(f'2024 Sample {i}: Found {overlap_count} overlaps (FAIL)')

In [None]:
print('\n' + '=' * 80)
print('VALIDATION: All Random Genes Are Annotated')
print('=' * 80)

# Every gene that appears in the full annotation table of each year.
all_annotated_2016 = set(all_bpg_2016['neo4j_target_id'])
all_annotated_2024 = set(all_bpg_2024['neo4j_target_id'])

# For each year, every sampled gene must be in that year's annotated set;
# all 2016 samples are reported first, then all 2024 samples.
for year, year_samples, annotated in (
    ('2016', random_samples_2016, all_annotated_2016),
    ('2024', random_samples_2024, all_annotated_2024),
):
    for i, sample in enumerate(year_samples, start=1):
        unannotated = set(sample['neo4j_pseudo_target_id']).difference(annotated)

        if not unannotated:
            print(f'{year} Sample {i}: All genes annotated (PASS)')
        else:
            print(f'{year} Sample {i}: Found {len(unannotated)} unannotated (FAIL)')

## Save Outputs

In [None]:
print('\nSaving random samples...')

random_dir_2016 = repo_root / 'output/random_samples/dataset2_2016'
random_dir_2024 = repo_root / 'output/random_samples/dataset2_2024'

# Write each sample as random_001.csv, random_002.csv, ... in its year's
# directory: all 2016 samples first, then all 2024 samples.
for target_dir, year_samples in (
    (random_dir_2016, random_samples_2016),
    (random_dir_2024, random_samples_2024),
):
    for i, random_sample in enumerate(year_samples, start=1):
        file_name = f'random_{i:03d}.csv'
        random_sample.to_csv(target_dir / file_name, index=False)
        print(f'  Saved {file_name}: {len(random_sample):,} pairs')

# Closing banner and a recap of the control datasets and their locations.
banner = '=' * 80
print('\n' + banner)
print('NOTEBOOK 1.5 COMPLETE')
print(banner)

for summary_line in (
    '\nGenerated Control Datasets:',
    '  Permuted datasets (1.4): 5 permutations per year (shuffle GO labels)',
    '  Random datasets (1.5): 5 random samples per year (promiscuity-controlled)',
    '  Total: 20 control datasets + 2 real datasets = 22 datasets',
    '\nOutput Files:',
    '  output/permutations/dataset2_2016/perm_001.csv through perm_005.csv',
    '  output/permutations/dataset2_2024/perm_001.csv through perm_005.csv',
    '  output/random_samples/dataset2_2016/random_001.csv through random_005.csv',
    '  output/random_samples/dataset2_2024/random_001.csv through random_005.csv',
    '\nNext Steps:',
    '  Run notebook 2 to compute DWPC for all 22 datasets',
    '  Expected runtime: Variable (depends on API performance)',
    '  Each dataset takes 30-60 minutes',
):
    print(summary_line)