# CDMSCI-199: Condition-Specific Gap-filling Experiment

## Objective

For each of the 571 false-negative cases (experimental growth = 1 but model-predicted growth = 0), run condition-specific gap-filling and track:
- Did gap-filling succeed?
- How many reactions were added?
- Which specific reactions were added?
- Is this meaningful biology or overfitting?

## Research Question

**Can condition-specific gap-filling rescue false negatives, and if so, is it adding meaningful biology or just overfitting?**

## Runtime

Estimated: 5-10 hours for 571 gap-filling experiments

## Outputs

1. `results/condition_specific_gapfilling_results.csv` - Main results
2. `results/condition_specific_gapfilling_reactions.csv` - Detailed reactions
3. `results/condition_specific_gapfilling_errors.csv` - Error log
4. `results/condition_specific_gapfilling_summary.png` - Summary figure

## Setup and Imports

In [None]:
import cobra
from cobra.flux_analysis import gapfill
import pandas as pd
import json
from pathlib import Path
from tqdm.notebook import tqdm
import time
import warnings
# Suppress all Python warnings for the rest of the session
warnings.filterwarnings('ignore')

# Report the library versions in use so the run can be reproduced
print(f"COBRApy version: {cobra.__version__}")
print(f"Pandas version: {pd.__version__}")

## Load Input Data

In [None]:
# Input locations (all given as relative paths)
false_negatives_file = Path('results/false_negatives.csv')
media_dir = Path('../CDMSCI-197-media-formulations/media')
models_dir = Path('../CDMSCI-198-build-models/models')
universal_model_path = Path('../CDMSCI-198-build-models/GramNegModelTemplateV6.json')

# Read the false-negative table; each row is one experiment to gap-fill
fn_df = pd.read_csv(false_negatives_file)
print(f"Loaded {len(fn_df)} false negatives to process")

# Show a short preview of the key columns
preview_columns = ['organism', 'carbon_source', 'biomass_flux']
print("\nFirst 5 FNs:")
print(fn_df.loc[:, preview_columns].head(5))

In [None]:
# Make sure every false negative carries an orgId (used to locate the draft
# model file). The lookup is applied with `map` rather than `merge` so that:
#   * re-running this cell, or an input that already has an orgId column,
#     does not produce orgId_x / orgId_y columns and a KeyError below;
#   * duplicate organism rows in the metadata cannot multiply FN rows.
org_metadata = pd.read_csv('results/organism_metadata.csv')

org_pairs = org_metadata[['organism', 'orgId']]
n_duplicate_orgs = int(org_pairs['organism'].duplicated().sum())
if n_duplicate_orgs > 0:
    # Ambiguous metadata: keep the first orgId listed for each organism
    print(f"WARNING: {n_duplicate_orgs} duplicate organism rows in metadata; using first orgId for each")
org_lookup = org_pairs.drop_duplicates(subset='organism').set_index('organism')['orgId']

looked_up_ids = fn_df['organism'].map(org_lookup)
if 'orgId' in fn_df.columns:
    # Column already present: only fill the gaps, keep existing values
    fn_df['orgId'] = fn_df['orgId'].fillna(looked_up_ids)
else:
    fn_df['orgId'] = looked_up_ids

print(f"Added orgId column")
n_missing_org_id = fn_df['orgId'].isna().sum()
print(f"Missing orgId: {n_missing_org_id}")
if n_missing_org_id > 0:
    print("\nOrganisms with missing orgId:")
    print(fn_df[fn_df['orgId'].isna()]['organism'].unique())

## Load Universal Model Template

In [None]:
# Load the universal reaction template from JSON; abort the notebook if it
# cannot be read, since nothing downstream can run without it.
print("Loading universal model template...")
try:
    universal = cobra.io.load_json_model(str(universal_model_path))
    print("✓ Universal model loaded")
    # Report the size of each component collection
    for label, attribute in (
        ("Reactions", "reactions"),
        ("Metabolites", "metabolites"),
        ("Genes", "genes"),
    ):
        print(f"  {label}: {len(getattr(universal, attribute)):,}")
except Exception as load_error:
    print(f"✗ ERROR: Could not load universal model: {load_error}")
    raise

## Configuration

In [None]:
# Biomass flux must exceed this value to count as growth
GROWTH_THRESHOLD = 0.001  # h^-1

# Accumulators filled by the gap-filling loop
results, reaction_details, errors = [], [], []

# Rough runtime range, computed at 30 s and 60 s per experiment
n_planned = len(fn_df)
est_hours_low = n_planned * 30 / 3600
est_hours_high = n_planned * 60 / 3600

print("Configuration:")
print(f"  Growth threshold: {GROWTH_THRESHOLD} h^-1")
print(f"  Total experiments: {n_planned}")
print(f"  Estimated time: {est_hours_low:.1f} - {est_hours_high:.1f} hours")

## Run Gap-filling Experiments

**This will take 5-10 hours. The progress bar will update in real time.**

You can:
- Leave this running and check back later
- Monitor the progress bar
- Stop anytime (Ctrl+C in terminal or Interrupt kernel)

In [None]:
def _log_error(organism, org_id, carbon_source, message):
    """Append one entry to the global `errors` list for a skipped or failed experiment."""
    errors.append({
        'organism': organism,
        'orgId': org_id,
        'carbon_source': carbon_source,
        'error': message
    })


def _locate_media(carbon_source):
    """Return the first existing media JSON path for `carbon_source`, or None.

    Tries, in order: the raw name, the name with spaces replaced by
    underscores, and the name with commas removed and spaces replaced.
    """
    candidate_names = [
        f"{carbon_source}.json",
        f"{carbon_source.replace(' ', '_')}.json",
        f"{carbon_source.replace(',', '').replace(' ', '_')}.json",
    ]
    for name in candidate_names:
        candidate = media_dir / name
        if candidate.exists():
            return candidate
    return None


def _optimal_flux(solution):
    """Return the objective value of `solution`, or 0.0 if it is not usable.

    The objective value is only trusted when the solver status is 'optimal';
    for any other status (or a NaN/None value) the flux is reported as 0.0
    so that it cannot be mistaken for growth.
    """
    if solution.status != 'optimal':
        return 0.0
    flux = solution.objective_value
    return 0.0 if pd.isna(flux) else flux


# Track timing
start_time = time.time()

print(f"Starting {len(fn_df)} gap-filling experiments...")
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
print()

# Main loop with progress bar: one gap-filling experiment per false negative
for _, row in tqdm(fn_df.iterrows(), total=len(fn_df), desc="Gap-filling FNs"):
    org_id = row.get('orgId')
    organism = row['organism']
    carbon_source = row['carbon_source']

    # Without an orgId the draft model file cannot be located
    if pd.isna(org_id):
        _log_error(organism, '', carbon_source, 'Missing orgId')
        continue

    # Check that both input files exist before loading anything
    draft_model_path = models_dir / f'{org_id}_draft.json'
    if not draft_model_path.exists():
        _log_error(organism, org_id, carbon_source, 'Draft model file not found')
        continue

    # Media files use several naming conventions; try each in turn
    media_path = _locate_media(carbon_source)
    if media_path is None:
        _log_error(organism, org_id, carbon_source, 'Media file not found')
        continue

    # Load draft model
    try:
        model = cobra.io.load_json_model(str(draft_model_path))
    except Exception as e:
        _log_error(organism, org_id, carbon_source, f'Model load error: {str(e)[:100]}')
        continue

    # Load media
    try:
        with open(media_path, 'r') as f:
            media_dict = json.load(f)
    except Exception as e:
        _log_error(organism, org_id, carbon_source, f'Media load error: {str(e)[:100]}')
        continue

    # Apply media
    try:
        model.medium = media_dict
    except Exception as e:
        _log_error(organism, org_id, carbon_source, f'Media application error: {str(e)[:100]}')
        continue

    # Test pre-gapfill flux. A solver failure is treated as "no growth" so the
    # experiment still proceeds; only Exception is caught so that a kernel
    # interrupt (KeyboardInterrupt) still stops the run.
    try:
        pre_gapfill_flux = _optimal_flux(model.optimize())
    except Exception:
        pre_gapfill_flux = 0.0

    if pre_gapfill_flux > GROWTH_THRESHOLD:
        # Shouldn't happen for a false negative
        _log_error(
            organism, org_id, carbon_source,
            f'Draft already grows (flux={pre_gapfill_flux:.4f})'
        )
        continue

    # Run gap-filling against the universal template (demand reactions disabled)
    try:
        solutions = gapfill(model, universal, demand_reactions=False)

        if len(solutions) > 0:
            # Take first solution
            gapfill_reactions = list(solutions[0])
            num_reactions_added = len(gapfill_reactions)

            # Add copies of the proposed reactions to the draft model
            model.add_reactions([reaction.copy() for reaction in gapfill_reactions])

            # Re-optimize to confirm the gap-filled model now grows
            post_gapfill_flux = _optimal_flux(model.optimize())
            gapfill_success = post_gapfill_flux > GROWTH_THRESHOLD

            # Record result
            results.append({
                'organism': organism,
                'orgId': org_id,
                'carbon_source': carbon_source,
                'media_filename': media_path.name,
                'pre_gapfill_flux': pre_gapfill_flux,
                'post_gapfill_flux': post_gapfill_flux,
                'gapfill_success': gapfill_success,
                'num_reactions_added': num_reactions_added,
                'reactions_added': ';'.join([r.id for r in gapfill_reactions]),
                'gapfill_solutions_count': len(solutions)
            })

            # Record detailed reactions (one row per added reaction)
            for reaction in gapfill_reactions:
                reaction_details.append({
                    'organism': organism,
                    'orgId': org_id,
                    'carbon_source': carbon_source,
                    'reaction_id': reaction.id,
                    'reaction_name': reaction.name,
                    'reaction_formula': reaction.build_reaction_string(),
                    'subsystem': reaction.subsystem
                })
        else:
            # No solution found
            results.append({
                'organism': organism,
                'orgId': org_id,
                'carbon_source': carbon_source,
                'media_filename': media_path.name,
                'pre_gapfill_flux': pre_gapfill_flux,
                'post_gapfill_flux': 0.0,
                'gapfill_success': False,
                'num_reactions_added': 0,
                'reactions_added': '',
                'gapfill_solutions_count': 0
            })

    except Exception as e:
        _log_error(organism, org_id, carbon_source, f'Gap-filling error: {str(e)[:100]}')

# Done!
elapsed_time = time.time() - start_time
print(f"\n{'='*80}")
print(f"COMPLETED!")
print(f"{'='*80}")
print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total time: {elapsed_time/60:.1f} minutes ({elapsed_time/3600:.2f} hours)")
print(f"Experiments completed: {len(results)}")
print(f"Errors: {len(errors)}")

## Save Results

In [None]:
# Explicit column lists keep the CSV headers stable even when a list is empty
# (a DataFrame built from an empty list otherwise has no columns at all).
RESULT_COLUMNS = [
    'organism', 'orgId', 'carbon_source', 'media_filename',
    'pre_gapfill_flux', 'post_gapfill_flux', 'gapfill_success',
    'num_reactions_added', 'reactions_added', 'gapfill_solutions_count',
]
REACTION_COLUMNS = [
    'organism', 'orgId', 'carbon_source',
    'reaction_id', 'reaction_name', 'reaction_formula', 'subsystem',
]
ERROR_COLUMNS = ['organism', 'orgId', 'carbon_source', 'error']

# Save main results
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
# Keep the success flag boolean even for an empty frame (object dtype otherwise)
results_df['gapfill_success'] = results_df['gapfill_success'].astype(bool)
results_df.to_csv('results/condition_specific_gapfilling_results.csv', index=False)
print(f"✓ Saved main results: results/condition_specific_gapfilling_results.csv")
print(f"  Rows: {len(results_df):,}")

# Save detailed reactions
reactions_df = pd.DataFrame(reaction_details, columns=REACTION_COLUMNS)
reactions_df.to_csv('results/condition_specific_gapfilling_reactions.csv', index=False)
print(f"✓ Saved detailed reactions: results/condition_specific_gapfilling_reactions.csv")
print(f"  Rows: {len(reactions_df):,}")

# Save errors. The file is always written (header only when there are none)
# so that an error log left over from an earlier run is never mistaken for
# this run's errors.
errors_df = pd.DataFrame(errors, columns=ERROR_COLUMNS)
errors_df.to_csv('results/condition_specific_gapfilling_errors.csv', index=False)
print(f"✓ Saved errors: results/condition_specific_gapfilling_errors.csv")
print(f"  Rows: {len(errors_df):,}")

## Summary Statistics

In [None]:
# Print headline counts for the run. All column access is guarded by the
# row count: when no experiment completed, the results frame may have no
# 'gapfill_success' column and the percentages would be undefined.
print(f"{'='*80}")
print(f"SUMMARY STATISTICS")
print(f"{'='*80}")
print()

n_results = len(results_df)
print(f"Total experiments: {n_results}")
if n_results > 0:
    # Coerce to bool so that `~` is a logical NOT regardless of column dtype
    succeeded = results_df['gapfill_success'].astype(bool)
    print(f"Successful gap-filling: {succeeded.sum()} ({100*succeeded.mean():.1f}%)")
    print(f"Failed gap-filling: {(~succeeded).sum()} ({100*(~succeeded).mean():.1f}%)")
else:
    print("No gap-filling experiments completed; check the error log.")
print(f"Errors: {len(errors)}")
print()

if n_results > 0:
    print("Reactions added statistics:")
    print(results_df['num_reactions_added'].describe())
    print()
    print(f"Mean reactions added: {results_df['num_reactions_added'].mean():.1f}")
    print(f"Median reactions added: {results_df['num_reactions_added'].median():.1f}")
    print(f"Max reactions added: {results_df['num_reactions_added'].max()}")

## Top Reactions Added

In [None]:
# List the 30 reactions that gap-filling added most often, with each count
# expressed as a percentage of all recorded experiments.
if len(reactions_df) > 0:
    from collections import Counter

    rxn_counts = Counter(reactions_df['reaction_id'])
    n_results = len(results_df)

    # First recorded name for every reaction ID, built once for fast lookup
    first_name_by_id = (
        reactions_df
        .drop_duplicates(subset='reaction_id')
        .set_index('reaction_id')['reaction_name']
    )

    print("Top 30 most frequently added reactions:")
    print()
    for rank, (reaction_id, times_added) in enumerate(rxn_counts.most_common(30), start=1):
        share = 100 * times_added / n_results
        label = first_name_by_id.loc[reaction_id]
        print(f"{rank:2d}. {reaction_id:20s} ({label[:50]:50s}): {times_added:4d} times ({share:5.1f}%)")

## Quick Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')

# 2x2 overview figure: outcome counts, reactions added, post-gapfill flux,
# and per-organism success rate
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Success vs failure.
# value_counts() orders by frequency, so the bars are reindexed to a fixed
# [False, True] order; otherwise the hard-coded 'Failed'/'Success' labels and
# red/green colours end up on the wrong bars whenever successes outnumber
# failures, and a missing outcome would leave only one bar for two labels.
outcome_counts = (
    results_df['gapfill_success']
    .astype(bool)
    .value_counts()
    .reindex([False, True], fill_value=0)
)
outcome_counts.plot(kind='bar', ax=axes[0, 0], color=['#e74c3c', '#27ae60'])
axes[0, 0].set_title('Gap-filling Success Rate', fontweight='bold')
axes[0, 0].set_xlabel('Success')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_xticklabels(['Failed', 'Success'], rotation=0)

# Reactions added distribution
results_df[results_df['gapfill_success']]['num_reactions_added'].hist(bins=30, ax=axes[0, 1], color='steelblue')
axes[0, 1].set_title('Distribution of Reactions Added (Successful Only)', fontweight='bold')
axes[0, 1].set_xlabel('Number of Reactions Added')
axes[0, 1].set_ylabel('Count')

# Post-gapfill flux distribution
results_df[results_df['gapfill_success']]['post_gapfill_flux'].hist(bins=30, ax=axes[1, 0], color='forestgreen')
axes[1, 0].set_title('Post-Gapfill Biomass Flux (Successful Only)', fontweight='bold')
axes[1, 0].set_xlabel('Biomass Flux (h^-1)')
axes[1, 0].set_ylabel('Count')

# Success rate by organism (20 organisms with the highest success rate)
org_success = results_df.groupby('organism')['gapfill_success'].agg(['sum', 'count', 'mean']).sort_values('mean', ascending=False).head(20)
org_success['mean'].plot(kind='barh', ax=axes[1, 1], color='coral')
axes[1, 1].set_title('Top 20 Organisms by Gap-fill Success Rate', fontweight='bold')
axes[1, 1].set_xlabel('Success Rate')

plt.tight_layout()
plt.savefig('results/condition_specific_gapfilling_summary.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Saved: results/condition_specific_gapfilling_summary.png")

## Experiment Complete!

Next steps:
1. Review `results/condition_specific_gapfilling_results.csv`
2. Analyze which reactions are most frequently added
3. Assess biological plausibility of added reactions
4. Compare to pyruvate gap-filling reactions
5. Make recommendations on multi-condition gap-filling approach