# Condition-Specific Gap-filling (Proper Protocol)

**Ticket**: CDMSCI-199

## Objective

For each false negative (experimental growth = 1, model predicted = 0), run condition-specific gap-filling using the **complete ModelSEEDpy protocol**:

1. Load draft model (after ATP correction from CDMSCI-198)
2. Apply specific carbon source media
3. Run ATP correction with Core-V5.2 for that media
4. Run genome-scale gap-filling with GramNegModelTemplateV6
5. Track all reactions added

## Research Question

**Can condition-specific gap-filling rescue false negatives, and is it adding meaningful biology or overfitting?**

## Workflow

1. Load false negatives from CDMSCI-199
2. Test on one organism × carbon source pair
3. Run on all 571 false negatives (estimated 5-10 hours)

## Setup

In [None]:
import cobra
from cobra.io import load_json_model, save_json_model
import pandas as pd
import json
from pathlib import Path
from tqdm.notebook import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# ModelSEEDpy imports
from modelseedpy import MSMedia, MSGapfill, MSBuilder, MSATPCorrection
from modelseedpy.core.mstemplate import MSTemplateBuilder
from modelseedpy.core.msmodel import get_reaction_constraints_from_direction
from modelseedpy.core.msatpcorrection import load_default_medias

print(f"Imports successful")

## Load Input Data

In [None]:
# Input locations, all relative to the notebook's working directory
models_dir = Path('../CDMSCI-198-build-models/models')
media_dir = Path('../CDMSCI-197-media-formulations/media')
false_negatives_file = Path('results/false_negatives.csv')
core_template_path = Path('../references/build_metabolic_model/Core-V5.2.json')
gramneg_template_path = Path('../references/build_metabolic_model/GramNegModelTemplateV6.json')

# One row per false-negative organism / carbon source pair
fn_df = pd.read_csv(false_negatives_file)

# Attach the orgId column by organism name. A left join keeps every
# false-negative row, leaving orgId as NaN where the metadata has no match.
org_metadata = pd.read_csv('results/organism_metadata.csv')
org_id_lookup = org_metadata[['organism', 'orgId']]
fn_df = fn_df.merge(org_id_lookup, on='organism', how='left')

# Report how many rows could not be matched to an orgId
missing_org_id_count = fn_df['orgId'].isna().sum()
print(f"Loaded {len(fn_df)} false negatives")
print(f"Missing orgId: {missing_org_id_count}")
print("\nFirst 5:")
print(fn_df[['organism', 'carbon_source', 'orgId']].head())

## Load Templates

In [None]:
def _build_template_from_json(template_path):
    """Parse the JSON file at *template_path* and build a template from it."""
    with open(template_path) as template_file:
        template_dict = json.load(template_file)
    return MSTemplateBuilder.from_dict(template_dict).build()


print("Loading templates...")

# Core-V5.2 for ATP correction
template_core = _build_template_from_json(core_template_path)
print(f"Core-V5.2: {len(template_core.reactions):,} reactions")

# GramNegModelTemplateV6 for genome-scale gapfilling
template_gramneg = _build_template_from_json(gramneg_template_path)
print(f"GramNegModelTemplateV6: {len(template_gramneg.reactions):,} reactions")

# Default ATP-correction medias shipped with ModelSEEDpy.
# NOTE(review): the cells visible here pass load_default_medias=False and
# never read this variable again - confirm whether it is still needed.
default_medias = load_default_medias()
print(f"Default medias for ATP correction: {len(default_medias)}")

## Helper Functions

In [None]:
def integrate_gapfill_solution(template, model, gapfill_result):
    """Add the reactions named in a gap-fill solution to *model*.

    Only the ``'new'`` mapping of *gapfill_result* (reaction id -> direction)
    is read. For each entry:

    * ids starting with ``EX_`` are skipped;
    * a single trailing ``'0'`` is stripped to obtain the template reaction id
      (presumably the compartment index suffix - TODO confirm);
    * entries whose resulting id is not in ``template.reactions`` are ignored.

    Each remaining template reaction is converted with ``to_reaction(model)``,
    given the bounds returned by ``get_reaction_constraints_from_direction``,
    and added to the model. ``MSBuilder.add_exchanges_to_model`` is then called
    on the model.

    NOTE(review): any other keys in *gapfill_result* (e.g. ``'reversed'``) are
    not applied here - confirm that is intended.

    Returns:
        tuple: ``(added_reactions, add_exchanges)`` - the list of reaction
        objects passed to ``model.add_reactions`` and whatever
        ``MSBuilder.add_exchanges_to_model`` returned.
    """
    # Pass 1: resolve solution ids to template ids and their (lb, ub) bounds.
    bounds_by_template_id = {}
    for solution_rxn_id, direction in gapfill_result.get('new', {}).items():
        if solution_rxn_id.startswith('EX_'):
            continue

        template_rxn_id = (
            solution_rxn_id[:-1] if solution_rxn_id.endswith('0') else solution_rxn_id
        )
        if template_rxn_id not in template.reactions:
            continue

        bounds_by_template_id[template_rxn_id] = get_reaction_constraints_from_direction(direction)

    # Pass 2: instantiate each template reaction for this model and set bounds.
    added_reactions = []
    for template_rxn_id, (lower, upper) in bounds_by_template_id.items():
        new_reaction = template.reactions.get_by_id(template_rxn_id).to_reaction(model)
        new_reaction.lower_bound = lower
        new_reaction.upper_bound = upper
        added_reactions.append(new_reaction)

    model.add_reactions(added_reactions)
    add_exchanges = MSBuilder.add_exchanges_to_model(model)

    return added_reactions, add_exchanges

def apply_media_to_model(media, model, prefix='EX_'):
    """Build a ``{exchange reaction id: uptake limit}`` dict for *media*.

    Despite the name, this function does not modify *model*; callers assign
    the returned dict themselves (e.g. to ``model.medium``).

    Args:
        media: object whose ``get_media_constraints()`` returns a mapping of
            compound id -> ``(lower_bound, upper_bound)``.
        model: object whose ``reactions`` supports ``in`` tests by reaction id.
        prefix: string prepended to each compound id to form the exchange
            reaction id.

    Returns:
        dict: for every compound whose exchange reaction exists in the model,
        the absolute value of its lower bound as a float. The upper bound is
        not used.
    """
    import math

    constraints = media.get_media_constraints()
    return {
        f'{prefix}{compound_id}': math.fabs(lower)
        for compound_id, (lower, _upper) in constraints.items()
        if f'{prefix}{compound_id}' in model.reactions
    }

print("Helper functions defined")

## Test on One False Negative

Test the complete workflow on one organism × carbon source before running on all 571.

In [None]:
# Smoke test: run the whole protocol on the first false-negative row and
# print the biomass flux after each stage.
test_row = fn_df.iloc[0]
test_organism = test_row['organism']
test_org_id = test_row['orgId']
test_carbon_source = test_row['carbon_source']

print("=" * 80)
print("TESTING CONDITION-SPECIFIC GAP-FILLING")
print("=" * 80)
print(f"Organism: {test_organism} ({test_org_id})")
print(f"Carbon source: {test_carbon_source}")
print()

# Load draft model
draft_model_path = models_dir / f'{test_org_id}_draft.json'
model = load_json_model(str(draft_model_path))
# Remember the starting size so the ATP-correction delta can be reported
# later without re-reading the JSON file from disk.
draft_reaction_count = len(model.reactions)
print(f"Step 1: Loaded draft model ({draft_reaction_count} reactions)")
print()

# Load carbon source media: try the name as-is, then with spaces -> underscores
possible_names = [
    f"{test_carbon_source}.json",
    f"{test_carbon_source.replace(' ', '_')}.json",
]
media_path = None
for name in possible_names:
    test_path = media_dir / name
    if test_path.exists():
        media_path = test_path
        break

# Fail with a clear message instead of letting open(None) raise a TypeError
if media_path is None:
    raise FileNotFoundError(
        f"No media file for carbon source '{test_carbon_source}' in {media_dir} "
        f"(tried: {', '.join(possible_names)})"
    )

with open(media_path, 'r') as f:
    media_dict = json.load(f)
carbon_media = MSMedia.from_dict(media_dict)
print(f"Step 2: Loaded media ({media_path.name})")
print()

# Test pre-gapfill
model.medium = apply_media_to_model(carbon_media, model)
model.objective = 'bio1'
pre_solution = model.optimize()
pre_flux = pre_solution.objective_value
print(f"Step 3: Pre-gapfill flux = {pre_flux:.6f}")
print()

# ATP Correction for this specific media
print(f"Step 4: ATP correction with Core-V5.2 for {test_carbon_source}...")
# Create test medias list with just this carbon source
test_medias = [(carbon_media, 0.01)]

atp_correction = MSATPCorrection(
    model,
    template_core,
    test_medias,
    compartment='c0',
    atp_hydrolysis_id='ATPM_c0',
    load_default_medias=False
)

media_eval = atp_correction.evaluate_growth_media()
atp_correction.determine_growth_media()
atp_correction.apply_growth_media_gapfilling()
atp_correction.expand_model_to_genome_scale()
tests = atp_correction.build_tests()

atp_reactions_added = len(model.reactions) - draft_reaction_count
print(f"  ATP correction added {atp_reactions_added} reactions")
print()

# Test after ATP correction
model.medium = apply_media_to_model(carbon_media, model)
model.objective = 'bio1'
post_atp_solution = model.optimize()
post_atp_flux = post_atp_solution.objective_value
print(f"Step 5: Post-ATP-correction flux = {post_atp_flux:.6f}")
print()

# Genome-scale gap-filling if needed
if post_atp_flux < 0.001:
    print("Step 6: Genome-scale gap-filling with GramNegModelTemplateV6...")
    gapfiller = MSGapfill(
        model,
        default_gapfill_templates=[template_gramneg],
        test_conditions=tests,
        default_target='bio1'
    )

    gapfill_result = gapfiller.run_gapfilling(carbon_media)
    # Treat a missing (None) result the same as an empty solution rather than
    # crashing on .get().
    num_gapfilled = len((gapfill_result or {}).get('new', {}))
    print(f"  Found {num_gapfilled} reactions to add")

    if num_gapfilled > 0:
        added_rxns, added_exch = integrate_gapfill_solution(template_gramneg, model, gapfill_result)
        print(f"  Integrated {len(added_rxns)} reactions, {len(added_exch)} exchanges")

        model.medium = apply_media_to_model(carbon_media, model)
        model.objective = 'bio1'
        post_solution = model.optimize()
        post_flux = post_solution.objective_value
        print(f"  Post-gapfill flux = {post_flux:.6f}")
    else:
        post_flux = post_atp_flux
        print("  No gapfill solution found")
else:
    print("Step 6: No genome-scale gap-filling needed (already grows)")
    post_flux = post_atp_flux
    num_gapfilled = 0

print()
print("=" * 80)
print("TEST COMPLETE")
print("=" * 80)
print(f"Pre-gapfill flux: {pre_flux:.6f}")
print(f"Post-ATP-correction flux: {post_atp_flux:.6f}")
print(f"Post-gapfill flux: {post_flux:.6f}")
print(f"Success: {post_flux > 0.001}")

## Run on All False Negatives

Run condition-specific gap-filling on all 571 false negatives.

**This will take 5-10 hours.**

In [None]:
# Biomass flux above this value counts as growth
GROWTH_THRESHOLD = 0.001


def _biomass_flux(cobra_model, media):
    """Apply *media*, maximise 'bio1', and return the flux (0.0 if unsolved).

    A solve whose status is not 'optimal' (or whose objective value is
    None/NaN) carries no meaningful growth rate. Returning 0.0 keeps such
    cases from slipping past the ``< GROWTH_THRESHOLD`` checks below, where a
    NaN would compare False and be recorded as growth.
    """
    cobra_model.medium = apply_media_to_model(media, cobra_model)
    cobra_model.objective = 'bio1'
    solution = cobra_model.optimize()
    if solution.status != 'optimal' or pd.isna(solution.objective_value):
        return 0.0
    return solution.objective_value


results = []           # one dict per completed experiment
reaction_details = []  # one dict per gap-filled reaction
errors = []            # one dict per skipped or failed experiment

start_time = time.time()

print(f"Starting {len(fn_df)} condition-specific gap-filling experiments...")
print(f"Start time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
print()

for idx, row in tqdm(fn_df.iterrows(), total=len(fn_df), desc="Gap-filling FNs"):
    org_id = row.get('orgId')
    organism = row['organism']
    carbon_source = row['carbon_source']
    # Fields shared by every error record for this row
    case = {'organism': organism, 'carbon_source': carbon_source}

    if pd.isna(org_id):
        errors.append({**case, 'error': 'Missing orgId'})
        continue

    # Load draft model
    draft_model_path = models_dir / f'{org_id}_draft.json'
    if not draft_model_path.exists():
        errors.append({**case, 'error': 'Model not found'})
        continue

    # Load media: try the name as-is, then with spaces replaced by underscores
    possible_names = [f"{carbon_source}.json", f"{carbon_source.replace(' ', '_')}.json"]
    media_path = None
    for name in possible_names:
        test_path = media_dir / name
        if test_path.exists():
            media_path = test_path
            break

    if media_path is None:
        errors.append({**case, 'error': 'Media not found'})
        continue

    try:
        # Load a fresh copy of the model so edits never leak between rows
        model = load_json_model(str(draft_model_path))
        with open(media_path, 'r') as f:
            media_dict = json.load(f)
        carbon_media = MSMedia.from_dict(media_dict)

        # Pre-gapfill test
        pre_flux = _biomass_flux(model, carbon_media)

        if pre_flux > GROWTH_THRESHOLD:
            errors.append({**case, 'error': 'Already grows'})
            continue

        # ATP correction
        test_medias = [(carbon_media, 0.01)]
        atp_correction = MSATPCorrection(
            model, template_core, test_medias,
            compartment='c0', atp_hydrolysis_id='ATPM_c0', load_default_medias=False
        )
        atp_correction.evaluate_growth_media()
        atp_correction.determine_growth_media()
        atp_correction.apply_growth_media_gapfilling()
        atp_correction.expand_model_to_genome_scale()
        tests = atp_correction.build_tests()

        # Post-ATP test
        post_atp_flux = _biomass_flux(model, carbon_media)

        # Genome-scale gapfilling (only when ATP correction alone did not
        # restore growth)
        added_rxns, added_exch = [], []
        if post_atp_flux < GROWTH_THRESHOLD:
            gapfiller = MSGapfill(model, default_gapfill_templates=[template_gramneg],
                                  test_conditions=tests, default_target='bio1')
            gapfill_result = gapfiller.run_gapfilling(carbon_media)

            # A None or empty result is a failed experiment, not an exception
            if gapfill_result and gapfill_result.get('new'):
                added_rxns, added_exch = integrate_gapfill_solution(template_gramneg, model, gapfill_result)
                post_flux = _biomass_flux(model, carbon_media)
                success = bool(post_flux > GROWTH_THRESHOLD)
            else:
                post_flux = post_atp_flux
                success = False
        else:
            post_flux = post_atp_flux
            success = True

        # Record
        results.append({
            'organism': organism, 'orgId': org_id, 'carbon_source': carbon_source,
            'pre_flux': pre_flux, 'post_atp_flux': post_atp_flux, 'post_flux': post_flux,
            'success': success,
            'num_reactions_added': len(added_rxns), 'num_exchanges_added': len(added_exch)
        })

        for rxn in added_rxns:
            reaction_details.append({
                'organism': organism, 'carbon_source': carbon_source,
                'reaction_id': rxn.id, 'reaction_name': rxn.name,
                'reaction_formula': rxn.build_reaction_string(), 'subsystem': rxn.subsystem
            })

    except Exception as e:
        # Deliberately broad: one bad organism/media pair must not abort a
        # multi-hour run. Keep the exception type so failures can be triaged.
        errors.append({**case, 'error': f"{type(e).__name__}: {str(e)[:100]}"})

elapsed_time = time.time() - start_time
print(f"\nCompleted in {elapsed_time/60:.1f} minutes")
print(f"Experiments: {len(results)}")
print(f"Errors: {len(errors)}")

## Save Results

In [None]:
# Per-experiment outcomes
results_csv = 'results/condition_specific_gapfilling_results.csv'
results_df = pd.DataFrame(results)
results_df.to_csv(results_csv, index=False)
print(f"Saved: {results_csv} ({len(results_df)} rows)")

# One row per reaction added by gap-filling
reactions_csv = 'results/condition_specific_gapfilling_reactions.csv'
reactions_df = pd.DataFrame(reaction_details)
reactions_df.to_csv(reactions_csv, index=False)
print(f"Saved: {reactions_csv} ({len(reactions_df)} rows)")

# The error log is written only when at least one experiment was skipped or failed
if len(errors) > 0:
    errors_csv = 'results/condition_specific_gapfilling_errors.csv'
    errors_df = pd.DataFrame(errors)
    errors_df.to_csv(errors_csv, index=False)
    print(f"Saved: {errors_csv} ({len(errors_df)} rows)")

## Summary Statistics

In [None]:
# Headline numbers for the batch run
print("=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Total experiments: {len(results_df)}")

if results_df.empty:
    # With no completed experiments the frame has no 'success' column, so
    # report that instead of raising a KeyError.
    print("No experiments completed; nothing to summarise.")
    print(f"Errors: {len(errors)}")
else:
    # Coerce to bool so `~` is a logical NOT even if the column was read
    # back or built with a non-boolean dtype.
    success_flags = results_df['success'].astype(bool)
    print(f"Successful: {success_flags.sum()} ({100*success_flags.mean():.1f}%)")
    print(f"Failed: {(~success_flags).sum()}")
    print(f"Errors: {len(errors)}")
    print()
    print("Reactions added statistics:")
    print(results_df['num_reactions_added'].describe())

## Complete

Condition-specific gap-filling complete using proper protocol:
1. ATP correction with Core-V5.2 for each carbon source
2. Genome-scale gap-filling with GramNegModelTemplateV6

Next steps:
1. Analyze which reactions were added
2. Assess biological plausibility
3. Compare to pyruvate gap-filling