# Create Combined Growth Matrix (Integrated Data)

**Purpose**: Integrate gold standard and Fitness Browser data with proper hierarchy

**Data Strategy (Tiered Approach)**:
- **Tier 1 (Gold Standard)**: Supplementary Table S2 - use for BOTH growth and no-growth
- **Tier 2 (Secondary)**: Fitness Browser - use for growth calls ONLY (if experiment exists)
- **Important**: Do NOT assume absence of Fitness Browser data means "No Growth"

**Why This Strategy?**:
- 287 cases where the Supplementary Table shows growth but the Fitness Browser has no data
- All Fitness Browser experiments already pass quality filters (gMed >= 50, mad12 <= 0.5)
- Missing data could mean: never tested, failed experiment, or not published

**Output File**:
- `results/combined_growth_matrix.csv` - Master growth matrix (57 organisms × all carbon sources)

**Values**:
- `Growth` - Confirmed growth (from either source)
- `No Growth` - Confirmed no growth (from Supplementary Table ONLY)
- `""` (empty string; a blank cell in the CSV) - Unknown/not tested

**Last updated**: 2025-10-06

## Setup

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path
import re

print("Imports successful")

## Configuration

In [None]:
# Locations of everything this notebook reads and writes
RESULTS_DIR = Path("results")
DB_PATH = Path("../downloads/feba.db")

# Inputs (created by previous notebooks)
SUPP_TABLE_FILE = RESULTS_DIR / "supplementary_table_s2_clean.csv"
ORGANISM_METADATA_FILE = RESULTS_DIR / "organism_metadata.csv"

# The single file this notebook writes
OUTPUT_FILE = RESULTS_DIR / "combined_growth_matrix.csv"

# Echo the configuration so the run log records which files were used
print("Configuration set")
for label, path in (
    ("Database", DB_PATH),
    ("Supplementary table", SUPP_TABLE_FILE),
    ("Organism metadata", ORGANISM_METADATA_FILE),
    ("Output", OUTPUT_FILE),
):
    print(f"  {label}: {path}")

## Load Tier 1 Data (Gold Standard)

In [None]:
# Tier 1 (gold standard): Supplementary Table S2, indexed by its first column
print("Loading Tier 1 data (Supplementary Table S2)...")
supp_data = pd.read_csv(SUPP_TABLE_FILE, index_col=0)

# Rows are reported as carbon sources, columns as organisms
n_supp_rows, n_supp_cols = supp_data.shape
supp_growth_total = supp_data.eq('Growth').sum().sum()
supp_no_growth_total = supp_data.eq('No Growth').sum().sum()

print("\nLoaded gold standard data:")
print(f"  Shape: {n_supp_rows} carbon sources × {n_supp_cols} organisms")
print(f"  Growth calls: {supp_growth_total}")
print(f"  No Growth calls: {supp_no_growth_total}")

# List every organism column so name mismatches are easy to spot by eye
print("\nOrganisms in supplementary table:")
for organism_name in supp_data.columns:
    print(f"  - {organism_name}")

## Load Organism Metadata

In [None]:
print("\nLoading organism metadata...")
metadata = pd.read_csv(ORGANISM_METADATA_FILE)

print(f"Loaded metadata for {len(metadata)} organisms")
print("\nCreating orgId to Species_Name mapping...")

# Build both lookup directions in one pass over the two metadata columns.
# If a key repeats, the later row overwrites the earlier one.
orgid_to_species = {}
species_to_orgid = {}
for org_id, species in zip(metadata['orgId'], metadata['Species_Name']):
    orgid_to_species[org_id] = species
    species_to_orgid[species] = org_id

print(f"Created mappings for {len(orgid_to_species)} organisms")

## Load Tier 2 Data (Fitness Browser)

In [None]:
print("\nLoading Tier 2 data (Fitness Browser database)...")

# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist, which would only surface later as a confusing
# "no such table: Experiment" error. Fail early with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"Fitness Browser database not found: {DB_PATH}")

# Query carbon source experiments: one row per (organism, carbon source) pair
# together with the number of distinct experiments for that pair.
# Note: ALL experiments in feba.db already pass quality filters
# NOTE(review): DISTINCT is redundant next to GROUP BY; the query text is left
# unchanged on purpose. The intent of the `e.num > 0` filter is not visible
# from this notebook -- confirm against the feba.db schema.
query = """
SELECT DISTINCT
    e.orgId,
    e.condition_1 as carbon_source,
    COUNT(DISTINCT e.expName) as n_experiments
FROM Experiment e
WHERE e.expGroup = 'carbon source'
  AND e.num > 0
GROUP BY e.orgId, e.condition_1
ORDER BY e.orgId, e.condition_1
"""

conn = sqlite3.connect(str(DB_PATH))
try:
    fb_data = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the query raises
    conn.close()

print(f"\nLoaded Fitness Browser data:")
print(f"  Organism-carbon pairs: {len(fb_data):,}")
print(f"  Unique organisms: {fb_data['orgId'].nunique()}")
print(f"  Unique carbon sources: {fb_data['carbon_source'].nunique()}")

print(f"\nNote: All experiments already pass quality filters (gMed >= 50, mad12 <= 0.5)")

## Match Carbon Source Names

In [None]:
def normalize_carbon_name(name):
    """Return a canonical lower-case form of a carbon source name for matching.

    The name is lower-cased and stripped, hydrate and salt/counter-ion
    suffixes are removed, optical-rotation prefixes such as ``D-(+)-`` or
    ``L-(-)-`` are collapsed to the bare ``d-``/``l-`` descriptor, commas and
    periods are dropped, and runs of whitespace are collapsed to one space.

    Args:
        name: Carbon source name. Non-string values are converted with ``str``.

    Returns:
        The normalized string, or ``name`` itself when it is missing
        (``None`` or ``NaN``).
    """
    if pd.isna(name):
        return name

    name = str(name).lower().strip()

    # Remove common suffixes
    name = re.sub(r'\s+(monohydrate|dihydrate|trihydrate|pentahydrate|hexahydrate)', '', name)
    name = re.sub(r'\s+(salt|potassium salt|sodium salt|disodium salt|hydrochloride|hcl)', '', name)

    # Normalize stereochemistry prefixes. Both rotation signs are handled:
    # matching only "(-)" left names such as "D-(+)-Glucose" unable to match
    # the plain "D-Glucose" spelling.
    name = re.sub(r'([dl])-\([+-]\)-', r'\1-', name)

    # Remove punctuation
    name = re.sub(r'[,\.]', '', name)

    # Normalize whitespace
    name = ' '.join(name.split())

    return name

# Create normalized versions for matching
supp_carbons_normalized = {normalize_carbon_name(c): c for c in supp_data.index}
fb_carbons_normalized = {normalize_carbon_name(c): c for c in fb_data['carbon_source'].unique()}

print(f"Normalized carbon source names for matching")
print(f"  Supplementary table: {len(supp_carbons_normalized)} unique names")
print(f"  Fitness Browser: {len(fb_carbons_normalized)} unique names")

## Create Combined Matrix Framework

In [None]:
print("\nCreating combined matrix framework...")

# Columns: every organism listed in the metadata file
all_organisms = metadata['Species_Name'].tolist()

# Rows: union of the carbon sources named by either source. Missing Fitness
# Browser names (NULL condition_1) are dropped first: sorted() cannot order
# None against str, and a nameless row could never be filled anyway.
all_carbon_sources = sorted(
    set(supp_data.index.tolist())
    | set(fb_data['carbon_source'].dropna().unique())
)

# Every cell starts as '' (unknown); the fill steps overwrite known cells
combined_matrix = pd.DataFrame(
    '',
    index=all_carbon_sources,
    columns=all_organisms,
)

print(f"Created empty matrix:")
print(f"  Shape: {combined_matrix.shape[0]} carbon sources × {combined_matrix.shape[1]} organisms")
print(f"  Total cells: {combined_matrix.size:,}")

## Fill Matrix: Tier 1 (Supplementary Table)

In [None]:
print("\nFilling matrix with Tier 1 data (gold standard)...")

# Only organisms that exist as matrix columns can be filled. Track the rest so
# a name mismatch between the two input files is reported instead of silently
# discarding gold-standard data.
matched_organisms = [org for org in supp_data.columns if org in combined_matrix.columns]
unmatched_organisms = [org for org in supp_data.columns if org not in combined_matrix.columns]

n_filled = 0
for organism in matched_organisms:
    for carbon, value in supp_data[organism].items():
        # Blank/missing cells carry no call and must stay "unknown"
        if pd.notna(value) and value != '':
            combined_matrix.at[carbon, organism] = value
            n_filled += 1

print(f"Filled {n_filled:,} cells from Supplementary Table S2")
print(f"  Growth: {(combined_matrix == 'Growth').sum().sum()}")
print(f"  No Growth: {(combined_matrix == 'No Growth').sum().sum()}")

if unmatched_organisms:
    print(f"\nWARNING: {len(unmatched_organisms)} supplementary-table organism(s) "
          f"have no matching matrix column and were skipped:")
    for organism in unmatched_organisms:
        print(f"  - {organism}")

## Fill Matrix: Tier 2 (Fitness Browser - Growth Only)

In [None]:
print("\nFilling matrix with Tier 2 data (Fitness Browser growth calls)...")

# Resolve each Fitness Browser carbon source name to the row it belongs to.
# A name that is not spelled exactly as in the supplementary table but
# normalizes to the same key (e.g. differs only by a hydrate/salt suffix) is
# redirected to the supplementary-table row. Without this, the same compound
# would occupy two rows and a Fitness Browser "Growth" could sit next to a
# gold-standard "No Growth" instead of yielding to it.
supp_carbon_names = set(supp_data.index)
carbon_name_map = {}
for fb_name in fb_data['carbon_source'].dropna().unique():
    if fb_name in supp_carbon_names:
        carbon_name_map[fb_name] = fb_name
    else:
        carbon_name_map[fb_name] = supp_carbons_normalized.get(
            normalize_carbon_name(fb_name), fb_name
        )

n_added = 0
for org_id, fb_name in zip(fb_data['orgId'], fb_data['carbon_source']):
    org_species = orgid_to_species.get(org_id)
    carbon = carbon_name_map.get(fb_name)  # None for missing names

    if carbon is None:
        continue

    if org_species in combined_matrix.columns and carbon in combined_matrix.index:
        # Only add if cell is currently empty (Tier 1 takes precedence)
        if combined_matrix.at[carbon, org_species] == '':
            combined_matrix.at[carbon, org_species] = 'Growth'
            n_added += 1

# Rows that exist only under a redirected Fitness Browser spelling are now
# duplicates of a supplementary-table row. Remove them, but only when they are
# still completely blank so no recorded call can ever be lost.
alias_rows = [
    fb_name
    for fb_name, target in carbon_name_map.items()
    if target != fb_name
    and fb_name in combined_matrix.index
    and (combined_matrix.loc[fb_name] == '').all()
]
if alias_rows:
    combined_matrix.drop(index=alias_rows, inplace=True)
    print(f"Merged {len(alias_rows)} Fitness Browser carbon source name(s) "
          f"into their Supplementary Table spelling")

print(f"Added {n_added:,} additional growth calls from Fitness Browser")
print(f"\nFinal matrix composition:")
print(f"  Growth: {(combined_matrix == 'Growth').sum().sum()}")
print(f"  No Growth: {(combined_matrix == 'No Growth').sum().sum()}")
print(f"  Unknown (blank): {(combined_matrix == '').sum().sum()}")

## Save Combined Matrix

In [None]:
# Write the matrix to disk with pandas' default CSV settings
n_carbon_rows, n_organism_cols = combined_matrix.shape
print(f"\nSaving combined growth matrix to: {OUTPUT_FILE}")
combined_matrix.to_csv(OUTPUT_FILE)
print(f"Saved {n_carbon_rows} carbon sources × {n_organism_cols} organisms")

## Summary Statistics

In [None]:
banner = "=" * 70
print(banner)
print("COMBINED GROWTH MATRIX SUMMARY")
print(banner)

# Coverage is derived from the loaded data rather than hard-coded, so the
# summary cannot drift out of sync with the input files.
supp_n_carbons, supp_n_organisms = supp_data.shape
fb_n_organisms = fb_data['orgId'].nunique()
fb_n_carbons = fb_data['carbon_source'].nunique()

print(f"\nData sources (tiered approach):")
print(f"  Tier 1 (Gold Standard): Supplementary Table S2")
print(f"    - Use for: BOTH growth and no-growth")
print(f"    - Coverage: {supp_n_organisms} organisms × {supp_n_carbons} carbon sources")
print(f"  Tier 2 (Secondary): Fitness Browser")
print(f"    - Use for: Growth calls ONLY")
print(f"    - Coverage: {fb_n_organisms} organisms × {fb_n_carbons} carbon sources")

total_cells = combined_matrix.size

print(f"\nMatrix dimensions:")
print(f"  Organisms: {len(combined_matrix.columns)}")
print(f"  Carbon sources: {len(combined_matrix.index)}")
print(f"  Total cells: {total_cells:,}")

n_growth = (combined_matrix == 'Growth').sum().sum()
n_no_growth = (combined_matrix == 'No Growth').sum().sum()
n_unknown = (combined_matrix == '').sum().sum()


def _pct(count):
    """Return count as a percentage of all cells (0.0 for an empty matrix)."""
    return 100 * count / total_cells if total_cells else 0.0


print(f"\nData composition:")
print(f"  Growth: {n_growth:,} ({_pct(n_growth):.1f}%)")
print(f"  No Growth: {n_no_growth:,} ({_pct(n_no_growth):.1f}%)")
print(f"  Unknown: {n_unknown:,} ({_pct(n_unknown):.1f}%)")

print(f"\nInterpretation:")
print(f"  'Growth' = Confirmed growth (high confidence)")
print(f"  'No Growth' = Confirmed no growth (conservative threshold)")
print(f"  '' (blank) = Unknown/not tested")

print(f"\nFiles created:")
print(f"  {OUTPUT_FILE}")

print(f"\nNext step:")
print(f"  Run 04-analyze-data-discrepancies.ipynb to find conflicts")

print("\n" + banner)