In [None]:
"""
TB Detection Ablation Study
============================
Experiments:
1. Baseline + Metadata (original approach, optimized)
2. Cough Only (no metadata)
3. Augmented + Metadata
4. Augmented Only (no metadata)

All results, models, and metrics saved systematically.
"""

import numpy as np
import pandas as pd
import librosa
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (recall_score, precision_score, roc_auc_score, 
                            confusion_matrix, accuracy_score, f1_score)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import pickle
import json
import os
from tqdm import tqdm
from datetime import datetime
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login, from_pretrained_keras

try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

# ============================================================================
# CONFIGURATION
# ============================================================================
# Dataset layout inside the attached Kaggle input; every other path hangs off
# BASE_PATH so only one line changes if the dataset is mounted elsewhere.
BASE_PATH = "/kaggle/input/tb-audio/Tuberculosis"
AUDIO_PATH = BASE_PATH + "/raw_data/solicited_data"
OUTPUT_DIR = "/kaggle/working/ablation_results"
CLINICAL_PATH = BASE_PATH + "/metadata/CODA_TB_Clinical_Meta_Info.csv"
# One (train_csv, test_csv) pair per fold, for folds 0..2.
FOLDS = [
    tuple(f"{BASE_PATH}/metadata/X_{split}_Fold_{fold}.csv"
          for split in ("train", "test"))
    for fold in range(3)
]

# Make sure the results directory exists before any experiment writes to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Ablation grid: every combination of {clinical metadata on/off} x
# {audio augmentation on/off}, keyed by the experiment's folder name.
EXPERIMENTS = {
    name: {'use_metadata': with_metadata, 'use_augmentation': with_augmentation}
    for name, with_metadata, with_augmentation in (
        ('baseline_meta', True, False),
        ('cough_only', False, False),
        ('augmented_meta', True, True),
        ('augmented_only', False, True),
    )
}

# Run banner.
print("=" * 80, "TB DETECTION - COMPREHENSIVE ABLATION STUDY", "=" * 80, sep="\n")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Timestamp: {datetime.now()}\n")

# ============================================================================
# LOAD HeAR MODEL
# ============================================================================
# Read the Hugging Face token from the Kaggle secret named "HF_TOKEN" and log
# in with it before requesting the model.
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HF_TOKEN"))
# Load the "google/hear" model and keep its 'serving_default' signature.
# The embedding extraction code below calls `serving(x=...)` and reads the
# 'output_0' entry of the result.
model = from_pretrained_keras("google/hear")
serving = model.signatures['serving_default']
print("✓ HeAR model loaded\n")

# ============================================================================
# AUDIO PROCESSING FUNCTIONS
# ============================================================================
def load_audio(fname, sr=16000, target_len=32000):
    """Read one recording from AUDIO_PATH as a fixed-length, peak-scaled clip.

    The file is loaded as mono at `sr` Hz, divided by its peak absolute
    amplitude (plus 1e-8 so a silent file does not divide by zero), and then
    zero-padded at the end or truncated to exactly `target_len` samples.

    Returns:
        The processed clip, or None if loading/processing raised; in that
        case the error is printed and the caller is expected to skip the file.
    """
    try:
        waveform, _ = librosa.load(os.path.join(AUDIO_PATH, fname), sr=sr, mono=True)

        # Peak normalisation over the whole recording (before any trimming).
        peak = np.max(np.abs(waveform))
        waveform = waveform / (peak + 1e-8)

        # Force the clip to the requested length.
        missing = target_len - len(waveform)
        if missing > 0:
            return np.pad(waveform, (0, missing))
        return waveform[:target_len]
    except Exception as e:
        print(f"Error loading {fname}: {e}")
        return None

def augment_audio(audio, sr=16000, rng=None):
    """Return the original clip plus several augmented variants of it.

    Variants, in order: the original, time-stretched copies (rates 0.9 and
    1.1), pitch-shifted copies (-2 and +2 semitones), copies with additive
    Gaussian noise (scales 0.003 and 0.005) and volume-scaled copies (0.8x
    and 1.2x, clipped to [-1, 1]). Every variant has the same number of
    samples as the input.

    Args:
        audio: 1-D array of samples.
        sr: Sample rate in Hz, forwarded to the pitch shifter.
        rng: Optional `numpy.random.Generator` used for the noise. When
            omitted, NumPy's global random state is used (as before), so the
            noise is only reproducible if the caller seeds that state.

    Returns:
        List of arrays. A time-stretch or pitch-shift that raises is reported
        with a printed warning and left out, so the list can be shorter than
        nine entries.
    """
    audio = np.asarray(audio)
    n_samples = len(audio)
    # Keep float32 input as float32: adding float64 noise would otherwise
    # promote those variants and make a stacked batch mixed-precision.
    out_dtype = np.result_type(audio.dtype, np.float32)

    def _match_length(clip):
        """Zero-pad or truncate `clip` to the input length."""
        if len(clip) < n_samples:
            return np.pad(clip, (0, n_samples - len(clip)))
        return clip[:n_samples]

    augmented = [audio]

    # Time stretch (0.9x - 1.1x)
    for rate in (0.9, 1.1):
        try:
            stretched = librosa.effects.time_stretch(audio, rate=rate)
        except Exception as exc:
            # Best effort: skip this variant, but say so instead of hiding it.
            print(f"Warning: time_stretch(rate={rate}) failed, skipping: {exc}")
            continue
        augmented.append(_match_length(stretched))

    # Pitch shift (-2 to +2 semitones)
    for n_steps in (-2, 2):
        try:
            shifted = librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)
        except Exception as exc:
            print(f"Warning: pitch_shift(n_steps={n_steps}) failed, skipping: {exc}")
            continue
        augmented.append(_match_length(shifted))

    # Additive Gaussian noise
    for noise_level in (0.003, 0.005):
        if rng is None:
            noise = np.random.randn(n_samples)
        else:
            noise = rng.standard_normal(n_samples)
        noisy = (audio + noise_level * noise).astype(out_dtype, copy=False)
        augmented.append(noisy)

    # Volume scaling, clipped back into the valid amplitude range
    for scale in (0.8, 1.2):
        augmented.append(np.clip(audio * scale, -1, 1))

    return augmented

def extract_embeddings_batch(df, exp_name, use_augmentation=False, batch_size=32):
    """Run every recording listed in `df` through the HeAR model in batches.

    Each row's 'filename' is loaded with `load_audio`; files that fail to load
    are skipped. With `use_augmentation` each clip is expanded through
    `augment_audio`, and every variant inherits the row's 'tb_status' label
    and filename. Clips are sent to `serving` once `batch_size` of them have
    accumulated, plus one final partial batch.

    Returns:
        (embeddings array, labels array, list of filenames), aligned by index.
    """
    all_embeddings, all_labels, all_files = [], [], []
    pending_audio, pending_labels, pending_files = [], [], []

    def _flush():
        """Embed the pending clips and move them to the output lists."""
        vectors = serving(x=np.array(pending_audio))['output_0'].numpy()
        all_embeddings.extend(vectors)
        all_labels.extend(pending_labels)
        all_files.extend(pending_files)
        pending_audio.clear()
        pending_labels.clear()
        pending_files.clear()

    for _, record in tqdm(df.iterrows(), total=len(df), desc=f"{exp_name}"):
        clip = load_audio(record['filename'])
        if clip is None:
            continue

        variants = augment_audio(clip) if use_augmentation else [clip]

        for variant in variants:
            pending_audio.append(variant)
            pending_labels.append(record['tb_status'])
            pending_files.append(record['filename'])

            if len(pending_audio) >= batch_size:
                _flush()

    # Whatever is left over after the last full batch
    if pending_audio:
        _flush()

    return np.array(all_embeddings), np.array(all_labels), all_files

# ============================================================================
# CLINICAL FEATURES
# ============================================================================
def _clinical_feature_row(r, medians, symptom_cols):
    """Build the six clinical features for one participant record `r`."""
    def value_or_median(col):
        # Missing measurements fall back to the cohort median for that column.
        return r[col] if pd.notna(r[col]) else medians[col]

    age = value_or_median('age')
    height = value_or_median('height')
    weight = value_or_median('weight')
    # Height is divided by 100 before squaring (i.e. treated as centimetres);
    # 22.0 is the fallback BMI when height is not a positive number.
    bmi = weight / ((height / 100) ** 2) if height > 0 else 22.0
    heart_rate = value_or_median('heart_rate')
    temperature = value_or_median('temperature')
    sex = 1 if r['sex'] == 'Male' else 0
    symptoms = sum(1 for col in symptom_cols if r[col] == 'Yes')

    return [age, bmi, heart_rate, temperature, sex, symptoms]


def extract_clinical_features(filenames, df_original, clinical_df):
    """Extract clinical metadata features, one row per entry of `filenames`.

    Each filename is mapped to a participant through `df_original`
    ('filename' -> 'participant') and that participant's first record in
    `clinical_df` is turned into
    [age, bmi, heart_rate, temperature, sex (1 = 'Male'), symptom count].
    Files with no matching participant record get the median age, heart rate
    and temperature, a BMI of 22.0, and 0 for sex and symptom count.

    Returns:
        Float array of shape (len(filenames), 6); (0, 6) for empty input so
        that column slicing downstream keeps working.
    """
    file_to_participant = dict(zip(df_original['filename'],
                                   df_original['participant']))

    # Column medians used to impute missing numeric values
    medians = clinical_df.select_dtypes(include=[np.number]).median()
    symptom_cols = ['hemoptysis', 'weight_loss', 'fever', 'night_sweats']
    default_row = [medians['age'], 22.0, medians['heart_rate'],
                   medians['temperature'], 0, 0]

    # Index the clinical table once (first record per participant) instead of
    # running a full boolean-mask scan of the frame for every filename.
    first_records = (clinical_df
                     .dropna(subset=['participant'])
                     .drop_duplicates(subset='participant', keep='first'))
    record_by_participant = {r['participant']: r
                             for _, r in first_records.iterrows()}

    # The same participant recurs for every recording and every augmented
    # copy, so each feature row is computed once and reused.
    row_cache = {}
    features = []
    for fname in filenames:
        participant_id = file_to_participant.get(fname)
        if participant_id not in row_cache:
            record = record_by_participant.get(participant_id)
            if record is None:
                row_cache[participant_id] = default_row
            else:
                row_cache[participant_id] = _clinical_feature_row(
                    record, medians, symptom_cols)
        features.append(row_cache[participant_id])

    return np.array(features, dtype=float).reshape(-1, 6)

def build_features(embeddings, filenames, df_original, clinical_df, use_metadata):
    """Assemble the model's feature matrix from embeddings and clinical data.

    Without metadata the embeddings are returned untouched. With metadata the
    columns are, in order: the embeddings, the six clinical features, six
    per-row embedding statistics (mean, std, max, min, 25th and 75th
    percentile), three clinical interaction terms and two embedding-clinical
    interaction terms.

    Note that the embedding summary statistics are only added on the metadata
    path, so the "no metadata" experiments also run without them.
    """
    if not use_metadata:
        return embeddings

    clinical = extract_clinical_features(filenames, df_original, clinical_df)

    # Clinical columns used in the interaction terms (kept 2-D for broadcasting)
    age = clinical[:, 0:1]
    bmi = clinical[:, 1:2]
    symptoms = clinical[:, 5:6]

    # Per-row summary statistics of the embedding vector
    per_row = dict(axis=1, keepdims=True)
    emb_mean = embeddings.mean(**per_row)
    emb_std = embeddings.std(**per_row)
    emb_max = embeddings.max(**per_row)
    emb_min = embeddings.min(**per_row)
    emb_q25 = np.percentile(embeddings, 25, **per_row)
    emb_q75 = np.percentile(embeddings, 75, **per_row)

    column_groups = [
        embeddings,
        clinical,
        emb_mean, emb_std, emb_max, emb_min, emb_q25, emb_q75,
        age * bmi,           # age x BMI
        symptoms * bmi,      # symptom count x BMI
        age * symptoms,      # age x symptom count
        emb_mean * age,      # embedding mean x age
        emb_std * symptoms,  # embedding std x symptom count
    ]
    return np.concatenate(column_groups, axis=1)

# ============================================================================
# MODEL BUILDING
# ============================================================================
def create_ensemble():
    """Build the soft-voting ensemble used by every experiment.

    Members are a random forest, gradient boosting and logistic regression
    with voting weights 2, 2 and 1; an XGBoost classifier with weight 2 is
    added when the xgboost package was importable (HAS_XGB).
    """
    # (name, estimator, voting weight)
    members = [
        ('rf', RandomForestClassifier(
            n_estimators=300, max_depth=12, min_samples_split=8,
            class_weight={0: 1, 1: 3}, random_state=42, n_jobs=-1
        ), 2),
        ('gb', GradientBoostingClassifier(
            n_estimators=200, learning_rate=0.03, max_depth=8,
            subsample=0.85, random_state=42
        ), 2),
        ('lr', LogisticRegression(
            max_iter=1500, C=0.3, class_weight={0: 1, 1: 4},
            solver='saga', random_state=42
        ), 1),
    ]

    if HAS_XGB:
        members.append(('xgb', XGBClassifier(
            n_estimators=200, learning_rate=0.03, max_depth=8,
            scale_pos_weight=3, subsample=0.85, colsample_bytree=0.85,
            use_label_encoder=False, eval_metric='logloss',
            random_state=42, n_jobs=-1
        ), 2))

    return VotingClassifier(
        estimators=[(name, estimator) for name, estimator, _ in members],
        voting='soft',
        weights=[weight for _, _, weight in members],
    )

# ============================================================================
# EVALUATION METRICS
# ============================================================================
def compute_metrics(y_true, y_prob, threshold=0.5):
    """Compute binary classification metrics at a given probability threshold.

    Args:
        y_true: Ground-truth labels, 0 (negative) or 1 (positive). Any
            array-like is accepted.
        y_prob: Predicted probability of the positive class, same length.
        threshold: A sample is predicted positive when its probability is
            greater than or equal to this value.

    Returns:
        Dict with accuracy, sensitivity, specificity, precision, npv, f1, auc
        and the confusion counts tp/tn/fp/fn as plain ints. A ratio whose
        denominator is zero is reported as 0.

    Raises:
        ValueError: propagated from `roc_auc_score` when the AUC cannot be
            computed for the given labels (e.g. only one class in `y_true`).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = (y_prob >= threshold).astype(int)

    # Pin the label set so the matrix is always 2x2. Without it, data in which
    # only one class occurs yields a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'sensitivity': sensitivity,
        'specificity': specificity,
        'precision': precision,
        'npv': npv,
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'auc': roc_auc_score(y_true, y_prob),
        'tp': int(tp), 'tn': int(tn), 'fp': int(fp), 'fn': int(fn)
    }

def find_optimal_threshold(y_true, y_prob, target_sensitivity=0.90):
    """Find threshold that achieves target sensitivity with max specificity."""
    best_threshold = 0.5
    best_specificity = 0
    
    for threshold in np.sort(y_prob):
        y_pred = (y_prob >= threshold).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        if sensitivity < target_sensitivity:
            break
        
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        if specificity > best_specificity:
            best_specificity = specificity
            best_threshold = threshold
    
    return best_threshold

# ============================================================================
# TRAINING PIPELINE
# ============================================================================
def _scale_and_balance(X_train, y_train):
    """Standardize features, then rebalance classes with SMOTE + undersampling.

    Returns (fitted scaler, balanced X, balanced y). The scaler is fitted on
    the data passed in only, so it can be reused to transform held-out data.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    resampler = ImbPipeline([
        ('smote', SMOTE(sampling_strategy=0.8, k_neighbors=5, random_state=42)),
        ('under', RandomUnderSampler(sampling_strategy=1.0, random_state=42)),
    ])
    X_balanced, y_balanced = resampler.fit_resample(X_scaled, y_train)
    return scaler, X_balanced, y_balanced


def _save_pickle(obj, path):
    """Pickle `obj` to `path`."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def train_experiment(exp_name, config, clinical_df):
    """Train a single experiment configuration across all folds.

    For every (train, test) pair in FOLDS: extract embeddings (augmenting the
    training clips only if the config asks for it), build features, scale and
    rebalance the training data, fit the ensemble, score the test split, and
    save the fold's model and scaler. Test predictions from all folds are then
    pooled for the overall metrics and threshold search, a final model is
    trained on the union of the fold training sets, and everything is written
    under OUTPUT_DIR/<exp_name>.

    Args:
        exp_name: Experiment key; also the name of the output sub-folder.
        config: Dict with boolean 'use_metadata' and 'use_augmentation'.
        clinical_df: Clinical metadata table passed on to `build_features`.

    Returns:
        The results dict that is also written to results.json.
    """
    print(f"\n{'='*80}")
    print(f"EXPERIMENT: {exp_name.upper()}")
    print(f"Config: {config}")
    print(f"{'='*80}\n")

    exp_dir = os.path.join(OUTPUT_DIR, exp_name)
    os.makedirs(exp_dir, exist_ok=True)

    all_y_true, all_y_prob = [], []
    fold_results = []

    for fold_idx, (train_path, test_path) in enumerate(FOLDS):
        print(f"\n{'─'*80}")
        print(f"FOLD {fold_idx}")
        print(f"{'─'*80}")

        df_train = pd.read_csv(train_path)
        df_test = pd.read_csv(test_path)

        # Extract embeddings
        X_train_emb, y_train, files_train = extract_embeddings_batch(
            df_train, f"{exp_name}_F{fold_idx}_train",
            use_augmentation=config['use_augmentation']
        )

        X_test_emb, y_test, files_test = extract_embeddings_batch(
            df_test, f"{exp_name}_F{fold_idx}_test",
            use_augmentation=False  # Never augment test data
        )

        # Build features
        X_train = build_features(X_train_emb, files_train, df_train,
                                 clinical_df, config['use_metadata'])
        X_test = build_features(X_test_emb, files_test, df_test,
                                clinical_df, config['use_metadata'])

        # Standardize and balance the training data; the test split is only
        # transformed with the scaler fitted on the training split.
        scaler, X_train_balanced, y_train_balanced = _scale_and_balance(
            X_train, y_train
        )
        X_test_scaled = scaler.transform(X_test)

        print(f"Balanced: TB- = {(y_train_balanced == 0).sum()}, "
              f"TB+ = {(y_train_balanced == 1).sum()}")

        # Train model
        clf = create_ensemble()
        clf.fit(X_train_balanced, y_train_balanced)

        # Predict
        y_prob = clf.predict_proba(X_test_scaled)[:, 1]

        # Store predictions
        all_y_true.extend(y_test)
        all_y_prob.extend(y_prob)

        # Fold metrics
        fold_metrics = compute_metrics(y_test, y_prob)
        fold_results.append(fold_metrics)

        print(f"Fold {fold_idx} AUC: {fold_metrics['auc']:.4f}")

        # Save fold model
        _save_pickle(clf, os.path.join(exp_dir, f'fold_{fold_idx}_model.pkl'))
        _save_pickle(scaler, os.path.join(exp_dir, f'fold_{fold_idx}_scaler.pkl'))

    # Aggregate results
    y_true_all = np.array(all_y_true)
    y_prob_all = np.array(all_y_prob)

    overall_metrics = compute_metrics(y_true_all, y_prob_all)

    # Find optimal thresholds.
    # NOTE: each threshold is chosen and then evaluated on the same pooled
    # out-of-fold predictions, so the metrics reported at these thresholds are
    # optimistic estimates.
    thresholds = {}
    for target_sens in [0.85, 0.90, 0.95]:
        thresh = find_optimal_threshold(y_true_all, y_prob_all, target_sens)
        metrics_at_thresh = compute_metrics(y_true_all, y_prob_all, thresh)
        thresholds[f'sens_{int(target_sens*100)}'] = {
            'threshold': float(thresh),
            'metrics': metrics_at_thresh
        }

    # Train final model on all data
    print(f"\n{'─'*80}")
    print("Training final model on all folds...")
    print(f"{'─'*80}")

    df_all_train = pd.concat(
        [pd.read_csv(train_path) for train_path, _ in FOLDS],
        ignore_index=True
    )
    # A recording listed in more than one fold's training CSV would otherwise
    # be embedded and trained on once per listing, skewing the scaler and
    # giving SMOTE identical neighbours. Keep one row per file.
    n_rows_concat = len(df_all_train)
    df_all_train = (df_all_train
                    .drop_duplicates(subset='filename')
                    .reset_index(drop=True))
    print(f"Final training set: {len(df_all_train)} unique recordings "
          f"({n_rows_concat - len(df_all_train)} duplicate rows removed)")

    X_all_emb, y_all, files_all = extract_embeddings_batch(
        df_all_train, f"{exp_name}_final",
        use_augmentation=config['use_augmentation']
    )

    X_all = build_features(X_all_emb, files_all, df_all_train,
                           clinical_df, config['use_metadata'])

    scaler_final, X_all_balanced, y_all_balanced = _scale_and_balance(
        X_all, y_all
    )

    clf_final = create_ensemble()
    clf_final.fit(X_all_balanced, y_all_balanced)

    # Save final model
    _save_pickle(clf_final, os.path.join(exp_dir, 'final_model.pkl'))
    _save_pickle(scaler_final, os.path.join(exp_dir, 'final_scaler.pkl'))

    # Save results
    results = {
        'experiment': exp_name,
        'config': config,
        'overall_metrics': overall_metrics,
        'fold_metrics': fold_results,
        'avg_fold_auc': float(np.mean([f['auc'] for f in fold_results])),
        'optimal_thresholds': thresholds,
        'timestamp': str(datetime.now())
    }

    results_path = os.path.join(exp_dir, 'results.json')
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)

    # Print summary
    print(f"\n{'='*80}")
    print(f"EXPERIMENT {exp_name.upper()} - SUMMARY")
    print(f"{'='*80}")
    print(f"Overall AUC: {overall_metrics['auc']:.4f}")
    print(f"Avg Fold AUC: {results['avg_fold_auc']:.4f}")
    print(f"Sensitivity: {overall_metrics['sensitivity']:.4f}")
    print(f"Specificity: {overall_metrics['specificity']:.4f}")
    print(f"Precision: {overall_metrics['precision']:.4f}")
    print(f"NPV: {overall_metrics['npv']:.4f}")
    print(f"\nOptimal Thresholds:")
    for target, data in thresholds.items():
        m = data['metrics']
        print(f"  {target}: T={data['threshold']:.4f} | "
              f"Sens={m['sensitivity']:.3f} | Spec={m['specificity']:.3f} | "
              f"NPV={m['npv']:.3f}")
    print(f"{'='*80}\n")

    return results

# ============================================================================
# MAIN EXECUTION
# ============================================================================
def main():
    """Run every configured experiment and write the comparative summary.

    Each entry of EXPERIMENTS is trained via `train_experiment`; a failing
    experiment is reported with its traceback and skipped so the remaining
    ones still run. The collected results are written to
    ablation_summary.json, and the overall metrics of each experiment are
    printed as a table and saved to comparison.csv.
    """
    import traceback

    rule = "=" * 80
    clinical_df = pd.read_csv(CLINICAL_PATH)

    all_results = {}
    for exp_name, config in EXPERIMENTS.items():
        try:
            all_results[exp_name] = train_experiment(exp_name, config, clinical_df)
        except Exception as e:
            print(f"\n❌ ERROR in {exp_name}: {e}")
            traceback.print_exc()

    # Save comparative summary
    with open(os.path.join(OUTPUT_DIR, 'ablation_summary.json'), 'w') as f:
        json.dump(all_results, f, indent=2)

    # Create comparison table
    print(f"\n{rule}")
    print("ABLATION STUDY - COMPARATIVE RESULTS")
    print(f"{rule}\n")

    # (table column, key in overall_metrics)
    metric_columns = [
        ('AUC', 'auc'),
        ('Sensitivity', 'sensitivity'),
        ('Specificity', 'specificity'),
        ('Precision', 'precision'),
        ('NPV', 'npv'),
        ('F1', 'f1'),
    ]
    comparison = [
        {
            'Experiment': exp_name,
            **{column: f"{results['overall_metrics'][key]:.4f}"
               for column, key in metric_columns},
        }
        for exp_name, results in all_results.items()
        if 'overall_metrics' in results
    ]

    if comparison:
        df_comparison = pd.DataFrame(comparison)
        print(df_comparison.to_string(index=False))

        csv_path = os.path.join(OUTPUT_DIR, 'comparison.csv')
        df_comparison.to_csv(csv_path, index=False)
        print(f"\n✓ Comparison saved to: {csv_path}")

    print(f"\n{rule}")
    print("✅ ABLATION STUDY COMPLETE")
    print(rule)
    print(f"All results saved to: {OUTPUT_DIR}")
    print("  - Individual experiment folders with models and scalers")
    print("  - ablation_summary.json (comprehensive results)")
    print("  - comparison.csv (quick comparison table)")
    print(f"{rule}\n")

# Entry point: run the whole ablation study, but only when this code is
# executed as the main module rather than imported from elsewhere.
if __name__ == "__main__":
    main()

In [4]:
import os

# Debug helper: list the names of environment variables that mention Kaggle
# user secrets. This shows environment variable names only, not the labels
# of the individual secrets attached to the notebook.
secrets_list = list(filter(lambda name: 'KAGGLE_USER_SECRETS' in name, os.environ))
print(f"Attached secrets: {secrets_list}")

Attached secrets: ['KAGGLE_USER_SECRETS_TOKEN']


In [3]:
from kaggle_secrets import UserSecretsClient

# Sanity check: confirm the "HF_TOKEN" secret can be read in this session.
# Only a status line is printed; the token value itself is never shown.
user_secrets = UserSecretsClient()

try:
    hf_token = user_secrets.get_secret("HF_TOKEN")
except Exception as err:
    print(f"Still failing. Error: {err}")
else:
    print("Success! Token retrieved.")

Success! Token retrieved.
