# Baseline Model Training and Evaluation

This notebook trains and evaluates baseline machine learning models:
- Logistic Regression (L1 regularization)
- Random Forest
- XGBoost
- MLP (Deep Learning)
- 1D-CNN (Deep Learning)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
sys.path.append('../src')

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    classification_report, roc_curve
)

from preprocessing import MetabolomicsPreprocessor
from features import FeatureSelector
from models.baseline import BaselineModels
from models.deep import MLP, CNN1D, DeepModelTrainer, MetabolomicsDataset
from torch.utils.data import DataLoader

import warnings
# Suppress all warnings for the rest of the session to keep cell output compact.
# Note: this also hides convergence and deprecation warnings raised during training.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by every figure in this notebook.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})


In [None]:
# Load and preprocess data
data_path = Path('../data/synthetic/synthetic_urine_metabolomics.csv')
df = pd.read_csv(data_path)

# Prepare labels (binary: 0 = 'control', 1 = any other diagnosis_label value)
y = (df['diagnosis_label'] != 'control').astype(int).values
print(f"Class distribution: {np.bincount(y)}")

# Preprocessing, configured for KNN imputation, log2 normalization,
# batch correction and z-score scaling.
# NOTE(review): the preprocessor is still fitted on every sample, so whatever
# statistics it learns also see the future test rows. It receives the full
# DataFrame (including the diagnosis_label column); presumably it ignores the
# label - TODO confirm. Fitting it on the training rows only would be stricter,
# but requires splitting `df` itself and is left for a follow-up.
preprocessor = MetabolomicsPreprocessor(
    imputation_method='knn',
    normalization_method='log2',
    batch_correction=True,
    scale_method='zscore'
)

X = preprocessor.fit_transform(df)
print(f"Preprocessed X shape: {X.shape}")

# Train-test split FIRST, so the label-driven feature selection below never
# sees the held-out samples. Same random_state / stratify as before.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature selection, fitted on the training partition only. Fitting it on all
# samples (as was done previously) lets test labels influence which features
# are kept and inflates every downstream test metric.
feature_selector = FeatureSelector(
    method='univariate',
    n_features=min(200, X.shape[1]),
    variance_threshold=0.01
)

X_train = feature_selector.fit_transform(X_train_full, y_train)
# Assumes FeatureSelector exposes a scikit-learn style transform() that applies
# the already-fitted selection - TODO confirm against src/features.py.
X_test = feature_selector.transform(X_test_full)

# Kept for any later cell that refers to X_selected. It mixes train and test
# rows, so it must not be used to fit or evaluate a model.
X_selected = feature_selector.transform(X)
print(f"Selected features: {X_train.shape[1]}")

# Both partitions must end up with the same selected columns.
assert X_train.shape[1] == X_test.shape[1], "train/test feature count mismatch"

print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


In [None]:
# Train baseline models
baseline = BaselineModels()
results = {}


def evaluate_baseline(models, model_key, X_eval, y_eval):
    """Score one already-trained baseline model on an evaluation set.

    Parameters
    ----------
    models : object
        Model container offering ``predict(model_key, X)`` and
        ``predict_proba(model_key, X)`` (here: the ``BaselineModels`` instance).
    model_key : str
        Key of the trained model, e.g. ``'logistic'``.
    X_eval, y_eval
        Evaluation features and their true binary labels.

    Returns
    -------
    tuple
        ``(y_pred, y_proba, metrics)`` where ``metrics`` is a dict with the
        keys ``'accuracy'``, ``'f1'`` (weighted average) and ``'roc_auc'``.
        Column 1 of ``y_proba`` is used as the positive-class score for ROC AUC.
    """
    y_pred = models.predict(model_key, X_eval)
    y_proba = models.predict_proba(model_key, X_eval)
    metrics = {
        'accuracy': accuracy_score(y_eval, y_pred),
        'f1': f1_score(y_eval, y_pred, average='weighted', zero_division=0),
        'roc_auc': roc_auc_score(y_eval, y_proba[:, 1]),
    }
    return y_pred, y_proba, metrics


print("Training models...")
print("="*50)

# The per-model prediction variables (y_pred_*, y_proba_*) are kept as
# notebook-level names so later cells can reuse them.

# Logistic Regression
print("\n1. Logistic Regression (L1)")
baseline.train_logistic_regression(X_train, y_train, penalty='l1', C=0.1)
y_pred_lr, y_proba_lr, results['logistic'] = evaluate_baseline(
    baseline, 'logistic', X_test, y_test
)
print(f"Accuracy: {results['logistic']['accuracy']:.4f}")

# Random Forest
print("\n2. Random Forest")
baseline.train_random_forest(X_train, y_train, n_estimators=100, max_depth=10)
y_pred_rf, y_proba_rf, results['random_forest'] = evaluate_baseline(
    baseline, 'random_forest', X_test, y_test
)
print(f"Accuracy: {results['random_forest']['accuracy']:.4f}")

# XGBoost
print("\n3. XGBoost")
baseline.train_xgboost(X_train, y_train, n_estimators=100, max_depth=6)
y_pred_xgb, y_proba_xgb, results['xgboost'] = evaluate_baseline(
    baseline, 'xgboost', X_test, y_test
)
print(f"Accuracy: {results['xgboost']['accuracy']:.4f}")
