# Model Training and Comparison

This notebook covers:
- Training multiple ML models
- Model comparison and evaluation
- Hyperparameter tuning
- Model selection and saving

In [None]:
# Import necessary libraries
# Third-party analysis/plotting stack plus the standard-library helpers used below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides any warnings emitted while models are trained.
warnings.filterwarnings('ignore')

# Add src directory to path
# The entry is relative, so it resolves against the kernel's working directory.
# Presumably train_model.py / evaluate_model.py live there -- the imports below
# need this entry to be in place first.
sys.path.append('../src')

from train_model import ModelTrainer, OutcomePredictionTrainer, TreatmentRecommendationTrainer
from evaluate_model import ModelEvaluator, OutcomeEvaluator, TreatmentEvaluator

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Display options
# None removes the column limit, so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

## 1. Load Processed Data

In [None]:
# Feature matrices written by the preprocessing stage
X_train = pd.read_csv('../data/processed/X_train_final.csv')
X_test = pd.read_csv('../data/processed/X_test_final.csv')

# Each target file is read as a frame, then reduced to its single named column
y_train_frame = pd.read_csv('../data/processed/y_train.csv')
y_train = y_train_frame['y_train']
y_test_frame = pd.read_csv('../data/processed/y_test.csv')
y_test = y_test_frame['y_test']

# Quick sanity report: shapes and class balance of the training target
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Target distribution in training set:")
train_class_counts = y_train.value_counts().sort_index()
print(train_class_counts)

# Names of the selected features, kept as a plain Python list
feature_table = pd.read_csv('../data/processed/selected_features.csv')
selected_features = feature_table['selected_features'].tolist()
print(f"\nSelected features ({len(selected_features)}): {selected_features}")

## 2. Outcome Prediction Model Training

In [None]:
banner = "=" * 60
print(banner)
print("TRAINING OUTCOME PREDICTION MODELS")
print(banner)

# Trainer for the outcome prediction task
outcome_trainer = OutcomePredictionTrainer()

# Train every candidate; the first return value maps model name -> CV statistics
# (cv_mean / cv_std are read below), the second maps model name -> model object
outcome_results, outcome_models = outcome_trainer.train_all_models(X_train, y_train)

# One row per model, ranked from best to worst mean CV score
print("\nModel Performance Summary:")
print("-" * 40)
summary_rows = [
    {'Model': name, 'CV_Mean': stats['cv_mean'], 'CV_Std': stats['cv_std']}
    for name, stats in outcome_results.items()
]
results_df = pd.DataFrame(summary_rows, columns=['Model', 'CV_Mean', 'CV_Std'])
results_df = results_df.sort_values('CV_Mean', ascending=False)

print(results_df.to_string(index=False))

# Winner as tracked by the trainer itself
print(f"\nBest Model: {outcome_trainer.best_model_name}")
print(f"Best CV Score: {outcome_trainer.best_score:.4f}")

In [None]:
# Visualize model comparison in a 2x2 grid:
#   top-left     mean CV score per model (error bars = CV std)
#   top-right    distribution of the per-fold CV scores
#   bottom-left  score on the held-out test set
#   bottom-right CV score vs. test score, with a y = x reference line

# `models` fixes ONE ordering that every panel and the printed table reuse.
models = list(outcome_results.keys())
cv_means = [outcome_results[model]['cv_mean'] for model in models]
cv_stds = [outcome_results[model]['cv_std'] for model in models]
cv_scores_data = [outcome_results[model]['cv_scores'] for model in models]

# Look every model up by name rather than iterating outcome_models directly:
# the two dicts are not guaranteed to iterate in the same order, and a
# mismatch would silently attach a score to the wrong model label.
test_scores = [outcome_models[model].score(X_test, y_test) for model in models]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_cv, ax_box, ax_test, ax_scatter = axes.ravel()

# CV scores comparison
ax_cv.bar(models, cv_means, yerr=cv_stds, capsize=5)
ax_cv.set_title('Cross-Validation Scores - Outcome Prediction')
ax_cv.set_ylabel('Accuracy')
ax_cv.tick_params(axis='x', labelrotation=45)
ax_cv.grid(axis='y', alpha=0.3)

# Box plot of CV scores
# Tick labels are set explicitly instead of through boxplot(labels=...), which
# Matplotlib 3.9 deprecated in favour of tick_labels; this form works on both
# old and new releases. Boxes are drawn at positions 1..n.
ax_box.boxplot(cv_scores_data)
ax_box.set_xticks(list(range(1, len(models) + 1)))
ax_box.set_xticklabels(models)
ax_box.set_title('Cross-Validation Score Distribution')
ax_box.set_ylabel('Accuracy')
ax_box.tick_params(axis='x', labelrotation=45)
ax_box.grid(axis='y', alpha=0.3)

# Model performance on test set
ax_test.bar(models, test_scores)
ax_test.set_title('Test Set Performance - Outcome Prediction')
ax_test.set_ylabel('Accuracy')
ax_test.tick_params(axis='x', labelrotation=45)
ax_test.grid(axis='y', alpha=0.3)

# CV vs Test performance
ax_scatter.scatter(cv_means, test_scores, s=100, alpha=0.7)
for model, cv_mean, test_score in zip(models, cv_means, test_scores):
    ax_scatter.annotate(model, (cv_mean, test_score), xytext=(5, 5),
                        textcoords='offset points', fontsize=8)
# y = x reference line spanning BOTH score ranges, so points above/below the
# diagonal (test better/worse than CV) stay comparable across the whole plot.
diag_lo = min(cv_means + test_scores)
diag_hi = max(cv_means + test_scores)
ax_scatter.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'r--', alpha=0.5)
ax_scatter.set_xlabel('CV Score')
ax_scatter.set_ylabel('Test Score')
ax_scatter.set_title('CV vs Test Performance')
ax_scatter.grid(alpha=0.3)

fig.tight_layout()
plt.show()

# Print test scores (same ordering as the plots)
print("\nTest Set Performance:")
for model, test_score in zip(models, test_scores):
    print(f"{model}: {test_score:.4f}")

## 3. Hyperparameter Tuning

In [None]:
print("\n" + "=" * 60)
print("HYPERPARAMETER TUNING")
print("=" * 60)

# Tune whichever algorithm won the cross-validation comparison
best_model_name = outcome_trainer.best_model_name
print(f"Tuning hyperparameters for: {best_model_name}")

# Perform hyperparameter tuning
# NOTE(review): presumably returns None when no search space exists for the
# model -- confirm against train_model.hyperparameter_tuning.
tuned_model = outcome_trainer.hyperparameter_tuning(X_train, y_train, best_model_name)

# Compare with None explicitly instead of relying on truthiness: scikit-learn
# ensembles and pipelines define __len__, so `if tuned_model:` would evaluate
# len(model) (and can raise on an unfitted ensemble) rather than "was a model
# returned?".
if tuned_model is not None:
    # Evaluate tuned model; the second element returned by train_single_model
    # is used as the array of cross-validation scores.
    tuned_cv_scores = outcome_trainer.train_single_model(
        tuned_model, X_train, y_train, f"Tuned {best_model_name}"
    )[1]
    tuned_cv_mean = tuned_cv_scores.mean()
    tuned_test_score = tuned_model.score(X_test, y_test)

    print(f"\nOriginal {best_model_name} CV Score: {outcome_trainer.best_score:.4f}")
    print(f"Tuned {best_model_name} CV Score: {tuned_cv_mean:.4f}")
    print(f"Improvement: {tuned_cv_mean - outcome_trainer.best_score:.4f}")

    print(f"\nOriginal {best_model_name} Test Score: {outcome_models[best_model_name].score(X_test, y_test):.4f}")
    print(f"Tuned {best_model_name} Test Score: {tuned_test_score:.4f}")

    # Promote the tuned model only when it beats the untuned CV score; the
    # decision is made on CV, never on the test set.
    if tuned_cv_mean > outcome_trainer.best_score:
        outcome_trainer.best_model = tuned_model
        outcome_trainer.best_score = tuned_cv_mean
        print("\nUpdated best model to tuned version!")
else:
    print(f"Hyperparameter tuning not available for {best_model_name}")

## 4. Detailed Model Evaluation

In [None]:
section_rule = "=" * 60
print("\n" + section_rule)
print("DETAILED MODEL EVALUATION")
print(section_rule)

# Evaluator used for the outcome prediction task
evaluator = OutcomeEvaluator()

# Display names for the target classes.
# NOTE: hard-coded -- adjust so they match your encoded labels.
outcome_labels = ['Improved', 'Not Improved', 'Stable']

# Full evaluation report for the trainer's current best model on the test set
best_model = outcome_trainer.best_model
report_title = f"Best Model: {outcome_trainer.best_model_name}"
results, cm, feature_imp = evaluator.generate_evaluation_report(
    best_model,
    X_test,
    y_test,
    selected_features,
    outcome_labels,
    report_title,
)

In [None]:
# Plot learning curves for the best model, computed on the training data
print("\nGenerating learning curves...")
evaluator.plot_learning_curves(
    best_model,
    X_train,
    y_train,
    outcome_trainer.best_model_name,
)

In [None]:
# Evaluate every trained outcome model on the held-out test set
print("\nComparing all models on test set...")

# Model name -> evaluation result, in the iteration order of outcome_models
all_results = {
    name: evaluator.evaluate_classification_model(candidate, X_test, y_test, name)
    for name, candidate in outcome_models.items()
}

# Side-by-side comparison plot of the collected results
comparison_df = evaluator.plot_model_comparison(all_results)

## 5. Treatment Recommendation Model Training

In [None]:
# Treatment recommendation uses its own feature/target layout, so it starts
# again from the full engineered dataset rather than the outcome matrices.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

prep_rule = "=" * 60
print("\n" + prep_rule)
print("PREPARING DATA FOR TREATMENT RECOMMENDATION")
print(prep_rule)

# Full engineered dataset
df_engineered = pd.read_csv('../data/processed/engineered_data.csv')

# Trainer for the treatment recommendation task
treatment_trainer = TreatmentRecommendationTrainer()

# Build features/target for this task (outcome is excluded from the features)
X_treatment, y_treatment = treatment_trainer.prepare_treatment_data(df_engineered)

print(f"Treatment recommendation features: {X_treatment.shape[1]}")
print(f"Treatment target distribution:")
treatment_class_counts = y_treatment.value_counts().sort_index()
print(treatment_class_counts)

# Stratified 80/20 split with a fixed seed
split_parts = train_test_split(
    X_treatment,
    y_treatment,
    test_size=0.2,
    random_state=42,
    stratify=y_treatment,
)
X_train_t, X_test_t, y_train_t, y_test_t = split_parts

# Fit the scaler on the training portion only, then apply it to both portions
scaler_t = MinMaxScaler().fit(X_train_t)
X_train_t_scaled = scaler_t.transform(X_train_t)
X_test_t_scaled = scaler_t.transform(X_test_t)

print(f"Treatment training set: {X_train_t_scaled.shape}")
print(f"Treatment test set: {X_test_t_scaled.shape}")

In [None]:
training_rule = "=" * 60
print("\n" + training_rule)
print("TRAINING TREATMENT RECOMMENDATION MODELS")
print(training_rule)

# Train every candidate model on the scaled treatment training data
treatment_results, treatment_models = treatment_trainer.train_all_models(X_train_t_scaled, y_train_t)

# One row per model, ranked from best to worst mean CV score
print("\nTreatment Recommendation Model Performance:")
print("-" * 50)
treatment_rows = [
    {'Model': name, 'CV_Mean': stats['cv_mean'], 'CV_Std': stats['cv_std']}
    for name, stats in treatment_results.items()
]
treatment_results_df = pd.DataFrame(treatment_rows, columns=['Model', 'CV_Mean', 'CV_Std'])
treatment_results_df = treatment_results_df.sort_values('CV_Mean', ascending=False)

print(treatment_results_df.to_string(index=False))

# Winner as tracked by the trainer itself
print(f"\nBest Treatment Model: {treatment_trainer.best_model_name}")
print(f"Best CV Score: {treatment_trainer.best_score:.4f}")

In [None]:
# Evaluate the best treatment recommendation model on the scaled test split
treatment_evaluator = TreatmentEvaluator()

# Display names for the treatment classes.
# NOTE: hard-coded -- adjust so they match your data.
treatment_labels = ['Treatment A', 'Treatment B', 'Treatment C', 'Treatment D']

treatment_report_title = f"Best Treatment Model: {treatment_trainer.best_model_name}"
treatment_eval_results = treatment_evaluator.evaluate_treatment_recommendations(
    treatment_trainer.best_model,
    X_test_t_scaled,
    y_test_t,
    treatment_labels,
    treatment_report_title,
)

## 6. Model Saving

In [None]:
print("\n" + "=" * 60)
print("SAVING TRAINED MODELS")
print("=" * 60)

import json
import os
import pickle

# Make sure the output directory exists before anything is written into it;
# the open(..., 'wb') calls below would otherwise fail on a fresh checkout.
os.makedirs('../models', exist_ok=True)

# Save outcome prediction model
outcome_trainer.save_model(outcome_trainer.best_model, '../models/outcome_prediction_model.pkl')

# Save treatment recommendation model
treatment_trainer.save_model(treatment_trainer.best_model, '../models/treatment_recommendation_model.pkl')

# Save scalers
# FIXME: the outcome data arrives already processed, so the scaler that was
# fitted for outcome prediction is not available in this notebook. The
# TREATMENT scaler is written as a placeholder so the file exists, but it was
# fitted on different features and must not be used to scale outcome inputs.
# Replace it with the real outcome scaler from the preprocessing stage.
with open('../models/outcome_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_t, f)  # placeholder -- see FIXME above

with open('../models/treatment_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_t, f)

# Score each best model once and coerce to built-in float: depending on the
# estimator these can be NumPy scalar types (e.g. float32) that the json module
# refuses to serialise.
outcome_test_score = float(outcome_trainer.best_model.score(X_test, y_test))
treatment_test_score = float(treatment_trainer.best_model.score(X_test_t_scaled, y_test_t))

# Save model metadata
model_metadata = {
    'outcome_model': {
        'name': outcome_trainer.best_model_name,
        'cv_score': float(outcome_trainer.best_score),
        'test_score': outcome_test_score,
        'features': selected_features
    },
    'treatment_model': {
        'name': treatment_trainer.best_model_name,
        'cv_score': float(treatment_trainer.best_score),
        'test_score': treatment_test_score,
        'features': X_treatment.columns.tolist()
    }
}

# Serialise first, then write: if serialisation fails, an existing metadata
# file is left untouched instead of being truncated.
metadata_json = json.dumps(model_metadata, indent=2)
with open('../models/model_metadata.json', 'w') as f:
    f.write(metadata_json)

print("All models and metadata saved successfully!")
print("\nModel files saved:")
print("- ../models/outcome_prediction_model.pkl")
print("- ../models/treatment_recommendation_model.pkl")
print("- ../models/outcome_scaler.pkl")
print("- ../models/treatment_scaler.pkl")
print("- ../models/model_metadata.json")
# Printed rather than raised via warnings.warn so the message stays visible
# even when warnings are filtered.
print("\nWARNING: ../models/outcome_scaler.pkl is a PLACEHOLDER copy of the treatment scaler;")
print("         replace it with the scaler fitted during outcome preprocessing before deployment.")

## 7. Training Summary

In [None]:
# Final recap of both tasks, built from the trainer state and the variables
# created by the earlier cells (run the notebook top to bottom first).
print("\n" + "=" * 70)
print("MODEL TRAINING SUMMARY")
print("=" * 70)

print("\n1. OUTCOME PREDICTION MODEL:")
print(f"   - Best Model: {outcome_trainer.best_model_name}")
print(f"   - Cross-Validation Score: {outcome_trainer.best_score:.4f}")
print(f"   - Test Set Score: {outcome_trainer.best_model.score(X_test, y_test):.4f}")
print(f"   - Features Used: {len(selected_features)}")

print("\n2. TREATMENT RECOMMENDATION MODEL:")
print(f"   - Best Model: {treatment_trainer.best_model_name}")
print(f"   - Cross-Validation Score: {treatment_trainer.best_score:.4f}")
print(f"   - Test Set Score: {treatment_trainer.best_model.score(X_test_t_scaled, y_test_t):.4f}")
print(f"   - Features Used: {X_treatment.shape[1]}")

print("\n3. MODEL COMPARISON:")
print(f"   - Models Trained: {len(outcome_models)}")
print(f"   - Algorithms: {', '.join(outcome_models.keys())}")
# Explicit None check: estimator truthiness is unreliable because scikit-learn
# ensembles and pipelines define __len__.
print(f"   - Hyperparameter Tuning: {'Yes' if tuned_model is not None else 'No'}")

print("\n4. EVALUATION METRICS:")
print("   - Primary Metric: Accuracy")
# NOTE(review): the fold count is stated here, not read from the trainer --
# confirm it matches the CV configuration in train_model.
print("   - Cross-Validation: 5-fold")
print("   - Additional Metrics: Precision, Recall, F1-score")

print("\n5. NEXT STEPS:")
print("   - Models ready for deployment")
print("   - Use recommend.py for making predictions")
print("   - Proceed to evaluation notebook for detailed analysis")

# The emoji was stored as mojibake (UTF-8 bytes decoded as a single-byte
# encoding); restored to the intended character.
print("\nTraining completed successfully! 🎉")