# 02: Model Training

**Student**: Keisuke Nishioka (Matrikelnummer: 10081049)  
**Project**: Stability and Faithfulness Analysis of SHAP Explanations

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
from models import (
    train_xgboost, train_random_forest, train_logistic_regression,
    get_task_type, save_model
)
import config

## Load Processed Data

In [None]:
# Read the pre-split feature tables (train first, then test) as DataFrames.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/X_train.csv',
                     '../data/processed/X_test.csv')
)

# Read the targets the same way; squeeze() collapses a single-column frame
# into a Series (presumably each target file holds exactly one column).
y_train, y_test = (
    pd.read_csv(csv_path).squeeze()
    for csv_path in ('../data/processed/y_train.csv',
                     '../data/processed/y_test.csv')
)

# Quick sanity check on the split sizes.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## Train Models with Multiple Random Seeds

In [None]:
# Determine task type from the training targets using the project helper.
# The resulting `task` value is forwarded to the XGBoost and Random Forest
# trainers and to evaluate_model() in the cells below.
task = get_task_type(y_train)
print(f"Task type: {task}")

### XGBoost Models

In [None]:
# One fitted XGBoost model per random seed, keyed by that seed.
xgboost_models = {}

for seed in config.RANDOM_SEEDS:
    print(f"Training XGBoost with seed {seed}...")
    model = train_xgboost(X_train, y_train, task=task, random_state=seed)

    # Keep the model in memory for evaluation, then persist it to disk.
    xgboost_models[seed] = model
    save_model(model, f'../results/models/xgboost_seed_{seed}.pkl')

print(f"\nTrained {len(xgboost_models)} XGBoost models")

### Random Forest Models

In [None]:
# One fitted Random Forest model per random seed, keyed by that seed.
rf_models = {}

for seed in config.RANDOM_SEEDS:
    print(f"Training Random Forest with seed {seed}...")
    model = train_random_forest(X_train, y_train, task=task, random_state=seed)

    # Keep the model in memory for evaluation, then persist it to disk.
    rf_models[seed] = model
    save_model(model, f'../results/models/random_forest_seed_{seed}.pkl')

print(f"\nTrained {len(rf_models)} Random Forest models")

### Logistic Regression Models

In [None]:
# One fitted Logistic Regression model per random seed, keyed by that seed.
# NOTE(review): unlike the other two trainers, no `task` argument is passed
# here -- confirm this model is intended when the task is not classification.
lr_models = {}

for seed in config.RANDOM_SEEDS:
    print(f"Training Logistic Regression with seed {seed}...")
    model = train_logistic_regression(X_train, y_train, random_state=seed)

    # Keep the model in memory for evaluation, then persist it to disk.
    lr_models[seed] = model
    save_model(model, f'../results/models/logistic_regression_seed_{seed}.pkl')

print(f"\nTrained {len(lr_models)} Logistic Regression models")

## Model Performance Evaluation

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

def evaluate_model(model, X_test, y_test, task='classification'):
    """Score a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``. When it also exposes
            ``predict_proba``, a ROC AUC is reported for classification.
        X_test: Features passed to the model's prediction methods.
        y_test: True targets aligned with ``X_test``.
        task: ``'classification'`` selects accuracy (and AUC); any other
            value is scored as regression.

    Returns:
        dict: ``{'accuracy': ..., 'auc': ...}`` for classifiers with
        ``predict_proba``, ``{'accuracy': ...}`` for classifiers without it,
        and ``{'mse': ..., 'r2': ...}`` otherwise.
    """
    y_pred = model.predict(X_test)

    if task != 'classification':
        from sklearn.metrics import mean_squared_error, r2_score
        return {
            'mse': mean_squared_error(y_test, y_pred),
            'r2': r2_score(y_test, y_pred),
        }

    metrics = {'accuracy': accuracy_score(y_test, y_pred)}
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)
        if y_proba.shape[1] > 2:
            # Multiclass: roc_auc_score rejects a single probability column
            # here, so pass the full matrix and average one-vs-rest AUCs.
            # `labels` fixes the column order even if the test split lacks
            # one of the classes.
            metrics['auc'] = roc_auc_score(
                y_test,
                y_proba,
                multi_class='ovr',
                labels=getattr(model, 'classes_', None),
            )
        else:
            # Binary: score with the probability of the second (positive)
            # class column.
            metrics['auc'] = roc_auc_score(y_test, y_proba[:, 1])
    return metrics

# Score every (model family, seed) pair on the test split and collect one
# DataFrame per family: one row per seed, metric columns followed by 'seed'.
results = {}
for model_name, models_dict in [
    ('XGBoost', xgboost_models),
    ('Random Forest', rf_models),
    ('Logistic Regression', lr_models),
]:
    results[model_name] = pd.DataFrame([
        {**evaluate_model(model, X_test, y_test, task=task), 'seed': seed}
        for seed, model in models_dict.items()
    ])

# Summarize the spread of each metric across seeds.
# NOTE(review): describe() runs on the whole frame, so the 'seed' column is
# summarized alongside the metrics.
for model_name, df in results.items():
    print(f"\n{model_name} Performance:")
    print(df.describe())