# Supervised Learning - Classification Models

This notebook covers:
- Training multiple classification models
- Model evaluation and comparison
- Performance metrics analysis
- ROC curves and AUC scores


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, roc_auc_score, confusion_matrix, 
                           classification_report, roc_curve)
import warnings
# Silence only routine library deprecation chatter. A blanket
# filterwarnings('ignore') would also hide scikit-learn's ConvergenceWarning
# and UndefinedMetricWarning, which point at real problems with the models
# and metrics computed further down.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set style for better plots
# 'seaborn-v0_8' only exists in matplotlib >= 3.6; older releases ship the
# same style sheet under the name 'seaborn'.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")


In [None]:
# Read the feature matrix and the target produced by an earlier step
features_path = 'data/X_selected.csv'
target_path = 'data/y_target.csv'

X_selected = pd.read_csv(features_path)
# Flatten the target frame into a 1-D array
# (presumably the file holds a single label column - confirm against its writer)
y = pd.read_csv(target_path).to_numpy().ravel()

print("Data loaded successfully!")
print(f"Features shape: {X_selected.shape}")
print(f"Target shape: {y.shape}")
print(f"Selected features: {X_selected.columns.tolist()}")

# Hold out 20% of the rows for testing; stratify so both parts keep the
# class proportions of y, and fix the seed so the split is repeatable
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, **split_options)

print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
# np.bincount counts occurrences of each non-negative integer label
print(f"Train target distribution: {np.bincount(y_train)}")
print(f"Test target distribution: {np.bincount(y_test)}")


In [None]:
# Candidate classifiers, each seeded so repeated runs give the same fit
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    # probability=True is what gives SVC a predict_proba method
    'SVM': SVC(probability=True, random_state=42),
}

# Metrics computed from hard label predictions, keyed by the name under
# which each score is stored in `results`
label_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# Per-model test-set scores/predictions and the fitted estimators
results = {}
trained_models = {}

print("Training and Evaluating Models:")
print("=" * 35)

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split, then predict labels for the test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Second probability column is used as the score for AUC;
    # models without predict_proba get no probabilities
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = None

    scores = {key: metric(y_test, y_pred) for key, metric in label_metrics.items()}
    auc = None if y_pred_proba is None else roc_auc_score(y_test, y_pred_proba)

    # Keep scores, predictions and probabilities together for later cells
    results[name] = {
        **scores,
        'auc': auc,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
    }
    trained_models[name] = model

    # Report the test-set scores for this model
    print(f"Accuracy:  {scores['accuracy']:.4f}")
    print(f"Precision: {scores['precision']:.4f}")
    print(f"Recall:    {scores['recall']:.4f}")
    print(f"F1-Score:  {scores['f1']:.4f}")
    if auc is not None:
        print(f"AUC:       {auc:.4f}")


In [None]:
# Create performance comparison table
print("\nModel Performance Comparison:")
print("=" * 35)

# One row per model, one column per metric.
# A missing AUC (None) is stored as NaN rather than the string 'N/A': a single
# string would turn the whole transposed table into object dtype, which makes
# round() a no-op and makes idxmax() below fail.
results_df = pd.DataFrame({
    name: {
        'Accuracy': scores['accuracy'],
        'Precision': scores['precision'],
        'Recall': scores['recall'],
        'F1-Score': scores['f1'],
        'AUC': np.nan if scores['auc'] is None else scores['auc'],
    } for name, scores in results.items()
}).T.astype(float)

# NaN is still shown as 'N/A' in the printed table
print(results_df.round(4).to_string(na_rep='N/A'))

# Find best model by F1-score
# NOTE(review): the F1 used here was measured on the test split, so the test
# scores of the chosen model are an optimistic estimate.
best_model_name = results_df['F1-Score'].idxmax()
print(f"\nBest model by F1-Score: {best_model_name}")
print(f"Best F1-Score: {results_df.loc[best_model_name, 'F1-Score']:.4f}")


In [None]:
# Visualize model performance on a 2x2 grid:
#   [0, 0] grouped metric bars   [0, 1] confusion matrix of the best model
#   [1, 0] ROC curves            [1, 1] Random Forest feature importance
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Performance metrics comparison (one group per metric, one bar per model)
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
model_names = list(results_df.index)
x = np.arange(len(metrics))
# Split a 0.8-wide slot per metric evenly between the models so groups never
# run into each other, however many models are compared (0.2 for four models)
width = 0.8 / max(len(model_names), 1)

for i, model_name in enumerate(model_names):
    values = [results_df.loc[model_name, metric] for metric in metrics]
    axes[0, 0].bar(x + i * width, values, width, label=model_name)

axes[0, 0].set_xlabel('Metrics')
axes[0, 0].set_ylabel('Score')
axes[0, 0].set_title('Model Performance Comparison')
# Centre each tick label under its group of bars
axes[0, 0].set_xticks(x + width * (len(model_names) - 1) / 2)
axes[0, 0].set_xticklabels(metrics)
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Confusion matrix of the best model (by F1-score)
# Only one panel is available, so exactly one matrix is drawn and it is always
# given an explicit ax. With ax=None seaborn draws on the current axes, which
# after plt.subplots is the last subplot created (the feature-importance panel).
cm = confusion_matrix(y_test, results[best_model_name]['predictions'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 1])
axes[0, 1].set_title(f'Confusion Matrix - {best_model_name}')
axes[0, 1].set_xlabel('Predicted')
axes[0, 1].set_ylabel('Actual')

# 3. ROC Curves (only for models that produced probabilities)
for name, result in results.items():
    if result['probabilities'] is not None:
        fpr, tpr, _ = roc_curve(y_test, result['probabilities'])
        axes[1, 0].plot(fpr, tpr, label=f"{name} (AUC = {result['auc']:.3f})")

# Diagonal = expected curve of a classifier that guesses at random
axes[1, 0].plot([0, 1], [0, 1], 'k--', label='Random Classifier')
axes[1, 0].set_xlabel('False Positive Rate')
axes[1, 0].set_ylabel('True Positive Rate')
axes[1, 0].set_title('ROC Curves')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Feature importance (for tree-based models)
if 'Random Forest' in trained_models:
    rf_model = trained_models['Random Forest']

    # Sort ascending so the most important feature ends up at the top of the
    # horizontal bar chart
    importance_df = pd.DataFrame({
        'feature': X_selected.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=True)

    axes[1, 1].barh(importance_df['feature'], importance_df['importance'])
    axes[1, 1].set_xlabel('Importance')
    axes[1, 1].set_title('Random Forest Feature Importance')
    axes[1, 1].grid(True, alpha=0.3)
else:
    # Nothing to show: hide the panel instead of leaving an empty frame
    axes[1, 1].axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Cross-validation evaluation
print("Cross-Validation Results:")
print("=" * 25)

# 5-fold cross-validated F1 for every model. cross_val_score fits clones of
# the estimator, so the models trained earlier are left untouched.
# NOTE(review): this runs on the full dataset, test rows included, so these
# scores are not independent of the hold-out results reported earlier.
cv_results = {}
for name, model in models.items():
    cv_scores = cross_val_score(model, X_selected, y, cv=5, scoring='f1')
    cv_results[name] = cv_scores

    # Mean F1 with a +/- two-standard-deviation spread across the folds
    print(f"{name:20}: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Visualize cross-validation results (one box per model, one point per fold)
fig, ax = plt.subplots(figsize=(10, 6))
cv_df = pd.DataFrame(cv_results)
cv_df.boxplot(ax=ax)
ax.set_title('Cross-Validation F1-Scores')
ax.set_ylabel('F1-Score')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Keep a reference to the best model for later use.
# The model only lives in memory: nothing is written to disk here, so the
# message says "selected" instead of claiming that it was saved.
best_model = trained_models[best_model_name]
best_scores = results[best_model_name]
print(f"\nBest model ({best_model_name}) selected for further use.")
print("Final performance on test set:")
print(f"Accuracy: {best_scores['accuracy']:.4f}")
print(f"F1-Score: {best_scores['f1']:.4f}")
# AUC is None for a model without predict_proba; formatting None with :.4f
# would raise TypeError, so guard it the same way the training loop does
if best_scores['auc'] is not None:
    print(f"AUC: {best_scores['auc']:.4f}")
