In [4]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.preprocessing import label_binarize
from itertools import cycle
import matplotlib.pyplot as plt
import seaborn as sns

# Create directories if they don't exist
os.makedirs('models', exist_ok=True)
os.makedirs('figures', exist_ok=True)

# Load the data splits
try:
    # First try to load the enhanced data splits
    with open('data/enhanced_train_test_split.pkl', 'rb') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)
    print("Successfully loaded enhanced training and testing data")
except FileNotFoundError:
    try:
        # If enhanced splits don't exist, try the regular splits
        with open('data/train_test_split.pkl', 'rb') as f:
            X_train, X_test, y_train, y_test = pickle.load(f)
        print("Successfully loaded training and testing data")
    except FileNotFoundError:
        print("Error: No train_test_split.pkl file found. Make sure you've run the data preprocessing script first.")
        raise

# Check unique classes in the target variable
unique_classes = np.unique(y_train)
num_classes = len(unique_classes)
print(f"Number of unique classes in target: {num_classes}")
print(f"Classes: {unique_classes}")

# Check if binary or multiclass
is_binary = num_classes == 2
print(f"Binary classification task: {is_binary}")

# Define models with hyperparameter grids
model_params = {
    'SVM': {
        'model': SVC(probability=True, random_state=42),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1]
        }
    }
}

# Train and evaluate each model with hyperparameter tuning
best_models = {}
for name, mp in model_params.items():
    print(f"\nTraining {name}...")
    
    # Grid search
    grid_search = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    # Best model
    best_model = grid_search.best_estimator_
    
    # Evaluate
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)
    
    # Store results
    best_models[name] = {
        'model': best_model,
        'accuracy': accuracy,
        'report': report,
        'best_params': grid_search.best_params_,
        'predictions': y_pred
    }
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{report}\n")
    
    # Confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'figures/advanced_confusion_matrix_{name.replace(" ", "_")}.png')
    plt.close()

# If binary classification, plot ROC curves
if is_binary:
    plt.figure(figsize=(10, 8))
    
    for name, results in best_models.items():
        model = results['model']
        y_proba = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        roc_auc = auc(fpr, tpr)
        
        # Store ROC data
        best_models[name]['roc_auc'] = roc_auc
        
        # Plot ROC curve
        plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.3f})")
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Different Models')
    plt.legend(loc='lower right')
    plt.savefig('figures/advanced_roc_curves.png')
    plt.close()
    
    # Select best model based on ROC AUC
    best_model_name = max(best_models, key=lambda k: best_models[k].get('roc_auc', 0))
    
    print(f"\nBest model (based on ROC AUC): {best_model_name}")
    print(f"Parameters: {best_models[best_model_name]['best_params']}")
    print(f"Accuracy: {best_models[best_model_name]['accuracy']:.4f}")
    print(f"ROC AUC: {best_models[best_model_name]['roc_auc']:.4f}")
else:
    # For multiclass, use accuracy as the primary metric
    # Plot precision-recall curves for each class (if desired)
    # This code is commented out as it's more complex and may not be needed
    """
    # Binarize the output for precision-recall curve
    y_test_bin = label_binarize(y_test, classes=unique_classes)
    
    # Plot precision-recall curve for each class
    plt.figure(figsize=(12, 9))
    
    for name, results in best_models.items():
        model = results['model']
        y_score = model.predict_proba(X_test)
        
        # For each class
        precision = dict()
        recall = dict()
        avg_precision = dict()
        
        for i, class_id in enumerate(unique_classes):
            precision[i], recall[i], _ = precision_recall_curve(
                y_test_bin[:, i] if y_test_bin.ndim > 1 else (y_test == class_id).astype(int),
                y_score[:, i]
            )
            avg_precision[i] = average_precision_score(
                y_test_bin[:, i] if y_test_bin.ndim > 1 else (y_test == class_id).astype(int),
                y_score[:, i]
            )
        
        # Compute micro-average PR curve
        precision["micro"], recall["micro"], _ = precision_recall_curve(
            y_test_bin.ravel() if y_test_bin.ndim > 1 else y_test,
            np.concatenate([y_score[:, i:i+1] for i in range(len(unique_classes))], axis=1).ravel()
        )
        
        avg_precision["micro"] = average_precision_score(
            y_test_bin, y_score, average="micro"
        ) if y_test_bin.ndim > 1 else average_precision_score(
            (y_test[:, np.newaxis] == unique_classes).astype(int), y_score, average="micro"
        )
        
        plt.plot(recall["micro"], precision["micro"],
                 label=f'{name} (AP = {avg_precision["micro"]:0.2f})')
    
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves (Micro-Average)')
    plt.legend(loc="best")
    plt.savefig('figures/advanced_precision_recall_curves.png')
    plt.close()
    """
    
    # Select best model based on accuracy
    best_model_name = max(best_models, key=lambda k: best_models[k]['accuracy'])
    
    print(f"\nBest model (based on accuracy): {best_model_name}")
    print(f"Parameters: {best_models[best_model_name]['best_params']}")
    print(f"Accuracy: {best_models[best_model_name]['accuracy']:.4f}")

# Create a comparison plot of model accuracies
plt.figure(figsize=(10, 6))
model_names = list(best_models.keys())
accuracies = [best_models[name]['accuracy'] for name in model_names]

bars = plt.bar(model_names, accuracies, color=['blue', 'green', 'purple'])
plt.title('Advanced Model Accuracy Comparison', fontsize=16)
plt.ylabel('Accuracy', fontsize=14)
plt.ylim(0, 1.0)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Add accuracy values on top of bars
for bar, accuracy in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{accuracy:.4f}', ha='center', fontsize=12)

plt.savefig('figures/advanced_model_accuracy_comparison.png')
plt.close()

# Save the best performing model
best_model = best_models[best_model_name]['model']
with open('models/best_advanced_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"\nBest advanced model saved to models/best_advanced_model.pkl")

# Save all models
for name, result in best_models.items():
    with open(f'models/advanced_{name.replace(" ", "_").lower()}_model.pkl', 'wb') as f:
        pickle.dump(result['model'], f)
    print(f"Saved {name} model to models/advanced_{name.replace(' ', '_').lower()}_model.pkl")

print("\nAdvanced model training and evaluation completed successfully!")

Successfully loaded enhanced training and testing data
Number of unique classes in target: 5
Classes: [0 1 2 3 4]
Binary classification task: False

Training SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.5574
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.97      0.82        29
           1       0.33      0.17      0.22        12
           2       0.33      0.22      0.27         9
           3       0.20      0.29      0.24         7
           4       0.00      0.00      0.00         4

    accuracy                           0.56        61
   macro avg       0.32      0.33      0.31        61
weighted avg       0.48      0.56      0.50        61



Training Gradient Boosting...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Accuracy:



Best parameters: {'learning_rate': 0.01, 'n_estimators': 100}
Accuracy: 0.5246
Classification Report:
              precision    recall  f1-score   support

           0       0.58      1.00      0.73        29
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00         9
           3       0.27      0.43      0.33         7
           4       0.00      0.00      0.00         4

    accuracy                           0.52        61
   macro avg       0.17      0.29      0.21        61
weighted avg       0.31      0.52      0.39        61



Best model (based on accuracy): SVM
Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.5574

Best advanced model saved to models/best_advanced_model.pkl
Saved SVM model to models/advanced_svm_model.pkl
Saved Gradient Boosting model to models/advanced_gradient_boosting_model.pkl
Saved AdaBoost model to models/advanced_adaboost_model.pkl

Advanced model training and evaluation complet