PHASE THREE: Model Training, Comparison, and Selection

In [None]:
# Uncomment to install the boosting libraries into this kernel's environment:
# %pip install xgboost lightgbm catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import joblib

# Metrics
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    precision_score, 
    recall_score, 
    roc_auc_score, 
    classification_report, 
    confusion_matrix
)

# Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, 
    ExtraTreesClassifier, 
    AdaBoostClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Helper
from collections import Counter

print("All libraries imported successfully!")

In [None]:
# Load the processed data arrays.
# NOTE(review): allow_pickle=True unpickles whatever the file contains, so only
# load .npy files that this project's own preprocessing step produced.
# .item() pulls out the single Python object held by the loaded array (it only
# works on one-element arrays) -- presumably a sparse feature matrix; confirm
# against the preprocessing notebook.
X_train_resampled = np.load(
    '../data/processed/X_train_resampled.npy', allow_pickle=True
).item()
X_test_processed = np.load(
    '../data/processed/X_test_processed.npy', allow_pickle=True
).item()

# The target vectors are used exactly as np.load returns them (no .item()).
y_train_resampled = np.load(
    '../data/processed/y_train_resampled.npy', allow_pickle=True
)
y_test = np.load('../data/processed/y_test.npy', allow_pickle=True)

# Load the label encoder; its classes_ supply the display names for the targets
le = joblib.load('../models/label_encoder.joblib')
labels = le.classes_

# Quick sanity check of what was loaded
print("Data loaded successfully.")
print(f"X_train_resampled shape: {X_train_resampled.shape}")
print(f"y_train_resampled shape: {y_train_resampled.shape}")
print(f"X_test_processed shape: {X_test_processed.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"Target labels: {labels}")  # Should be ['Dropout', 'Enrolled', 'Graduate']

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator that accepts random_state is seeded with 42 for reproducibility
# (KNeighborsClassifier has no such parameter).
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # 'log_loss' makes SGD expose predict_proba, which the ROC-AUC computation needs
    "SGD Classifier": SGDClassifier(loss='log_loss', random_state=42),
    # probability=True is needed for ROC-AUC (enables predict_proba)
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    # use_label_encoder is intentionally not passed: XGBoost deprecated it in 1.6
    # and removed it in 2.0, where supplying it only produces a
    # "Parameters ... are not used" warning at fit time.
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    # verbose=0 stops CatBoost from printing training logs
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
}

print(f"Defined {len(models)} models.")

In [None]:
# Fit every candidate on the resampled training data and score it on the test set.
# results_list is rebuilt from scratch so re-running this cell never duplicates rows.
results_list = []

for name, model in models.items():
    print(f"--- Training {name} ---")

    # 1. Train the model.
    #    Only fit() is timed, so "Train Time (s)" is not inflated by the cost of
    #    predict()/predict_proba() (which can dominate for lazy learners such as KNN).
    #    perf_counter() is a monotonic clock, unlike time.time().
    start_time = time.perf_counter()
    model.fit(X_train_resampled, y_train_resampled)
    train_time = time.perf_counter() - start_time

    # 2. Get predictions on the TEST set
    y_pred = model.predict(X_test_processed)

    # 3. Get probability predictions (for ROC-AUC)
    y_proba = model.predict_proba(X_test_processed)

    # 4. Calculate metrics (weighted averages account for class-size differences)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # ROC-AUC for multi-class: one-vs-rest over the per-class probabilities
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

    # 5. Store results (one record per model; converted to a DataFrame later)
    results_list.append({
        "Model": name,
        "F1 (Weighted)": f1,
        "Accuracy": accuracy,
        "ROC-AUC (OVR)": roc_auc,
        "Precision (Weighted)": precision,
        "Recall (Weighted)": recall,
        "Train Time (s)": train_time
    })

    print(f"Finished {name} in {train_time:.2f}s.\n")

# Note: SVC might take a few minutes. This is normal.

In [None]:
# Turn the per-model records into a table, ranked best-first by the
# primary metric, F1 (Weighted)
results_df = pd.DataFrame(results_list).sort_values(
    by="F1 (Weighted)",
    ascending=False,
)

print("--- Model Comparison ---")
display(results_df)

In [None]:
# results_df is sorted best-first, so its top row names the winning model
best_model_name = results_df['Model'].iloc[0]
print(f"--- In-Depth Analysis for Best Model: {best_model_name} ---")

# Reuse the estimator that the comparison loop already fitted rather than
# retraining it. In a production setting the chosen model would be refit on
# the full dataset (or the full resampled training set).
# NOTE(review): the winner was chosen using test-set scores, so the numbers
# below are an optimistic estimate of performance on unseen data.
final_model = models[best_model_name]

# Recompute the test-set predictions for the selected model
y_pred_final = final_model.predict(X_test_processed)

# 1. Classification report: precision, recall and f1-score for EACH class
print("\nClassification Report:")
report = classification_report(y_test, y_pred_final, target_names=labels)
print(report)

# 2. Confusion matrix, drawn as an annotated heatmap of raw counts
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred_final)
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title(f'Confusion Matrix for {best_model_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Persist the selected estimator in the 'models' folder (the same folder the
# label encoder is loaded from) so it can be reloaded with joblib.load
model_save_path = "../models/best_model.joblib"
joblib.dump(final_model, model_save_path)

print(f"Best model ({best_model_name}) saved to {model_save_path}")