### Goal: Compare different classification models in the task to predict whether a user will purchase based on age and estimated salary.

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)

# Load dataset
data = pd.read_csv('../datasets/social-network-ads.csv')

# split into features and target
X = data.iloc[:, :-1].values  # Features: Age, EstimatedSalary
y = data.iloc[:, -1].values  # Target: Purchased

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)    

# Prepare scaled versions for models that need it - Logistic Regression, SVM, KNN
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model configs
# What each hyperparameter means:
# - C: Inverse of regularization strength; smaller values specify stronger regularization.
# - max_depth: Maximum depth of the tree; controls overfitting.
# - n_estimators: Number of trees in the forest; more trees can lead to better performance but also longer training time.
# - n_neighbors: Number of neighbors to use for KNN. More neighbors can smooth out noise but may also miss local patterns.
# - kernel: Specifies the kernel type to be used in SVM; 'linear' is a linear kernel, 'rbf' is a radial basis function kernel.
# - probability: If True, enables probability estimates for SVM, which is useful for ROC AUC calculation.
model_configs = [
    {
        'name': 'Logistic Regression',
        'model': LogisticRegression,
        'params': {'C': [0.01, 0.1, 1, 10, 100]},
        'use_scaling': True
    },
    {
        'name': 'Decision Tree',
        'model': DecisionTreeClassifier,
        'params': {'max_depth': [None, 5, 10, 15, 20]},
        'use_scaling': False
    },
    {
        'name': 'Random Forest',
        'model': RandomForestClassifier,
        'params': {'n_estimators': [10, 50, 100, 200]},
        'use_scaling': False
    },
    {
        'name': 'K-Nearest Neighbors',
        'model': KNeighborsClassifier,
        'params': {'n_neighbors': [1, 3, 5, 7, 9]},
        'use_scaling': True
    },
    {
        'name': 'Support Vector Machine',
        'model': SVC,
        'params': {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'rbf'], 'probability': [True]},
        'use_scaling': True
    },
    {
        'name': 'Naive Bayes',
        'model': GaussianNB,
        'params': {},
        'use_scaling': True
    }
]

results = []

for config in model_configs:
    param_grid = list(ParameterGrid(config['params'])) if config['params'] else [{}]
    for params in param_grid:
        # Prepare data
        Xtr = X_train_scaled if config['use_scaling'] else X_train
        Xte = X_test_scaled if config['use_scaling'] else X_test

        # Build and train model
        model = config['model'](**params)
        start_train = time.time()
        model.fit(Xtr, y_train)
        train_time = time.time() - start_train

        # Predict
        start_pred = time.time()
        y_pred = model.predict(Xte)
        pred_time = time.time() - start_pred

        # Probabilities for ROC AUC
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(Xte)[:, 1]
        elif hasattr(model, "decision_function"):
            # For SVM with probability=False
            y_proba = model.decision_function(Xte)
            y_proba = (y_proba - y_proba.min()) / (y_proba.max() - y_proba.min())  # scale to [0,1]
        else:
            y_proba = y_pred  # fallback

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        try:
            auc = roc_auc_score(y_test, y_proba)
        except:
            auc = np.nan
        cm = confusion_matrix(y_test, y_pred)

        # Store results
        results.append({
            'Model': config['name'],
            'Params': params,
            'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1 Score': f1,
            'ROC AUC': auc,
            'Train Time (s)': train_time,
            'Predict Time (s)': pred_time,
            'Confusion Matrix': cm
        })

# Results DataFrame
results_df = pd.DataFrame(results)
# Show only key columns for comparison
display_cols = ['Model', 'Params', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Train Time (s)', 'Predict Time (s)']

# Which should you sort by?
# If your classes are balanced and both errors are equally important:
# Sort by Accuracy or F1 Score.
# If you care more about not missing buyers (recall):
# Sort by Recall or F1 Score.
# If you care more about only targeting likely buyers (precision):
# Sort by Precision.
# If your classes are imbalanced or you want a robust, threshold-independent metric:
# Sort by ROC AUC (recommended in most business cases).

    
# Display results
print("\nClassification Model Comparison Results:\n")
print(results_df[display_cols].sort_values(['Accuracy', 'ROC AUC'], ascending=[False, False]).to_string(index=False))



Classification Model Comparison Results:

                 Model                                               Params  Accuracy  Precision   Recall  F1 Score  ROC AUC  Train Time (s)  Predict Time (s)
           Naive Bayes                                                   {}    0.9375   0.925926 0.892857  0.909091 0.986951        0.000325          0.000113
   K-Nearest Neighbors                                   {'n_neighbors': 7}    0.9375   0.870968 0.964286  0.915254 0.972871        0.000319          0.001940
   K-Nearest Neighbors                                   {'n_neighbors': 9}    0.9375   0.870968 0.964286  0.915254 0.970810        0.000295          0.001857
Support Vector Machine     {'C': 100, 'kernel': 'rbf', 'probability': True}    0.9375   0.870968 0.964286  0.915254 0.947115        0.007400          0.000329
Support Vector Machine     {'C': 0.1, 'kernel': 'rbf', 'probability': True}    0.9250   0.866667 0.928571  0.896552 0.980082        0.004689          0.000546
   