In [None]:
!pip install --upgrade pip
!pip install pandas numpy matplotlib seaborn scikit-learn
!pip install tensorflow tensorflow-gpu
!pip install keras
!pip install plotly
!pip install torch


Collecting tensorflow-gpu
  Using cached tensorflow-gpu-2.12.0.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, classification_report, f1_score
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Seed PyTorch and NumPy so repeated runs draw the same random numbers
torch.manual_seed(42)
np.random.seed(42)

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Function to load and preprocess data
def load_data(train_path, test_path):
    """Read the train/test CSVs and return model-ready arrays and tensors.

    The 'income' column is the target: rows equal to '>50K' become 1, all
    others 0. int64/float64 feature columns are standardised and
    object/category columns are one-hot encoded; the preprocessor is fitted on
    the training split only and then applied to the test split.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        A 10-tuple ``(X_train_processed, X_test_processed, y_train, y_test,
        X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
        preprocessor, n_features)``. The first four items are NumPy arrays,
        the tensors are float32 on the module-level ``device`` (targets are
        shaped ``(-1, 1)``), and ``n_features`` is the encoded column count.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target (0 for <=50K, 1 for >50K)
    X_train = train_data.drop('income', axis=1)
    y_train = (train_data['income'] == '>50K').astype(int)

    X_test = test_data.drop('income', axis=1)
    y_test = (test_data['income'] == '>50K').astype(int)

    # Identify numeric and categorical columns
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Numeric features: {numeric_features}")
    print(f"Categorical features: {categorical_features}")

    # ColumnTransformer drops every column it was not told about, so make that
    # visible instead of silently losing features (e.g. bool columns).
    ignored_columns = [col for col in X_train.columns
                       if col not in numeric_features and col not in categorical_features]
    if ignored_columns:
        print(f"Warning: columns with unsupported dtypes are dropped by preprocessing: {ignored_columns}")

    # Numeric features: standard scaling
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # Categorical features: one-hot encoding; categories unseen during fit encode as all zeros
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit on the training split only so no test statistics leak into the scaler/encoder
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    def to_float_tensor(array):
        # float32 tensor placed directly on the selected device
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    X_train_tensor = to_float_tensor(X_train_processed)
    y_train_tensor = to_float_tensor(y_train.values).reshape(-1, 1)
    X_test_tensor = to_float_tensor(X_test_processed)
    y_test_tensor = to_float_tensor(y_test.values).reshape(-1, 1)

    print(f"X_train shape after preprocessing: {X_train_processed.shape}")
    print(f"X_test shape after preprocessing: {X_test_processed.shape}")

    return (X_train_processed, X_test_processed, y_train.values, y_test.values,
            X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
            preprocessor, X_train_processed.shape[1])

# Neural Network Model using PyTorch
class IncomeNN(nn.Module):
    """Fully connected binary classifier with three hidden blocks and a sigmoid output.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3) with
    widths 128, 64 and 32; a final Linear(32, 1) followed by a sigmoid yields
    one value in (0, 1) per input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # Hidden blocks, registered in order: fc1/bn1, fc2/bn2, fc3/bn3
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        # Output head
        self.fc4 = nn.Linear(32, 1)
        # Stateless layers shared by every hidden block
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, 1) sigmoid outputs."""
        hidden_blocks = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in hidden_blocks:
            x = self.dropout(self.relu(norm(linear(x))))
        return self.sigmoid(self.fc4(x))

# Function to train the PyTorch neural network
def train_nn_model(X_train_tensor, y_train_tensor, X_test_tensor, input_size, batch_size=64, epochs=20):
    """Train an IncomeNN with Adam/BCE and predict labels for the test tensor.

    Args:
        X_train_tensor: Training features.
        y_train_tensor: Training targets, same leading dimension as the features.
        X_test_tensor: Features to predict once training has finished.
        input_size: Number of input features expected by the network.
        batch_size: Mini-batch size used by the DataLoader.
        epochs: Number of passes over the training data.

    Returns:
        ``(model, y_pred, training_time)``: the trained network, a flat NumPy
        int array of 0/1 predictions (model output thresholded at 0.5), and
        the wall-clock training time in seconds.
    """
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

    # BatchNorm1d raises in training mode when a batch contains a single sample.
    # If the dataset size leaves exactly one row for the final mini-batch, drop
    # that tail batch (shuffling means a different row is skipped each epoch).
    n_samples = len(train_dataset)
    drop_single_tail = n_samples > batch_size and n_samples % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_single_tail)

    # Initialize model
    model = IncomeNN(input_size).to(device)

    # BCELoss expects probabilities, which the network's sigmoid output provides
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Report the mean batch loss every fifth epoch
        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

    training_time = time.time() - start_time
    print(f'Training completed in {training_time:.2f} seconds')

    # eval() switches dropout off and makes batch norm use its running statistics
    model.eval()
    with torch.no_grad():
        y_pred_proba = model(X_test_tensor)
        y_pred = (y_pred_proba > 0.5).cpu().numpy().astype(int).flatten()

    return model, y_pred, training_time

# Function to evaluate models
def evaluate_model(y_test, y_pred, model_name, training_time=None, pred_time=None):
    """Print and return classification metrics for binary 0/1 predictions.

    Args:
        y_test: True labels (1 for >50K, 0 for <=50K).
        y_pred: Predicted labels.
        model_name: Label used in the printed report and the returned dict.
        training_time: Optional training duration in seconds.
        pred_time: Optional prediction duration in seconds.

    Returns:
        Dict with keys 'Model', 'Accuracy', 'Sensitivity', 'Specificity',
        'F1 Score', 'Confusion Matrix', 'Training Time' and 'Prediction Time'.
    """
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # zero_division=0 keeps the score at 0 (the value scikit-learn falls back to)
    # without emitting UndefinedMetricWarning when a model predicts one class only.
    # Sensitivity (Recall for >50K class or 1)
    sensitivity = recall_score(y_test, y_pred, pos_label=1, zero_division=0)

    # Specificity (Recall for <=50K class or 0)
    specificity = recall_score(y_test, y_pred, pos_label=0, zero_division=0)

    # F1 score
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Print metrics
    print(f"\n--- {model_name} Results ---")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Sensitivity (Recall for >50K): {sensitivity:.4f}")
    print(f"Specificity (Recall for <=50K): {specificity:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Compare against None so a measured duration of 0.0 is still reported
    if training_time is not None:
        print(f"Training Time: {training_time:.4f} seconds")
    if pred_time is not None:
        print(f"Prediction Time: {pred_time:.4f} seconds")

    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'F1 Score': f1,
        'Confusion Matrix': cm,
        'Training Time': training_time,
        'Prediction Time': pred_time
    }

def _fit_and_score(build_model, model_name, data, threshold=None):
    """Build, fit and evaluate one scikit-learn-style estimator, timing fit and predict.

    Args:
        build_model: Zero-argument callable returning an unfitted estimator;
            construction is included in the training time.
        model_name: Label passed to ``evaluate_model``.
        data: Tuple ``(X_train, y_train, X_test, y_test)``.
        threshold: When given, raw predictions are binarised with
            ``pred > threshold`` (used for regressors acting as classifiers);
            the thresholding is included in the prediction time.

    Returns:
        The metrics dict produced by ``evaluate_model``.
    """
    X_train, y_train, X_test, y_test = data

    start_time = time.time()
    model = build_model()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    start_time = time.time()
    preds = model.predict(X_test)
    if threshold is not None:
        preds = (preds > threshold).astype(int)
    pred_time = time.time() - start_time

    return evaluate_model(y_test, preds, model_name,
                          training_time=training_time,
                          pred_time=pred_time)

# Main execution
def main(data_dir="/content/drive/MyDrive/Colab Notebooks"):
    """Train every model, compare their metrics, and write the comparison artefacts.

    Args:
        data_dir: Directory holding ``train_preprocessed.csv`` and
            ``test_preprocessed.csv``; the comparison plot, CSV and text
            summary are written to the same directory.
    """
    print("Loading and preprocessing data...")
    train_path = f"{data_dir}/train_preprocessed.csv"
    test_path = f"{data_dir}/test_preprocessed.csv"

    (X_train_processed, X_test_processed, y_train, y_test,
     X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
     preprocessor, input_size) = load_data(train_path, test_path)

    data = (X_train_processed, y_train, X_test_processed, y_test)
    results = []

    # 1. Neural Network with PyTorch (GPU-accelerated)
    print("\nTraining Neural Network model with PyTorch on GPU...")
    nn_model, nn_preds, nn_training_time = train_nn_model(
        X_train_tensor, y_train_tensor, X_test_tensor, input_size, batch_size=128, epochs=20
    )
    results.append(
        evaluate_model(y_test, nn_preds, "Neural Network (PyTorch)", training_time=nn_training_time)
    )

    # 2. XGBoost with GPU acceleration
    # tree_method='gpu_hist' is deprecated; GPU training is requested with
    # tree_method='hist' together with device='cuda'.
    print("\nTraining XGBoost model with GPU acceleration...")
    xgb_params = dict(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        tree_method='hist',
        random_state=42
    )
    try:
        xgb_results = _fit_and_score(
            lambda: xgb.XGBClassifier(device='cuda', **xgb_params), "XGBoost (GPU)", data
        )
    except Exception as e:
        # Deliberate best-effort: any failure on the GPU path retries on the CPU
        print(f"Error with GPU-accelerated XGBoost: {e}")
        print("Falling back to CPU XGBoost")
        xgb_results = _fit_and_score(
            lambda: xgb.XGBClassifier(**xgb_params), "XGBoost (CPU)", data
        )
    results.append(xgb_results)

    # 3-8. Classical scikit-learn models: (name, estimator factory, threshold).
    # Lasso and Ridge are regressors, so their continuous output is cut at 0.5.
    classical_models = [
        ("Random Forest",
         lambda: RandomForestClassifier(
             n_estimators=100,
             max_depth=10,
             min_samples_split=5,
             min_samples_leaf=2,
             n_jobs=-1,  # Use all CPU cores
             random_state=42
         ),
         None),
        ("Logistic Regression",
         lambda: LogisticRegression(
             C=1.0,
             max_iter=1000,
             n_jobs=-1,
             random_state=42
         ),
         None),
        # NOTE(review): in the recorded run this Lasso predicted <=50K for every
        # test row (sensitivity 0.0); alpha=0.1 is probably too strong a penalty
        # for a 0/1 target - confirm before relying on this baseline.
        ("Lasso Regression",
         lambda: Lasso(
             alpha=0.1,  # Regularization strength
             max_iter=1000,
             random_state=42
         ),
         0.5),
        ("Ridge Regression",
         lambda: Ridge(
             alpha=1.0,  # Regularization strength
             max_iter=1000,
             random_state=42
         ),
         0.5),
        ("CART Decision Tree",
         lambda: DecisionTreeClassifier(
             max_depth=10,
             min_samples_split=5,
             min_samples_leaf=2,
             random_state=42
         ),
         None),
        # probability=True is omitted: it adds an internal cross-validation to
        # fit() and only affects predict_proba, which this comparison never calls.
        ("SVM",
         lambda: SVC(
             C=1.0,
             kernel='rbf',
             gamma='scale',
             random_state=42
         ),
         None),
    ]
    for model_name, build_model, threshold in classical_models:
        print(f"\nTraining {model_name} model...")
        results.append(_fit_and_score(build_model, model_name, data, threshold=threshold))

    # Comparison table: every scalar metric, without the confusion matrices
    comparison_columns = ['Model', 'Accuracy', 'Sensitivity', 'Specificity',
                          'F1 Score', 'Training Time', 'Prediction Time']
    comparison_df = pd.DataFrame(
        [{column: r[column] for column in comparison_columns} for r in results]
    )

    print("\n--- Model Comparison ---")
    print(comparison_df)

    # Side-by-side figure: quality metrics on the left, training time on the right
    fig, (ax_metrics, ax_time) = plt.subplots(1, 2, figsize=(12, 6))

    metrics_df = pd.melt(
        comparison_df[['Model', 'Accuracy', 'Sensitivity', 'Specificity', 'F1 Score']],
        id_vars=['Model'], var_name='Metric', value_name='Value'
    )
    sns.barplot(x='Model', y='Value', hue='Metric', data=metrics_df, ax=ax_metrics)
    ax_metrics.set_title('Model Performance Comparison')
    ax_metrics.tick_params(axis='x', labelrotation=45)
    ax_metrics.set_ylim(0, 1)

    # Models without a recorded training time hold None/NaN in this column
    time_df = comparison_df[['Model', 'Training Time']].dropna(subset=['Training Time'])
    sns.barplot(x='Model', y='Training Time', data=time_df, ax=ax_time)
    ax_time.set_title('Training Time Comparison (seconds)')
    ax_time.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    fig.savefig(f"{data_dir}/model_comparison.png")
    plt.close(fig)

    # Find the best model based on different metrics
    best_acc_model = comparison_df.loc[comparison_df['Accuracy'].idxmax()]
    best_sens_model = comparison_df.loc[comparison_df['Sensitivity'].idxmax()]
    best_spec_model = comparison_df.loc[comparison_df['Specificity'].idxmax()]
    best_f1_model = comparison_df.loc[comparison_df['F1 Score'].idxmax()]

    # Print the best models
    print(f"\nBest model based on accuracy: {best_acc_model['Model']} with accuracy {best_acc_model['Accuracy']:.4f}")
    print(f"Best model based on sensitivity: {best_sens_model['Model']} with sensitivity {best_sens_model['Sensitivity']:.4f}")
    print(f"Best model based on specificity: {best_spec_model['Model']} with specificity {best_spec_model['Specificity']:.4f}")
    print(f"Best model based on F1 score: {best_f1_model['Model']} with F1 score {best_f1_model['F1 Score']:.4f}")

    # Determine overall best model (based on accuracy as primary metric)
    best_model = best_acc_model
    print(f"\nOVERALL BEST MODEL: {best_model['Model']}")
    print(f"  Accuracy: {best_model['Accuracy']:.4f}")
    print(f"  Sensitivity: {best_model['Sensitivity']:.4f}")
    print(f"  Specificity: {best_model['Specificity']:.4f}")
    print(f"  F1 Score: {best_model['F1 Score']:.4f}")

    # Save the comparison results
    comparison_df.to_csv(f"{data_dir}/model_comparison_results.csv", index=False)

    # Write a summary of results
    with open(f"{data_dir}/model_summary.txt", 'w') as f:
        f.write("Income Prediction Model Comparison\n")
        f.write("=================================\n\n")
        f.write(f"Best model based on accuracy: {best_acc_model['Model']}\n")
        f.write(f"  Accuracy: {best_acc_model['Accuracy']:.4f}\n")
        f.write(f"  Sensitivity: {best_acc_model['Sensitivity']:.4f}\n")
        f.write(f"  Specificity: {best_acc_model['Specificity']:.4f}\n")
        f.write(f"  F1 Score: {best_acc_model['F1 Score']:.4f}\n\n")

        f.write(f"Best model based on F1 score: {best_f1_model['Model']}\n")
        f.write(f"  F1 Score: {best_f1_model['F1 Score']:.4f}\n")
        f.write(f"  Accuracy: {best_f1_model['Accuracy']:.4f}\n")
        f.write(f"  Sensitivity: {best_f1_model['Sensitivity']:.4f}\n")
        f.write(f"  Specificity: {best_f1_model['Specificity']:.4f}\n\n")

        f.write("All Models Performance:\n")
        f.write(comparison_df.to_string())

# Run the whole pipeline only when this code is executed directly (not when it is imported)
if __name__ == "__main__":
    main()

Using device: cuda
Loading and preprocessing data...
Numeric features: ['age', 'education-num', 'hours-per-week']
Categorical features: ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
X_train shape after preprocessing: (24147, 85)
X_test shape after preprocessing: (6015, 85)

Training Neural Network model with PyTorch on GPU...
Epoch [5/20], Loss: 0.3515
Epoch [10/20], Loss: 0.3431
Epoch [15/20], Loss: 0.3406
Epoch [20/20], Loss: 0.3374
Training completed in 21.72 seconds

--- Neural Network (PyTorch) Results ---
Confusion Matrix:
[[4072  456]
 [ 571  916]]
Accuracy: 0.8293
Sensitivity (Recall for >50K): 0.6160
Specificity (Recall for <=50K): 0.8993
F1 Score: 0.6408
Training Time: 21.7173 seconds

Training XGBoost model with GPU acceleration...



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"




--- XGBoost (GPU) Results ---
Confusion Matrix:
[[4147  381]
 [ 612  875]]
Accuracy: 0.8349
Sensitivity (Recall for >50K): 0.5884
Specificity (Recall for <=50K): 0.9159
F1 Score: 0.6380
Training Time: 0.5591 seconds
Prediction Time: 0.0163 seconds

Training Random Forest model...

--- Random Forest Results ---
Confusion Matrix:
[[4260  268]
 [ 774  713]]
Accuracy: 0.8268
Sensitivity (Recall for >50K): 0.4795
Specificity (Recall for <=50K): 0.9408
F1 Score: 0.5778
Training Time: 1.1428 seconds
Prediction Time: 0.0442 seconds

Training Logistic Regression model...

--- Logistic Regression Results ---
Confusion Matrix:
[[4136  392]
 [ 660  827]]
Accuracy: 0.8251
Sensitivity (Recall for >50K): 0.5562
Specificity (Recall for <=50K): 0.9134
F1 Score: 0.6112
Training Time: 1.5079 seconds
Prediction Time: 0.0015 seconds

Training Lasso Regression model...

--- Lasso Regression Results ---
Confusion Matrix:
[[4528    0]
 [1487    0]]
Accuracy: 0.7528
Sensitivity (Recall for >50K): 0.0000
Speci

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, accuracy_score, recall_score, classification_report,
                            f1_score, precision_score, roc_auc_score, average_precision_score,
                            matthews_corrcoef, balanced_accuracy_score, precision_recall_curve)
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Fix the NumPy and PyTorch random seeds so results can be reproduced
for seed_everything in (np.random.seed, torch.manual_seed):
    seed_everything(42)

# Run on CUDA when it is available, on the CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Function to load and preprocess data
def load_data(train_path, test_path):
    """Read the train/test CSVs and return model-ready arrays and tensors.

    The 'income' column is the target: rows equal to '>50K' become 1, all
    others 0. int64/float64 feature columns are standardised and
    object/category columns are one-hot encoded; the preprocessor is fitted on
    the training split only and then applied to the test split.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        A 10-tuple ``(X_train_processed, X_test_processed, y_train, y_test,
        X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
        preprocessor, n_features)``. The first four items are NumPy arrays,
        the tensors are float32 on the module-level ``device`` (targets are
        shaped ``(-1, 1)``), and ``n_features`` is the encoded column count.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target (0 for <=50K, 1 for >50K)
    X_train = train_data.drop('income', axis=1)
    y_train = (train_data['income'] == '>50K').astype(int)

    X_test = test_data.drop('income', axis=1)
    y_test = (test_data['income'] == '>50K').astype(int)

    # Identify numeric and categorical columns
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Numeric features: {numeric_features}")
    print(f"Categorical features: {categorical_features}")

    # ColumnTransformer drops every column it was not told about, so make that
    # visible instead of silently losing features (e.g. bool columns).
    ignored_columns = [col for col in X_train.columns
                       if col not in numeric_features and col not in categorical_features]
    if ignored_columns:
        print(f"Warning: columns with unsupported dtypes are dropped by preprocessing: {ignored_columns}")

    # Numeric features: standard scaling
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # Categorical features: one-hot encoding; categories unseen during fit encode as all zeros
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit on the training split only so no test statistics leak into the scaler/encoder
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    def to_float_tensor(array):
        # float32 tensor placed directly on the selected device
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    X_train_tensor = to_float_tensor(X_train_processed)
    y_train_tensor = to_float_tensor(y_train.values).reshape(-1, 1)
    X_test_tensor = to_float_tensor(X_test_processed)
    y_test_tensor = to_float_tensor(y_test.values).reshape(-1, 1)

    print(f"X_train shape after preprocessing: {X_train_processed.shape}")
    print(f"X_test shape after preprocessing: {X_test_processed.shape}")

    return (X_train_processed, X_test_processed, y_train.values, y_test.values,
            X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
            preprocessor, X_train_processed.shape[1])

# Neural Network Model using PyTorch
class IncomeNN(nn.Module):
    """Multilayer perceptron for binary income classification.

    Architecture: input -> 128 -> 64 -> 32 -> 1. Every hidden Linear layer is
    followed by batch normalisation, ReLU and dropout (p=0.3); the single
    output unit passes through a sigmoid.
    """

    def __init__(self, input_size):
        super().__init__()
        # Create fc1/bn1 .. fc3/bn3 from consecutive layer widths, in that order
        layer_widths = (input_size, 128, 64, 32)
        for position, (n_in, n_out) in enumerate(zip(layer_widths, layer_widths[1:]), start=1):
            setattr(self, f"fc{position}", nn.Linear(n_in, n_out))
            setattr(self, f"bn{position}", nn.BatchNorm1d(n_out))
        self.fc4 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (batch, 1) for a (batch, input_size) input."""
        hidden = self.dropout(self.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(self.relu(self.bn2(self.fc2(hidden))))
        hidden = self.dropout(self.relu(self.bn3(self.fc3(hidden))))
        logits = self.fc4(hidden)
        return self.sigmoid(logits)

# Function to train the PyTorch neural network
def train_nn_model(X_train_tensor, y_train_tensor, X_test_tensor, input_size, batch_size=64, epochs=20):
    """Train an IncomeNN with Adam/BCE and predict labels and scores for the test tensor.

    Args:
        X_train_tensor: Training features.
        y_train_tensor: Training targets, same leading dimension as the features.
        X_test_tensor: Features to predict once training has finished.
        input_size: Number of input features expected by the network.
        batch_size: Mini-batch size used by the DataLoader.
        epochs: Number of passes over the training data.

    Returns:
        ``(model, y_pred, y_pred_proba, training_time)``: the trained network,
        a flat NumPy int array of 0/1 predictions (model output thresholded at
        0.5), the flat NumPy array of raw sigmoid outputs, and the wall-clock
        training time in seconds.
    """
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

    # BatchNorm1d raises in training mode when a batch contains a single sample.
    # If the dataset size leaves exactly one row for the final mini-batch, drop
    # that tail batch (shuffling means a different row is skipped each epoch).
    n_samples = len(train_dataset)
    drop_single_tail = n_samples > batch_size and n_samples % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_single_tail)

    # Initialize model
    model = IncomeNN(input_size).to(device)

    # BCELoss expects probabilities, which the network's sigmoid output provides
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            # Backward and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Report the mean batch loss every fifth epoch
        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

    training_time = time.time() - start_time
    print(f'Training completed in {training_time:.2f} seconds')

    # eval() switches dropout off and makes batch norm use its running statistics
    model.eval()
    with torch.no_grad():
        y_pred_proba = model(X_test_tensor)
        y_pred = (y_pred_proba > 0.5).cpu().numpy().astype(int).flatten()
        y_pred_proba = y_pred_proba.cpu().numpy().flatten()

    return model, y_pred, y_pred_proba, training_time

# Enhanced function to evaluate models
def evaluate_model(y_test, y_pred, y_pred_proba=None, model_name="Model", training_time=None, pred_time=None):
    """Print and return an extended set of binary-classification metrics.

    Args:
        y_test: True 0/1 labels.
        y_pred: Predicted 0/1 labels.
        y_pred_proba: Optional scores for the positive class; when given,
            AUC-ROC and average precision are computed as well.
        model_name: Label used in the printed report and the returned dict.
        training_time: Optional training duration in seconds.
        pred_time: Optional prediction duration in seconds.

    Returns:
        Dict with keys 'Model', 'Accuracy', 'Balanced Accuracy', 'Precision',
        'Sensitivity', 'Specificity', 'F1 Score', 'MCC', 'Confusion Matrix',
        'Training Time' and 'Prediction Time', plus 'AUC-ROC' and 'AP' when
        they could be computed.
    """
    # Pin the label order so the matrix is always 2x2, even when only one class
    # occurs in y_test/y_pred; otherwise the four-way unpacking below fails.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    accuracy = accuracy_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    # zero_division=0 keeps undefined ratios at 0 without UndefinedMetricWarning
    # when a model predicts a single class.
    # Precision (Positive Predictive Value)
    precision = precision_score(y_test, y_pred, zero_division=0)

    # Sensitivity/Recall (True Positive Rate)
    sensitivity = recall_score(y_test, y_pred, pos_label=1, zero_division=0)

    # Specificity (True Negative Rate)
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # F1 score
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_test, y_pred)

    # AUC-ROC and Average Precision (AP) need scores; a failure here only
    # skips these two metrics and is reported instead of aborting the evaluation.
    auc_roc = None
    avg_precision = None

    if y_pred_proba is not None:
        try:
            auc_roc = roc_auc_score(y_test, y_pred_proba)
            avg_precision = average_precision_score(y_test, y_pred_proba)
        except Exception as e:
            print(f"Warning: Could not calculate AUC/AP: {e}")

    # Print metrics
    print(f"\n--- {model_name} Results ---")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Sensitivity/Recall: {sensitivity:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient: {mcc:.4f}")

    if auc_roc is not None:
        print(f"AUC-ROC: {auc_roc:.4f}")
    if avg_precision is not None:
        print(f"Average Precision: {avg_precision:.4f}")

    # Compare against None so a measured duration of 0.0 is still reported
    if training_time is not None:
        print(f"Training Time: {training_time:.4f} seconds")
    if pred_time is not None:
        print(f"Prediction Time: {pred_time:.4f} seconds")

    # Return all metrics as a dictionary
    results = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Balanced Accuracy': balanced_acc,
        'Precision': precision,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'F1 Score': f1,
        'MCC': mcc,
        'Confusion Matrix': cm,
        'Training Time': training_time,
        'Prediction Time': pred_time
    }

    # The score-based metrics are only present when they were computed
    if auc_roc is not None:
        results['AUC-ROC'] = auc_roc
    if avg_precision is not None:
        results['AP'] = avg_precision

    return results

# Function to visualize model comparison
def _plot_time_panel(ax, comparison_df, column, title):
    """Draw one bar per model for a timing column, skipping models without a measurement."""
    time_df = comparison_df[['Model', column]].copy()
    # Missing timings arrive as None/NaN (or the 'N/A' placeholder); coerce them all
    # to NaN and drop them so only measured models are plotted.
    time_df[column] = pd.to_numeric(time_df[column], errors='coerce')
    time_df = time_df.dropna(subset=[column])

    if time_df.empty:
        ax.text(0.5, 0.5, f'No {column} data available',
                ha='center', va='center', fontsize=12, transform=ax.transAxes)
    else:
        sns.barplot(x='Model', y=column, data=time_df, ax=ax)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title(title)


def visualize_model_comparison(results, save_path=None):
    """Tabulate per-model metrics and draw a 2x2 comparison figure.

    The figure panels are: core classification metrics, training time,
    prediction time, and AUC-ROC / Average Precision.

    Args:
        results: Iterable of metric dicts, one per model. Each must contain
            'Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity'
            and 'F1 Score'; the remaining keys are optional and appear as
            'N/A' in the table when absent.
        save_path: Optional path; when truthy the figure is saved there.

    Returns:
        pandas.DataFrame with one row per model. It is empty (and nothing is
        plotted) when ``results`` is empty.
    """
    # Create comparison DataFrame
    comparison_df = pd.DataFrame([
        {'Model': r['Model'],
         'Accuracy': r['Accuracy'],
         'Balanced Accuracy': r.get('Balanced Accuracy', 'N/A'),
         'Precision': r['Precision'],
         'Sensitivity': r['Sensitivity'],
         'Specificity': r['Specificity'],
         'F1 Score': r['F1 Score'],
         'MCC': r.get('MCC', 'N/A'),
         'AUC-ROC': r.get('AUC-ROC', 'N/A'),
         'AP': r.get('AP', 'N/A'),
         'Training Time': r.get('Training Time', 'N/A'),
         'Prediction Time': r.get('Prediction Time', 'N/A')}
        for r in results
    ])

    print("\n--- Model Comparison ---")
    print(comparison_df)

    if comparison_df.empty:
        # An empty frame has no columns to select, so there is nothing to plot.
        print("No model results to visualize")
        return comparison_df

    fig, axes = plt.subplots(2, 2, figsize=(20, 12))
    metrics_ax, train_ax, pred_ax, adv_ax = axes.ravel()

    # Performance metrics comparison
    metrics_to_plot = ['Accuracy', 'Balanced Accuracy', 'Precision', 'Sensitivity',
                       'Specificity', 'F1 Score', 'MCC']

    metrics_df = comparison_df[['Model'] + metrics_to_plot].copy()
    for col in metrics_to_plot:
        metrics_df[col] = pd.to_numeric(metrics_df[col], errors='coerce')
    metrics_long = pd.melt(metrics_df, id_vars=['Model'], var_name='Metric', value_name='Value')

    sns.barplot(x='Model', y='Value', hue='Metric', data=metrics_long, ax=metrics_ax)
    metrics_ax.set_title('Model Performance Metrics Comparison')
    plt.setp(metrics_ax.get_xticklabels(), rotation=45, ha='right')
    # MCC lies in [-1, 1]; widen the axis when any value is negative so it is not clipped.
    lower_limit = -1 if (metrics_long['Value'] < 0).any() else 0
    metrics_ax.set_ylim(lower_limit, 1)
    metrics_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Training / prediction time comparison
    _plot_time_panel(train_ax, comparison_df, 'Training Time',
                     'Training Time Comparison (seconds)')
    _plot_time_panel(pred_ax, comparison_df, 'Prediction Time',
                     'Prediction Time Comparison (seconds)')

    # Advanced metrics comparison (AUC, AP)
    adv_metrics = ['AUC-ROC', 'AP']
    adv_df = comparison_df[['Model'] + adv_metrics].copy()

    # Convert to numeric, handling 'N/A' values
    for col in adv_metrics:
        adv_df[col] = pd.to_numeric(adv_df[col], errors='coerce')

    # Drop rows where all advanced metrics are NaN
    adv_df = adv_df.dropna(subset=adv_metrics, how='all')

    if not adv_df.empty:
        adv_long = pd.melt(adv_df, id_vars=['Model'], var_name='Metric', value_name='Value')
        sns.barplot(x='Model', y='Value', hue='Metric', data=adv_long, ax=adv_ax)
        adv_ax.set_title('Advanced Metrics Comparison (AUC-ROC & Average Precision)')
        plt.setp(adv_ax.get_xticklabels(), rotation=45, ha='right')
        adv_ax.set_ylim(0, 1)
    else:
        adv_ax.text(0.5, 0.5, 'No AUC-ROC or AP data available',
                    ha='center', va='center', fontsize=12, transform=adv_ax.transAxes)

    fig.tight_layout()

    if save_path:
        fig.savefig(save_path)
        print(f"Visualization saved to {save_path}")

    plt.close(fig)

    return comparison_df

# Main execution
def _predict_with_proba(model, X):
    """Return (hard labels, positive-class probabilities) from a fitted classifier."""
    return model.predict(X), model.predict_proba(X)[:, 1]


def _predict_thresholded(model, X):
    """Return (labels, scores) from a fitted regressor trained on 0/1 targets."""
    raw = model.predict(X)
    # Label as positive above 0.5; clip the raw output into [0, 1] for score-based metrics.
    return (raw > 0.5).astype(int), np.clip(raw, 0, 1)


def _fit_and_evaluate(model, model_name, X_train, y_train, X_test, y_test,
                      predict_fn=_predict_with_proba):
    """Fit ``model``, time fitting and prediction, and score it with ``evaluate_model``.

    Returns:
        (metrics dict from ``evaluate_model``, score array returned by ``predict_fn``).
    """
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    start_time = time.time()
    preds, scores = predict_fn(model, X_test)
    pred_time = time.time() - start_time

    metrics = evaluate_model(y_test, preds, scores, model_name,
                             training_time=training_time,
                             pred_time=pred_time)
    return metrics, scores


def _finite_or_none(value):
    """Return ``value`` as a float, or None when it is missing, non-numeric or NaN."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return None if np.isnan(number) else number


def _write_summary(summary_path, best_models, best_model, metrics_to_check, comparison_df):
    """Write the plain-text comparison report; ``best_model`` may be None."""
    with open(summary_path, 'w') as f:
        f.write("Income Prediction Model Comparison - Enhanced Evaluation\n")
        f.write("====================================================\n\n")

        # Write best models by each metric
        f.write("Best Models by Evaluation Metric:\n")
        f.write("--------------------------------\n")
        for metric, info in best_models.items():
            f.write(f"Best model based on {metric}: {info['Model']} with {metric} {info['Value']:.4f}\n")
        f.write("\n")

        # Write overall best model details
        if best_model is not None:
            f.write("OVERALL BEST MODEL DETAILS\n")
            f.write("-------------------------\n")
            f.write(f"Model: {best_model['Model']}\n")
            for metric in metrics_to_check:
                value = _finite_or_none(best_model.get(metric))
                if value is not None:
                    f.write(f"{metric}: {value:.4f}\n")

            # Only report timings that were actually measured (a missing one is NaN/None).
            training_time = _finite_or_none(best_model.get('Training Time'))
            if training_time is not None:
                f.write(f"Training Time: {training_time:.4f} seconds\n")
            prediction_time = _finite_or_none(best_model.get('Prediction Time'))
            if prediction_time is not None:
                f.write(f"Prediction Time: {prediction_time:.4f} seconds\n")
        f.write("\n")

        # Table of all models and their performance
        f.write("Complete Model Performance Comparison:\n")
        f.write("-----------------------------------\n")
        f.write(comparison_df.to_string(index=False))
        f.write("\n\n")

        # Add interpretation and recommendations
        f.write("Evaluation Metrics Explanation:\n")
        f.write("----------------------------\n")
        f.write("- Accuracy: Overall correctness (correct predictions / total predictions)\n")
        f.write("- Balanced Accuracy: Average of sensitivity and specificity (accounts for class imbalance)\n")
        f.write("- Precision: Positive predictive value (true positives / predicted positives)\n")
        f.write("- Sensitivity/Recall: True positive rate (true positives / actual positives)\n")
        f.write("- Specificity: True negative rate (true negatives / actual negatives)\n")
        f.write("- F1 Score: Harmonic mean of precision and recall (balances both metrics)\n")
        f.write("- MCC: Matthews Correlation Coefficient (balanced measure that works well with imbalanced data)\n")
        f.write("- AUC-ROC: Area Under the ROC Curve (discrimination ability at various thresholds)\n")
        f.write("- AP: Average Precision (area under precision-recall curve)\n")

        f.write("\n\nConclusion and Recommendations:\n")
        f.write("-----------------------------\n")
        if best_model is not None:
            f.write(f"The {best_model['Model']} model demonstrated the best overall performance for this income prediction task.\n")

            # Add specific recommendations based on the results
            f.write("\nKey observations:\n")
            f.write("1. Performance-speed tradeoff: Neural networks and tree-based models generally performed better but took longer to train.\n")
            f.write("2. Linear models like logistic regression offered reasonable performance with faster training times.\n")
            f.write("3. Consider the most important metric for your specific application when selecting a model.\n")
            f.write("4. For deployment in a production environment, consider both model accuracy and inference speed.\n")


# Main execution
def main(train_path="/content/drive/MyDrive/Colab Notebooks/train_preprocessed.csv",
         test_path="/content/drive/MyDrive/Colab Notebooks/test_preprocessed.csv",
         output_dir="/content/drive/MyDrive/Colab Notebooks"):
    """Train every model, evaluate each on the test split, and write the comparison artifacts.

    Steps: load data, train the PyTorch network, XGBoost (GPU with CPU fallback)
    and six scikit-learn estimators, then save the comparison figure, the
    results CSV, the precision-recall curves and a text summary into
    ``output_dir``.

    Args:
        train_path: CSV path passed to ``load_data`` as the training set.
        test_path: CSV path passed to ``load_data`` as the test set.
        output_dir: Directory that receives all generated files.
    """
    print("Loading and preprocessing data...")

    (X_train_processed, X_test_processed, y_train, y_test,
     X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
     preprocessor, input_size) = load_data(train_path, test_path)

    results = []
    # Positive-class scores keyed by model name, reused for the precision-recall curves.
    pred_scores = {}

    def record(metrics, scores):
        results.append(metrics)
        pred_scores[metrics['Model']] = scores

    # 1. Neural Network with PyTorch (GPU-accelerated)
    print("\nTraining Neural Network model with PyTorch on GPU...")
    nn_model, nn_preds, nn_pred_proba, nn_training_time = train_nn_model(
        X_train_tensor, y_train_tensor, X_test_tensor, input_size, batch_size=128, epochs=20
    )
    nn_results = evaluate_model(y_test, nn_preds, nn_pred_proba, "Neural Network (PyTorch)",
                                training_time=nn_training_time)
    record(nn_results, nn_pred_proba)

    # 2. XGBoost with GPU acceleration
    print("\nTraining XGBoost model with GPU acceleration...")
    xgb_params = dict(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        tree_method='hist',
        random_state=42,
    )
    try:
        # XGBoost 2.x deprecates tree_method='gpu_hist'; the GPU is selected
        # with tree_method='hist' plus device='cuda' instead.
        xgb_results, xgb_pred_proba = _fit_and_evaluate(
            xgb.XGBClassifier(device='cuda', **xgb_params), "XGBoost (GPU)",
            X_train_processed, y_train, X_test_processed, y_test)
    except Exception as e:
        print(f"Error with GPU-accelerated XGBoost: {e}")
        print("Falling back to CPU XGBoost")
        xgb_results, xgb_pred_proba = _fit_and_evaluate(
            xgb.XGBClassifier(**xgb_params), "XGBoost (CPU)",
            X_train_processed, y_train, X_test_processed, y_test)
    record(xgb_results, xgb_pred_proba)

    # 3-8. scikit-learn estimators: (banner, display name, estimator, prediction strategy)
    sklearn_models = [
        ("\nTraining Random Forest model...", "Random Forest",
         RandomForestClassifier(
             n_estimators=100,
             max_depth=10,
             min_samples_split=5,
             min_samples_leaf=2,
             n_jobs=-1,  # Use all CPU cores
             random_state=42),
         _predict_with_proba),
        ("\nTraining Logistic Regression model...", "Logistic Regression",
         LogisticRegression(
             C=1.0,
             max_iter=1000,
             n_jobs=-1,
             random_state=42),
         _predict_with_proba),
        # Lasso and Ridge are regressors: their continuous output is thresholded at 0.5.
        ("\nTraining Lasso Regression model...", "Lasso Regression",
         Lasso(
             alpha=0.1,  # Regularization strength
             max_iter=1000,
             random_state=42),
         _predict_thresholded),
        ("\nTraining Ridge Regression model...", "Ridge Regression",
         Ridge(
             alpha=1.0,  # Regularization strength
             max_iter=1000,
             random_state=42),
         _predict_thresholded),
        ("\nTraining CART Decision Tree model...", "CART Decision Tree",
         DecisionTreeClassifier(
             max_depth=10,
             min_samples_split=5,
             min_samples_leaf=2,
             random_state=42),
         _predict_with_proba),
        ("\nTraining SVM model...", "SVM",
         SVC(
             C=1.0,
             kernel='rbf',
             gamma='scale',
             probability=True,
             random_state=42),
         _predict_with_proba),
    ]
    for banner, model_name, model, predict_fn in sklearn_models:
        print(banner)
        metrics, scores = _fit_and_evaluate(
            model, model_name,
            X_train_processed, y_train, X_test_processed, y_test,
            predict_fn=predict_fn)
        record(metrics, scores)

    # Create comparison visualization and get DataFrame
    comparison_df = visualize_model_comparison(
        results, save_path=f"{output_dir}/enhanced_model_comparison.png")

    # Find the best models based on different metrics
    metrics_to_check = ['Accuracy', 'Balanced Accuracy', 'Precision', 'Sensitivity',
                        'Specificity', 'F1 Score', 'MCC']

    best_models = {}
    for metric in metrics_to_check:
        if metric not in comparison_df.columns:
            continue
        # Convert to numeric first in case there are 'N/A' values
        comparison_df[metric] = pd.to_numeric(comparison_df[metric], errors='coerce')
        if comparison_df[metric].isna().all():
            continue
        best_idx = comparison_df[metric].idxmax()
        best_models[metric] = {
            'Model': comparison_df.loc[best_idx, 'Model'],
            'Value': comparison_df.loc[best_idx, metric]
        }

    # Print the best models
    print("\n--- Best Models by Metric ---")
    for metric, info in best_models.items():
        print(f"Best model based on {metric}: {info['Model']} with {metric} {info['Value']:.4f}")

    # Determine overall best model (using F1 score as primary metric if available)
    overall_best = best_models.get('F1 Score', best_models.get('Accuracy'))
    best_model = None
    if overall_best:
        best_model = comparison_df.loc[comparison_df['Model'] == overall_best['Model']].iloc[0]

        print(f"\nOVERALL BEST MODEL: {best_model['Model']}")
        for metric in metrics_to_check:
            value = _finite_or_none(best_model.get(metric))
            if value is not None:
                print(f"  {metric}: {value:.4f}")

    # Save the comparison results
    comparison_df.to_csv(f"{output_dir}/enhanced_model_comparison_results.csv", index=False)

    # Create precision-recall curves for models with probability outputs
    fig, ax = plt.subplots(figsize=(10, 8))
    for result in results:
        model_name = result['Model']
        # 'AP' is only present when score-based metrics could be computed.
        if 'AP' not in result:
            continue
        try:
            precision, recall, _ = precision_recall_curve(y_test, pred_scores[model_name])
            ax.plot(recall, precision, label=f'{model_name} (AP={result["AP"]:.4f})')
        except Exception as e:
            print(f"Error creating PR curve for {model_name}: {e}")

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curves')
    ax.legend(loc='best')
    ax.grid(True)
    fig.savefig(f"{output_dir}/precision_recall_curves.png")
    plt.close(fig)

    # Write a comprehensive summary of results
    _write_summary(f"{output_dir}/enhanced_model_summary.txt",
                   best_models, best_model, metrics_to_check, comparison_df)
def plot_confusion_matrices(results, save_path='/content/drive/MyDrive/Colab Notebooks/confusion_matrices.png'):
    """
    Create a figure with confusion matrices for all models

    Each panel is a heatmap of one model's confusion matrix with the raw
    counts and, beneath each count, its share of all samples in percent.

    Args:
        results: Sequence of dicts providing 'Model' (panel title) and
            'Confusion Matrix' (2-D integer array).
        save_path: File path the figure is written to.

    Returns:
        None. Nothing is drawn or saved when ``results`` is empty.
    """
    n_models = len(results)
    if n_models == 0:
        # A grid with zero columns cannot be laid out (it would divide by zero below).
        print("No confusion matrices to plot")
        return

    # Determine grid size based on number of models (at most 4 panels per row)
    n_cols = min(4, n_models)
    n_rows = (n_models + n_cols - 1) // n_cols

    fig = plt.figure(figsize=(n_cols * 5, n_rows * 4))

    for panel_idx, result in enumerate(results):
        ax = fig.add_subplot(n_rows, n_cols, panel_idx + 1)

        cm = np.asarray(result['Confusion Matrix'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)

        # Calculate percentages; an all-zero matrix would otherwise divide by zero
        cm_sum = cm.sum()
        cm_percentages = cm / cm_sum * 100 if cm_sum else np.zeros(cm.shape)

        # Add the percentage below each count; use white text on dark cells
        half_max = cm.max() / 2
        for row in range(cm.shape[0]):
            for col in range(cm.shape[1]):
                text_color = 'white' if cm[row, col] > half_max else 'black'
                ax.text(col + 0.5, row + 0.7, f'({cm_percentages[row, col]:.1f}%)',
                        ha='center', va='center', color=text_color, fontsize=9)

        ax.set_title(result['Model'])
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

    # Lay out once, after every panel exists
    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)
    print(f"Confusion matrices saved to {save_path}")

def plot_feature_importance(X_train_processed, preprocessor, models, save_path='/content/drive/MyDrive/Colab Notebooks/feature_importance.png'):
    """
    Plot feature importance for models that support it

    Only entries of ``models`` keyed 'xgb_model', 'rf_model' or 'dt_model'
    are considered, and only those exposing ``feature_importances_`` (or
    ``coef_``) get a panel showing their top 20 features.

    Args:
        X_train_processed: Unused; kept so existing callers keep working.
        preprocessor: Fitted transformer whose ``transformers_[0][2]`` and
            ``transformers_[1][2]`` hold the numeric and categorical column
            names, with a 'cat' transformer containing an 'onehot' step.
        models: Mapping of variable name to fitted model.
        save_path: File path the figure is written to.

    Returns:
        None. Nothing is saved when no model provides importances.
    """
    # Models that support feature importance
    importance_models = {
        'xgb_model': 'XGBoost',
        'rf_model': 'Random Forest',
        'dt_model': 'Decision Tree'
    }

    available_models = {name: model for name, model in models.items() if name in importance_models}

    if len(available_models) == 0:
        print("No models with feature importance available")
        return

    # Get feature names
    numeric_features = preprocessor.transformers_[0][2]
    categorical_features = preprocessor.transformers_[1][2]

    # Get one-hot encoded feature names
    categorical_feature_names = []
    if len(categorical_features) > 0:
        try:
            categorical_feature_names = list(
                preprocessor.named_transformers_['cat']['onehot']
                .get_feature_names_out(categorical_features)
            )
        except Exception as exc:
            # Best effort: the length check below substitutes generic names,
            # but report the cause instead of hiding it.
            print(f"Warning: could not recover one-hot feature names: {exc}")

    all_feature_names = list(numeric_features) + list(categorical_feature_names)

    # Collect importances first so the grid contains no empty panels
    panels = []
    for model_var, model in available_models.items():
        if hasattr(model, 'feature_importances_'):
            importances = np.asarray(model.feature_importances_)
        elif hasattr(model, 'coef_'):
            importances = np.abs(np.asarray(model.coef_)[0])
        else:
            continue
        panels.append((importance_models[model_var], importances))

    if not panels:
        print("No models with feature importance available")
        return

    fig, axes = plt.subplots(len(panels), 1, figsize=(12, len(panels) * 6), squeeze=False)

    for ax, (label, importances) in zip(axes[:, 0], panels):
        # Fall back to generic names for this model only, so a mismatch on one
        # model does not discard the real names for the others.
        if len(importances) == len(all_feature_names):
            feature_names = all_feature_names
        else:
            feature_names = [f"Feature {idx}" for idx in range(len(importances))]

        # Sort features by importance and keep the top 20
        top_indices = np.argsort(importances)[-20:]

        ax.barh(range(len(top_indices)), importances[top_indices])
        ax.set_yticks(range(len(top_indices)))
        ax.set_yticklabels([feature_names[idx] for idx in top_indices])
        ax.set_title(f'Top Features - {label}')
        ax.set_xlabel('Importance')

    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)
    print(f"Feature importance plot saved to {save_path}")

# Run main() only when this code is executed directly (where __name__ is
# "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Using device: cuda
Loading and preprocessing data...
Numeric features: ['age', 'education-num', 'hours-per-week']
Categorical features: ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
X_train shape after preprocessing: (24147, 85)
X_test shape after preprocessing: (6015, 85)

Training Neural Network model with PyTorch on GPU...
Epoch [5/20], Loss: 0.3515
Epoch [10/20], Loss: 0.3431
Epoch [15/20], Loss: 0.3406
Epoch [20/20], Loss: 0.3374
Training completed in 16.44 seconds

--- Neural Network (PyTorch) Results ---
Confusion Matrix:
[[4072  456]
 [ 571  916]]
Accuracy: 0.8293
Balanced Accuracy: 0.7576
Precision: 0.6676
Sensitivity/Recall: 0.6160
Specificity: 0.8993
F1 Score: 0.6408
Matthews Correlation Coefficient: 0.5298
AUC-ROC: 0.8830
Average Precision: 0.6995
Training Time: 16.4428 seconds

Training XGBoost model with GPU acceleration...



    E.g. tree_method = "hist", device = "cuda"




--- XGBoost (GPU) Results ---
Confusion Matrix:
[[4147  381]
 [ 612  875]]
Accuracy: 0.8349
Balanced Accuracy: 0.7521
Precision: 0.6967
Sensitivity/Recall: 0.5884
Specificity: 0.9159
F1 Score: 0.6380
Matthews Correlation Coefficient: 0.5352
AUC-ROC: 0.8889
Average Precision: 0.7140
Training Time: 0.4348 seconds
Prediction Time: 0.0197 seconds

Training Random Forest model...

--- Random Forest Results ---
Confusion Matrix:
[[4260  268]
 [ 774  713]]
Accuracy: 0.8268
Balanced Accuracy: 0.7102
Precision: 0.7268
Sensitivity/Recall: 0.4795
Specificity: 0.9408
F1 Score: 0.5778
Matthews Correlation Coefficient: 0.4908
AUC-ROC: 0.8789
Average Precision: 0.6973
Training Time: 0.9476 seconds
Prediction Time: 0.0875 seconds

Training Logistic Regression model...

--- Logistic Regression Results ---
Confusion Matrix:
[[4136  392]
 [ 660  827]]
Accuracy: 0.8251
Balanced Accuracy: 0.7348
Precision: 0.6784
Sensitivity/Recall: 0.5562
Specificity: 0.9134
F1 Score: 0.6112
Matthews Correlation Coeffici