In [1]:
#!pip install hyperopt

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR



# hyperparameter tuning
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import torch.optim as optim

# Seed PyTorch's default random number generator so repeated runs start from the same state.
torch.manual_seed(42)

<torch._C.Generator at 0x1051c3a30>

In [3]:
import pandas as pd
import numpy as np
import os
import glob
import shap
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler
import matplotlib.cm as cm
from sklearn.metrics import silhouette_samples, silhouette_score, pairwise_distances
from sklearn import metrics
#from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

# When running on Colab, mount Google Drive and read data from it.
# The import lives here (rather than being commented out at the top) so the cell
# no longer raises NameError on `drive` when the notebook is run outside Colab.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: fall back to the current working directory.
    data_path = './'
else:
    drive.mount('/content/drive')
    data_path = '/content/drive/MyDrive/Data/'

In [4]:
# Load the model input table (one row per team per game) from the current working directory.
df = pd.read_csv('2013-24_model_input_cluster_update_v4.csv')

In [5]:
# Sanity check: pull every row belonging to one known game and print it.
specific_game_rows = df.loc[df['GAME_ID'].eq(21300023)]

print(specific_game_rows)

      SEASON_YEAR     TEAM_ID TEAM_ABBREVIATION_x      TEAM_NAME_x   GAME_ID  \
0         2013-14  1610612737                 ATL    Atlanta Hawks  21300023   
20786     2013-14  1610612761                 TOR  Toronto Raptors  21300023   

                 GAME_DATE      MATCHUP  WL  MIN_x  FGM  ...  cluster_1  \
0      2013-11-01T00:00:00  ATL vs. TOR   1   48.0   36  ...   0.041857   
20786  2013-11-01T00:00:00    TOR @ ATL   0   48.0   40  ...   0.104286   

       cluster_2  cluster_3  cluster_4  cluster_5  cluster_6  cluster_7  \
0       0.023857   0.062714   0.047429        0.0        0.0   0.065286   
20786   0.025429   0.042857   0.022571        0.0        0.0   0.024571   

       cluster_8  cluster_9  cluster_10  
0       0.014286   0.023857         0.0  
20786   0.036857   0.011286         0.0  

[2 rows x 109 columns]


In [6]:
# Base rolling stats only. The derived opp_/diff_ columns created below also end
# in '_rolling', so they are excluded to keep this cell safe to re-run.
rolling_stats = [
    col for col in df.columns
    if col.endswith('_rolling') and not col.startswith(('opp_', 'diff_'))
]

# create opponent columns and compute the difference for each rolling stat
for stat in rolling_stats:
    by_game = df.groupby('GAME_ID')[stat]
    # The opponent value is the other row of the same game: shift(-1) serves the
    # first row, shift(1) fills the NaN left on the second row (assumes two rows
    # per GAME_ID). Series.fillna replaces the deprecated SeriesGroupBy.fillna.
    df[f'opp_{stat}'] = by_game.shift(-1).fillna(by_game.shift(1))
    df[f'diff_{stat}'] = df[stat] - df[f'opp_{stat}']  # team's rolling stat minus opponent's

# NaN check
print(df.isnull().sum())

SEASON_YEAR                0
TEAM_ID                    0
TEAM_ABBREVIATION_x        0
TEAM_NAME_x                0
GAME_ID                    0
                          ..
diff_PACE_PER40_rolling    0
opp_POSS_rolling           0
diff_POSS_rolling          0
opp_PIE_rolling            0
diff_PIE_rolling           0
Length: 189, dtype: int64


In [None]:
## match up teams and create diff column
# Identify the base rolling-stat columns. Columns already prefixed with opp_/diff_
# also end in '_rolling'; without this filter the cell would derive columns from
# derived columns (opp_opp_*, diff_opp_*, opp_diff_*, diff_diff_*) once they exist.
rolling_stats = [
    col for col in df.columns
    if col.endswith('_rolling') and not col.startswith(('opp_', 'diff_'))
]

cluster_cols = [col for col in df.columns if col.startswith('cluster_')]

# Rolling stats first, then clusters, so new columns are appended in the same order as before.
for feature in rolling_stats + cluster_cols:
    by_game = df.groupby('GAME_ID')[feature]
    # Opponent value = the other row of the same game: the next row for the first
    # team, the previous row for the second (assumes two rows per GAME_ID).
    # Series.fillna is used because SeriesGroupBy.fillna is deprecated.
    df[f'opp_{feature}'] = by_game.shift(-1).fillna(by_game.shift(1))
    df[f'diff_{feature}'] = df[feature] - df[f'opp_{feature}']

In [None]:
# Inspect the column index after the opp_/diff_ columns have been added.
df.columns

In [None]:
# Start from the identifier / target / context columns, then append every
# differential feature (rolling-stat diffs and cluster diffs) in df's column order.
columns_to_keep = ['SEASON_YEAR', 'TEAM_ID', 'GAME_ID', 'WL', 'HOME_AWAY', 'win_percentage']
columns_to_keep.extend(
    col for col in df.columns
    if col.startswith('diff_cluster_') or (col.startswith('diff_') and col.endswith('_rolling'))
)
df_model = df.loc[:, columns_to_keep]

df_model.head()

In [None]:
# Same sanity check as earlier, now on the reduced modelling frame.
specific_game_rows = df_model.loc[df_model['GAME_ID'].eq(21300023)]

print(specific_game_rows)

In [None]:
# Full list of columns kept for modelling.
df_model.columns.tolist()

In [None]:
# Number of columns currently in the working frame.
len(df.columns)

In [None]:
# Columns to keep: identifiers / target / context, then one differential column
# per rolling stat, then one per cluster (0-9). The names are generated from the
# stat list so the 'diff_<STAT>_rolling' pattern is written only once.
columns_to_keep = (
    ['SEASON_YEAR', 'TEAM_ID', 'GAME_ID', 'WL', 'HOME_AWAY', 'win_percentage']
    + [
        f'diff_{stat}_rolling'
        for stat in (
            # box-score shooting
            'FGM', 'FGA', 'FG_PCT',
            'FG3M', 'FG3A', 'FG3_PCT',
            'FTM', 'FTA', 'FT_PCT',
            # rebounds and counting stats
            'OREB', 'DREB', 'REB',
            'AST', 'TOV', 'STL', 'BLK', 'BLKA',
            'PF', 'PFD', 'PTS', 'PLUS_MINUS',
            # ratings
            'E_OFF_RATING', 'OFF_RATING',
            'E_DEF_RATING', 'DEF_RATING',
            'E_NET_RATING', 'NET_RATING',
            # assist / turnover / shooting efficiency
            'AST_PCT', 'AST_TOV', 'AST_RATIO',
            'E_TM_TOV_PCT', 'TM_TOV_PCT',
            'EFG_PCT', 'TS_PCT', 'E_USG_PCT',
            # pace and possessions
            'E_PACE', 'PACE', 'PACE_PER40',
            'POSS', 'PIE',
        )
    ]
    + [f'diff_cluster_{cluster_id}' for cluster_id in range(10)]
)

df_model_cleaned = df_model.loc[:, columns_to_keep]

print(df_model_cleaned.head())

In [None]:
df_model_cleaned.columns

# Inferential Stats

In [None]:
from sklearn.feature_selection import f_classif

# f_classif expects a 1-D target. Selecting with ['WL'] rather than [['WL']] avoids
# passing a column-vector DataFrame, whose conversion warning is otherwise hidden
# by the notebook-wide warnings filter.
y = df_model_cleaned['WL']
# Numeric predictors only, without the identifier columns.
X = df_model_cleaned.drop('WL',axis=1).select_dtypes(include=['float64', 'int64']).drop(['TEAM_ID','GAME_ID'],axis = 1)
f_statistic, p_values = f_classif(X, y)

# One row per feature, strongest univariate association first.
results_df = pd.DataFrame({
    'Feature': X.columns,
    'F-Statistic': f_statistic,
    'p-Value': p_values
})
results_df = results_df.sort_values(by='F-Statistic', ascending=False)


fig, ax = plt.subplots(1, 1, figsize=(12, 10), dpi=300)

sns.barplot(x='F-Statistic', y='Feature', data=results_df, palette='viridis', ax=ax)
ax.set_title('Feature Importance (F-Statistic)')

plt.show()

# Training and Scaling

In [None]:
# Show only the columns that still contain missing values.
null_counts = df_model_cleaned.isna().sum()
print(null_counts.loc[null_counts.gt(0)])

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# How the validation set is carved out (previously both splits were computed and
# the temporal one was silently overwritten by the random one):
#   'grouped'  - random 80/20 split that keeps both rows of a game on the same side
#   'temporal' - hold out the final three seasons
SPLIT_STRATEGY = 'grouped'
VAL_SEASONS = ['2021-22', '2022-23', '2023-24']

# Kept for backward compatibility with cells that reference the unsplit frames.
X_train_val = df_model_cleaned.drop(columns=['SEASON_YEAR', 'WL'])
y_train_val = df_model_cleaned['WL']

if SPLIT_STRATEGY == 'temporal':
    is_val_season = df_model_cleaned['SEASON_YEAR'].isin(VAL_SEASONS)
    df_val = df_model_cleaned[is_val_season]
    df_train_val = df_model_cleaned[~is_val_season]
elif SPLIT_STRATEGY == 'grouped':
    # Each game contributes two rows whose diff_ features are each other's negation
    # and whose WL labels are complementary. A row-level random split lets one of
    # them land in training and the other in validation, leaking the answer.
    # Splitting by GAME_ID prevents that. No explicit stratification is applied;
    # if every game has one win row and one loss row (as in the sample game printed
    # earlier) both sides stay balanced anyway -- TODO confirm for the full table.
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_pos, val_pos = next(
        splitter.split(df_model_cleaned, groups=df_model_cleaned['GAME_ID'])
    )
    df_train_val = df_model_cleaned.iloc[train_pos]
    df_val = df_model_cleaned.iloc[val_pos]
else:
    raise ValueError(f"Unknown SPLIT_STRATEGY: {SPLIT_STRATEGY!r}")

X_train = df_train_val.drop(columns=['SEASON_YEAR', 'WL'])
y_train = df_train_val['WL']

X_val = df_val.drop(columns=['SEASON_YEAR', 'WL'])
y_val = df_val['WL']

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

In [None]:
# Set the identifier columns aside (they are needed later to label predictions),
# then remove them from the feature matrices.
X_train_identifiers = X_train.loc[:, ['TEAM_ID', 'GAME_ID']]
X_val_identifiers = X_val.loc[:, ['TEAM_ID', 'GAME_ID']]

X_train = X_train.drop(['TEAM_ID', 'GAME_ID'], axis=1)
X_val = X_val.drop(['TEAM_ID', 'GAME_ID'], axis=1)

In [None]:
# Fit the scaler on the training features only, then apply it to both sets.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Wrap the scaled arrays back into frames so the feature names are preserved.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(data=X_val_scaled, columns=X_val.columns)

In [None]:
# Distribution of the scaled points differential in the training set.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(X_train_scaled_df['diff_PTS_rolling'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of PTS_rolling Diff After Robust Scaling')
plt.show()

In [None]:
# Distribution of the scaled offensive-rating differential in the training set.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(X_train_scaled_df['diff_OFF_RATING_rolling'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of OFF_RATING_rolling diff After Robust Scaling')
plt.show()

# Model

## Logistic Regression

In [None]:
# Rebuild the scaled feature frames (same construction as in the scaling section).
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=list(X_train.columns))
X_val_scaled_df = pd.DataFrame(data=X_val_scaled, columns=list(X_val.columns))

In [None]:
# Baseline linear classifier trained on the scaled differentials.
log_reg = LogisticRegression(random_state=42, max_iter=1000)

log_reg.fit(X=X_train_scaled_df, y=y_train)

In [None]:
# Score the logistic regression on the validation set.
y_val_pred = log_reg.predict(X_val_scaled_df)

val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

print("Confusion Matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")

print("Classification Report:", classification_report(y_val, y_val_pred), sep="\n")

## XGBoost

In [None]:
import xgboost as xgb

# Wrap the scaled features and labels in DMatrix, XGBoost's own data container.
dtrain = xgb.DMatrix(data=X_train_scaled_df, label=y_train)
dval = xgb.DMatrix(data=X_val_scaled_df, label=y_val)

params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    eta=0.1,  # learning rate
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Both sets are monitored; early stopping is given a patience of 10 rounds.
evals = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=evals,
    early_stopping_rounds=10,
)

# Threshold the predicted probabilities at 0.5 to get class labels.
y_val_pred = (xgb_model.predict(dval) > 0.5).astype(int)

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
for report in (confusion_matrix, classification_report):
    print(report(y_val, y_val_pred))

In [None]:
# Grid of XGBoost settings to cross-validate (currently a single combination).
param_grid = dict(
    max_depth=[5],
    learning_rate=[0.01],
    n_estimators=[300],
    subsample=[0.7],
    colsample_bytree=[0.8],
)

xgb_clf = xgb.XGBClassifier()
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled_df, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Validation Accuracy:", grid_search.best_score_)

## Neural Net

In [None]:
# Prepare data for the network: float32 feature tensors, and float32 label
# tensors with a trailing axis of size 1 added.
X_train_tensor = torch.tensor(X_train_scaled_df.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)[:, None]
X_val_tensor = torch.tensor(X_val_scaled_df.to_numpy(), dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)[:, None]

# Pair features with labels
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Batch iterators: training batches are shuffled, validation batches keep row order
batch_size = 128
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
device = torch.device('cpu')


# Define the main neural network model
class WinLossNet(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    Architecture: four hidden Linear layers, each followed by LeakyReLU, then a
    single dropout, then a Linear layer with one output. No sigmoid is applied
    here, so pair the output with ``nn.BCEWithLogitsLoss`` or apply
    ``torch.sigmoid`` to obtain probabilities.

    Args:
        layer_sizes: Sequence whose first four entries are the widths of the
            hidden layers.
        drop_rate: Dropout probability applied after the fourth hidden layer.
        input_dim: Number of input features. Defaults to 52, the width that was
            previously hard-coded, so existing calls are unaffected.
    """

    def __init__(self, layer_sizes, drop_rate=0.2284, input_dim=52):
        super().__init__()

        # main Layers (attribute names and creation order are kept so saved
        # state_dicts and seeded initialisation stay compatible)
        self.fc1 = nn.Linear(input_dim, layer_sizes[0])
        self.fc2 = nn.Linear(layer_sizes[0], layer_sizes[1])
        self.dropout = nn.Dropout(p=drop_rate)
        self.fc3 = nn.Linear(layer_sizes[1], layer_sizes[2])
        self.fc4 = nn.Linear(layer_sizes[2], layer_sizes[3])
        self.fc5 = nn.Linear(layer_sizes[3], 1)
        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` logits."""
        # main
        x = self.leaky_relu(self.fc1(x))
        x = self.leaky_relu(self.fc2(x))
        x = self.leaky_relu(self.fc3(x))
        x = self.leaky_relu(self.fc4(x))
        # Dropout is applied only once, just before the output layer.
        x = self.dropout(x)
        x = self.fc5(x)
        return x

# Build the network with the hand-tuned hidden-layer widths and place it on the device
model = WinLossNet(layer_sizes=[512, 256, 128, 64]).to(device)

# Loss and optimizer. BCEWithLogitsLoss applies the sigmoid internally, so the
# model is expected to emit raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=0.0073022, lr=0.006774)

# Per-epoch metric histories
train_loss_history, val_loss_history = [], []
train_acc_history, val_acc_history = [], []
learning_rates = []

# knobs
epochs = 20
# Learning-rate schedule: shrink the rate when the monitored value stops improving.
# (StepLR(optimizer, step_size=17, gamma=.95) was an alternative that was tried.)
scheduler = ReduceLROnPlateau(optimizer, factor=0.25983, patience=6, cooldown=0)

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    running_loss, correct_preds, total_preds = 0.0, 0, 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate batch loss and the number of correct thresholded predictions
        running_loss += loss.item()
        predicted = (torch.sigmoid(outputs) > 0.5).float()
        total_preds += labels.size(0)
        correct_preds += predicted.eq(labels).sum().item()

    # Training stats for this epoch (loss is averaged over batches)
    epoch_train_loss = running_loss / len(train_loader)
    epoch_train_acc = correct_preds / total_preds
    train_loss_history.append(epoch_train_loss)
    train_acc_history.append(epoch_train_acc)

    # ---- validation pass ----
    model.eval()
    val_loss, correct_val_preds, total_val_preds = 0.0, 0, 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            total_val_preds += labels.size(0)
            correct_val_preds += predicted.eq(labels).sum().item()

    # Validation stats for this epoch
    epoch_val_loss = val_loss / len(val_loader)
    epoch_val_acc = correct_val_preds / total_val_preds
    val_loss_history.append(epoch_val_loss)
    val_acc_history.append(epoch_val_acc)

    # The first epoch is not reported
    if epoch >= 1:
        print(f'Epoch [{epoch + 1}/{epochs}], '
              f'Training Loss: {epoch_train_loss:.4f}, Training Accuracy: {epoch_train_acc:.4f},'
              f'Validation Loss: {epoch_val_loss:.4f}, Validation Accuracy: {epoch_val_acc:.4f}')

    # Let the scheduler react to the validation loss, then record the resulting
    # learning rate of every parameter group
    scheduler.step(epoch_val_loss)
    learning_rates.extend(group['lr'] for group in optimizer.param_groups)

# Accuracy curves for the training and validation sets
epoch_axis = np.arange(1, epochs + 1)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epoch_axis, train_acc_history, label='Training Accuracy', marker='o')
ax.plot(epoch_axis, val_acc_history, label='Validation Accuracy', marker='o')
ax.set_title('Training and Testing Accuracy Over Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.legend()
ax.grid(True)
plt.show()


# Learning rate recorded after each scheduler step
fig, ax = plt.subplots()
ax.plot(learning_rates)
ax.set_title('Learning Rate Over Epochs')
ax.set_xlabel('Iterations')
ax.set_ylabel('Learning Rate')
ax.grid(True)
plt.show()

In [None]:
# Persist only the trained weights (the state_dict) to 'model.pth' in the working directory.
torch.save(model.state_dict(), 'model.pth')


# To restore the weights later, build a WinLossNet with the same layer sizes and run:
#model.load_state_dict(torch.load('model.pth'))

In [None]:
def _evaluate(model, loader, criterion):
    """Return (mean batch loss, accuracy) of `model` over `loader`, with gradients disabled."""
    model.eval()
    correct, total, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss_sum += criterion(outputs, labels).item()
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return loss_sum / len(loader), correct / total


# Define the objective function for hyperparameter search, close to hand tuning design above
def objective(params):
    """Train one WinLossNet with the sampled hyperparameters and score it.

    Args:
        params: dict with keys 'batch_size', 'layer_sizes', 'drop_rate', 'lr',
            'weight_decay', 'patience', 'factor' and 'epochs'.

    Returns:
        dict with 'loss' set to the negated validation accuracy after the last
        epoch (hyperopt minimises it) and 'status' set to STATUS_OK.
    """
    batch_size = params['batch_size']
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = WinLossNet(layer_sizes=params['layer_sizes'],drop_rate=params['drop_rate']).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=params['lr'],weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=params['patience'], factor= params['factor'])

    accuracy = None
    for epoch in range(params['epochs']):
        # Training pass; train mode is re-enabled every epoch because the
        # validation pass below switches the model to eval mode.
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Step the scheduler once per epoch on the validation loss. Previously it
        # was stepped a single time after all training had finished, so the
        # sampled 'patience' and 'factor' never influenced the trained model.
        avg_val_loss, accuracy = _evaluate(model, val_loader, criterion)
        scheduler.step(avg_val_loss)

    if accuracy is None:
        # No epochs were run: score the untrained model.
        _, accuracy = _evaluate(model, val_loader, criterion)

    return {'loss': -accuracy, 'status': STATUS_OK}

from hyperopt import space_eval

# Define the search space
space = {
    'layer_sizes': hp.choice('layer_sizes', [
        [256, 174, 96, 48],
        [256, 128, 96, 32],
        [256, 128, 64, 32],
        [324, 256, 145, 64],
        [512, 256, 128, 64],

    ]),
    'drop_rate': hp.uniform('drop_rate', 0.01, 0.7),
    # loguniform bounds are given in log space: exp(-5) .. exp(-1)
    'lr': hp.loguniform('lr', -5, -1),
    'epochs': hp.choice('epochs', [10,15, 20, 25,30]),
    'patience': hp.choice('patience', [2,3,5,6,7,8,9]),
    'factor' : hp.uniform('factor', 0.1,0.7),
    'weight_decay' : hp.loguniform('weight_decay',-5,-1),
    'batch_size' : hp.choice('batch_size', [16,32,64,128])
}


# Begin the hunt
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=400, trials=trials)

# fmin reports every hp.choice parameter as the *index* of the selected option
# (e.g. 'batch_size': 2), not its value. space_eval maps the result back onto the
# search space so the printed configuration can be used directly.
best_params = space_eval(space, best)
print(best_params)


In [None]:
 model.eval()

true_values = []
predicted_values = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = torch.sigmoid(model(inputs))
        true_values.append(labels.cpu().numpy())
        predicted_values.append(outputs.cpu().numpy())

true_values = np.concatenate(true_values)
predicted_values = np.concatenate(predicted_values)

results_df = pd.DataFrame({'True_Values':true_values.flatten(),
'pred_Values' : predicted_values.flatten()})
results_df['pred_Values_B'] = (results_df['pred_Values'] > 0.5).astype(int)
results_df = results_df.merge(X_val_identifiers.reset_index().drop('index',axis = 1), left_index=True, right_index=True)
results_df = results_df.merge(X_val.reset_index().drop('index',axis = 1),left_index=True, right_index=True)


In [None]:
# 1 where the thresholded prediction matches the true label, 0 otherwise
results_df['correct_prediction'] = results_df['True_Values'].eq(results_df['pred_Values_B']).astype(int)
results_df

In [None]:
# Alias (not a copy): final_results and results_df refer to the same DataFrame here.
final_results = results_df

In [None]:
# Largest 4-character GAME_ID suffix (compared as strings)
final_results['GAME_ID'].astype(str).str[-4:].max()

In [None]:
final_results = results_df.copy()
game_id_text = final_results['GAME_ID'].astype(str)
final_results['season'] = game_id_text.str[:3]  # First three characters of GAME_ID
final_results['game_number'] = game_id_text.str[-4:].astype(int)  # Last four digits of GAME_ID

final_results['correct_prediction'] = (final_results['True_Values'] == final_results['pred_Values_B']).astype(int)
final_results['incorrect_prediction'] = (final_results['True_Values'] != final_results['pred_Values_B']).astype(int)

num_bins = 25
bins = np.linspace(1, 1303, num_bins + 1)  # 1303 = max number of games in a season
final_results['binned_game_number'] = pd.cut(final_results['game_number'], bins=bins, include_lowest=True)

# observed=False is passed explicitly: the grouping key is categorical, and empty
# bins must stay in the result so bar positions keep lining up with the bins.
# Relying on the implicit default is deprecated in recent pandas.
binned_summary = final_results.groupby('binned_game_number', observed=False).agg(
    correct_predictions=('correct_prediction', 'sum'),
    incorrect_predictions=('incorrect_prediction', 'sum')
).reset_index()

# Side-by-side bars of correct vs. incorrect predictions per game-number bin

plt.figure(figsize=(12, 6))
bar_width = 0.35
x = np.arange(len(binned_summary))

plt.bar(x - bar_width/2, binned_summary['correct_predictions'], width=bar_width, label='Correct Predictions', color='skyblue')
plt.bar(x + bar_width/2, binned_summary['incorrect_predictions'], width=bar_width, label='Incorrect Predictions', color='salmon')

plt.title('Predictions Throughout the Season (Binned)')
plt.xlabel('Game Bins (one bin per week of regular season)')
plt.ylabel('Predictions')

plt.legend()
plt.tight_layout()
plt.grid(axis='y')
plt.show()


## Playing with SHAP

In [None]:
# Load SHAP's JavaScript so interactive plots render in the notebook
shap.initjs()

# Explain the trained XGBoost booster on the scaled validation features
explainer = shap.Explainer(xgb_model)
shap_values = explainer.shap_values(X_val_scaled_df)

# Force plot for the first validation row
shap.force_plot(explainer.expected_value, shap_values[0], X_val_scaled_df.iloc[0])

In [None]:
# Summary plot of the SHAP values over all validation rows and features.
shap.summary_plot(shap_values, X_val_scaled_df)

## Test: Model Blending

In [None]:
# Positive-class probabilities from each of the three models
log_reg_pred_proba = log_reg.predict_proba(X_val_scaled_df)[:, 1]

xgb_val_pred_proba = xgb_model.predict(dval)

model.eval()
batch_probs = []
with torch.no_grad():
    for inputs, _ in val_loader:
        batch_probs.append(torch.sigmoid(model(inputs.to(device))).cpu().numpy())
nn_pred_proba = np.concatenate(batch_probs).flatten()

# Unweighted average of the three probabilities, thresholded at 0.5
blended_proba = (log_reg_pred_proba + xgb_val_pred_proba + nn_pred_proba) / 3

blended_pred = (blended_proba >= 0.5).astype(int)

blended_accuracy = accuracy_score(y_true=y_val, y_pred=blended_pred)
print(f"Blended Validation Accuracy: {blended_accuracy:.4f}")

print("Blended Confusion Matrix:", confusion_matrix(y_val, blended_pred), sep="\n")

print("Blended Classification Report:", classification_report(y_val, blended_pred), sep="\n")

## AUC for models

In [None]:
# Logistic regression
log_reg_pred_proba = log_reg.predict_proba(X_val_scaled_df)[:, 1]
log_reg_auc = roc_auc_score(y_true=y_val, y_score=log_reg_pred_proba)
print(f"Logistic Regression AUC: {log_reg_auc:.4f}")

# XGBoost
xgb_val_pred_proba = xgb_model.predict(dval)
xgb_auc = roc_auc_score(y_true=y_val, y_score=xgb_val_pred_proba)
print(f"XGBoost AUC: {xgb_auc:.4f}")

# Neural network: gather sigmoid outputs batch by batch, then flatten to 1-D
batch_probs = []
with torch.no_grad():
    for inputs, _ in val_loader:
        batch_probs.append(torch.sigmoid(model(inputs.to(device))).cpu().numpy())
nn_pred_proba = np.concatenate(batch_probs).flatten()
nn_auc = roc_auc_score(y_true=y_val, y_score=nn_pred_proba)
print(f"Neural Network AUC: {nn_auc:.4f}")

# Blend: unweighted average of the three probabilities
blended_proba = (log_reg_pred_proba + xgb_val_pred_proba + nn_pred_proba) / 3
blended_auc = roc_auc_score(y_true=y_val, y_score=blended_proba)
print(f"Blended Model AUC: {blended_auc:.4f}")

In [None]:
# ROC points and area for each model
log_reg_fpr, log_reg_tpr, _ = roc_curve(y_val, log_reg_pred_proba)
log_reg_auc = auc(log_reg_fpr, log_reg_tpr)

xgb_fpr, xgb_tpr, _ = roc_curve(y_val, xgb_val_pred_proba)
xgb_auc = auc(xgb_fpr, xgb_tpr)

nn_fpr, nn_tpr, _ = roc_curve(y_val, nn_pred_proba)
nn_auc = auc(nn_fpr, nn_tpr)

blended_fpr, blended_tpr, _ = roc_curve(y_val, blended_proba)
blended_auc = auc(blended_fpr, blended_tpr)

fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# One panel per model: (axes, display name, fpr, tpr, area, line colour)
roc_panels = [
    (axs[0, 0], 'Logistic Regression', log_reg_fpr, log_reg_tpr, log_reg_auc, 'blue'),
    (axs[0, 1], 'XGBoost', xgb_fpr, xgb_tpr, xgb_auc, 'green'),
    (axs[1, 0], 'Neural Network', nn_fpr, nn_tpr, nn_auc, 'orange'),
    (axs[1, 1], 'Blended Model', blended_fpr, blended_tpr, blended_auc, 'red'),
]

for panel, model_name, fpr, tpr, area, line_color in roc_panels:
    panel.plot(fpr, tpr, color=line_color, label=f'{model_name} (AUC = {area:.4f})')
    panel.plot([0, 1], [0, 1], 'k--', label='Random Guessing (AUC = 0.5)')
    panel.set_title(f'{model_name} ROC Curve')
    panel.set_xlabel('False Positive Rate')
    panel.set_ylabel('True Positive Rate')
    panel.legend(loc='lower right')
    panel.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# ROC points and area for each model
log_reg_fpr, log_reg_tpr, _ = roc_curve(y_val, log_reg_pred_proba)
log_reg_auc = auc(log_reg_fpr, log_reg_tpr)

xgb_fpr, xgb_tpr, _ = roc_curve(y_val, xgb_val_pred_proba)
xgb_auc = auc(xgb_fpr, xgb_tpr)

nn_fpr, nn_tpr, _ = roc_curve(y_val, nn_pred_proba)
nn_auc = auc(nn_fpr, nn_tpr)

blended_fpr, blended_tpr, _ = roc_curve(y_val, blended_proba)
blended_auc = auc(blended_fpr, blended_tpr)

# All four curves on shared axes: (display name, fpr, tpr, area, line colour)
roc_curves = [
    ('Logistic Regression', log_reg_fpr, log_reg_tpr, log_reg_auc, 'blue'),
    ('XGBoost', xgb_fpr, xgb_tpr, xgb_auc, 'green'),
    ('Neural Network', nn_fpr, nn_tpr, nn_auc, 'orange'),
    ('Blended Model', blended_fpr, blended_tpr, blended_auc, 'red'),
]

plt.figure(figsize=(10, 8))
for model_name, fpr, tpr, area, line_color in roc_curves:
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {area:.4f})', color=line_color)

# 50% random guessing line
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing (AUC = 0.5)')

plt.title('ROC Curves for Models')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()