In [158]:
# ============================
# 1. Import Libraries and Load Data
# ============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, jaccard_score, log_loss


In [None]:
# ============================
# 2. Data Preprocessing
# ============================

# Opt in to pandas' future "no silent downcasting" behaviour. This is a
# session-wide option and is kept from the original cell.
pd.set_option('future.no_silent_downcasting', True)

# Load the dataset (ensure the path to your CSV is correct)
file_path = 'cbb.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Add a numeric flag 'windex': 1 when WAB is strictly greater than 7, else 0
# (a missing WAB compares as False and therefore maps to 0).
# The flag is built directly from the boolean comparison so the column gets an
# integer dtype; routing it through 'True'/'False' strings and .replace() left
# it as an object column once silent downcasting is disabled above.
df['windex'] = (df['WAB'] > 7).astype(int)

# Display the first few rows to check the changes
print(df.head())


             TEAM CONF   G   W  ADJOE  ADJDE  BARTHAG  EFG_O  EFG_D   TOR  \
0  North Carolina  ACC  40  33  123.3   94.9   0.9531   52.6   48.1  15.4   
1       Wisconsin  B10  40  36  129.1   93.6   0.9758   54.8   47.7  12.4   
2        Michigan  B10  40  33  114.4   90.4   0.9375   53.9   47.7  14.0   
3      Texas Tech  B12  38  31  115.2   85.2   0.9696   53.5   43.0  17.7   
4         Gonzaga  WCC  39  37  117.8   86.3   0.9728   56.6   41.1  16.2   

   ...  2P_O  2P_D  3P_O  3P_D  ADJ_T   WAB  POSTSEASON  SEED  YEAR  windex  
0  ...  53.9  44.6  32.7  36.2   71.7   8.6         2ND   1.0  2016       1  
1  ...  54.8  44.7  36.5  37.5   59.3  11.3         2ND   1.0  2015       1  
2  ...  54.7  46.8  35.2  33.2   65.9   6.9         2ND   3.0  2018       0  
3  ...  52.8  41.9  36.5  29.7   67.5   7.0         2ND   3.0  2019       0  
4  ...  56.3  40.0  38.2  29.0   71.5   7.7         2ND   1.0  2017       1  

[5 rows x 25 columns]


In [160]:
# ============================
# 3. Feature Selection
# ============================

# Subset of teams used for modelling.
# NOTE(review): `df1` was not defined in any cell visible here, so this cell
# only worked with leftover kernel state and fails on Restart & Run All.
# The filter below is a reconstruction: keeping the rows whose POSTSEASON code
# is S16, E8 or F4 is presumably what produced the 112 + 28 rows reported by
# the train/validation split — confirm against the intended subset.
postseason_rounds = ['F4', 'S16', 'E8']
df1 = df[df['POSTSEASON'].isin(postseason_rounds)]

# Predictor columns. 'windex' is the 0/1 flag derived from WAB in the
# preprocessing cell; 'POSTSEASON' is the label to predict.
features = ['G', 'W', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D',
            'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D',
            '3P_O', '3P_D', 'ADJ_T', 'WAB', 'SEED', 'windex']
X = df1[features]
y = df1['POSTSEASON']


In [161]:
# ============================
# 4. Normalize the Data
# ============================

# Standardize every feature column; X is rebound from the selected DataFrame
# columns to the scaled array returned by the transformer.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split in the next cell, so validation rows influence the scaling statistics.
X = preprocessing.StandardScaler().fit(X).transform(X)


In [162]:
# ============================
# 5. Train/Test Split
# ============================

# Hold out 20% of the rows for validation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

# Report the shape of each partition as a quick sanity check
print(f"Train set: {X_train.shape}, Validation set: {X_val.shape}")


Train set: (112, 21), Validation set: (28, 21)


In [163]:
# ============================
# 6. K-Nearest Neighbors (KNN) Model
# ============================

# Fit a 5-neighbour classifier on the training split, then label the
# validation rows with it.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_val)

# Validation metrics; F1 and Jaccard are averaged across classes weighted by
# class support.
acc_knn, f1_knn, jaccard_knn = (
    accuracy_score(y_val, y_pred_knn),
    f1_score(y_val, y_pred_knn, average='weighted'),
    jaccard_score(y_val, y_pred_knn, average='weighted'),
)

print(f"KNN Accuracy: {acc_knn}")
print(f"KNN F1-Score: {f1_knn}")
print(f"KNN Jaccard Index: {jaccard_knn}")


KNN Accuracy: 0.25
KNN F1-Score: 0.19890873015873017
KNN Jaccard Index: 0.11797026502908857


In [164]:
# ============================
# 7. Decision Tree Model
# ============================

# Decision tree limited to depth 4.
# random_state is fixed (same seed as the train/validation split) because the
# tree permutes features at every split: without a seed, ties between equally
# good splits can resolve differently and the metrics below change from run
# to run.
tree = DecisionTreeClassifier(max_depth=4, random_state=4)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_val)

# Validation metrics; F1 and Jaccard are averaged across classes weighted by
# class support.
acc_tree = accuracy_score(y_val, y_pred_tree)
f1_tree = f1_score(y_val, y_pred_tree, average='weighted')
jaccard_tree = jaccard_score(y_val, y_pred_tree, average='weighted')

print(f"Decision Tree Accuracy: {acc_tree}")
print(f"Decision Tree F1-Score: {f1_tree}")
print(f"Decision Tree Jaccard Index: {jaccard_tree}")


Decision Tree Accuracy: 0.4642857142857143
Decision Tree F1-Score: 0.32882882882882886
Decision Tree Jaccard Index: 0.23763736263736265


In [165]:
# ============================
# 8. Support Vector Machine (SVM) Model
# ============================

# Support vector classifier with an RBF kernel, trained on the training
# split and evaluated on the validation split.
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_val)

# Validation metrics; F1 and Jaccard are averaged across classes weighted by
# class support.
acc_svm, f1_svm, jaccard_svm = (
    accuracy_score(y_val, y_pred_svm),
    f1_score(y_val, y_pred_svm, average='weighted'),
    jaccard_score(y_val, y_pred_svm, average='weighted'),
)

print(f"SVM Accuracy: {acc_svm}")
print(f"SVM F1-Score: {f1_svm}")
print(f"SVM Jaccard Index: {jaccard_svm}")


SVM Accuracy: 0.39285714285714285
SVM F1-Score: 0.22161172161172163
SVM Jaccard Index: 0.15433673469387754


In [166]:
# ============================
# 9. Logistic Regression Model
# ============================

# Logistic regression with C=0.01 (C is the inverse regularization strength,
# so this is a heavily regularized fit) using the liblinear solver.
# NOTE(review): the target appears to be multiclass (weighted averaging is
# used below); newer scikit-learn releases reportedly deprecate liblinear for
# multiclass targets — confirm against the installed version.
lr = LogisticRegression(C=0.01, solver='liblinear')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)

# Validation metrics; F1 and Jaccard are averaged across classes weighted by
# class support.
acc_lr, f1_lr, jaccard_lr = (
    accuracy_score(y_val, y_pred_lr),
    f1_score(y_val, y_pred_lr, average='weighted'),
    jaccard_score(y_val, y_pred_lr, average='weighted'),
)

print(f"Logistic Regression Accuracy: {acc_lr}")
print(f"Logistic Regression F1-Score: {f1_lr}")
print(f"Logistic Regression Jaccard Index: {jaccard_lr}")


Logistic Regression Accuracy: 0.5714285714285714
Logistic Regression F1-Score: 0.5240575396825397
Logistic Regression Jaccard Index: 0.3670068027210884


In [167]:
# ============================
# 10. Summary of Results
# ============================

# Collect one report line per model (metrics computed in the cells above),
# framed by a heading and a closing rule, then emit them in order.
summary_lines = [
    "Summary of Model Metrics:",
    f"KNN Accuracy: {acc_knn}, F1-Score: {f1_knn}, Jaccard Index: {jaccard_knn}",
    f"Decision Tree Accuracy: {acc_tree}, F1-Score: {f1_tree}, Jaccard Index: {jaccard_tree}",
    f"SVM Accuracy: {acc_svm}, F1-Score: {f1_svm}, Jaccard Index: {jaccard_svm}",
    f"Logistic Regression Accuracy: {acc_lr}, F1-Score: {f1_lr}, Jaccard Index: {jaccard_lr}",
    "="*50,
]
for summary_line in summary_lines:
    print(summary_line)


Summary of Model Metrics:
KNN Accuracy: 0.25, F1-Score: 0.19890873015873017, Jaccard Index: 0.11797026502908857
Decision Tree Accuracy: 0.4642857142857143, F1-Score: 0.32882882882882886, Jaccard Index: 0.23763736263736265
SVM Accuracy: 0.39285714285714285, F1-Score: 0.22161172161172163, Jaccard Index: 0.15433673469387754
Logistic Regression Accuracy: 0.5714285714285714, F1-Score: 0.5240575396825397, Jaccard Index: 0.3670068027210884
