In [9]:
# AML Project 3 Notebook

In [10]:
# Grandient Boosted Model

In [11]:
# SVM Model

import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix

# Read Dataset
df = pd.read_csv('cbb.csv')

# Remove rows with year 2023
df = df[df['YEAR'] != 2023]

# Convert non-numeric values in column "POSTSEASON"
convert = {"Champions" : 1, "2ND" : 2, "F4" : 3, "E8" : 4, "S16" : 5, "R32" : 6, "R64" : 7, "R68" : 8}
df['POSTSEASON'] = df['POSTSEASON'].map(convert)
df.fillna({'POSTSEASON': 9}, inplace = True)

# Split dataframe into testing and training data
X = df[['G','W','ADJOE','ADJDE','BARTHAG','EFG_O','EFG_D','TOR','TORD','ORB','DRB','FTR','FTRD','2P_O','2P_D','3P_O',
        '3P_D','ADJ_T']]
y = df['POSTSEASON']
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.25, stratify=y)

# Train and fit SVM model        
svm_m = svm.SVC(kernel='rbf', C=100, probability = True, class_weight="balanced", gamma='scale', tol=0.0001)
svm_model = Pipeline ([
    ('scaler', RobustScaler()),
    ('svm', svm_m)
])
svm_model.fit(X_train, y_train)

# Accuracy of model using score and confusion matrix
print("Accuracy of train: ", svm_model.score(X_train, y_train))
print("Accuracy of test : ", svm_model.score(X_test, y_test))

y_pred = svm_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Read and convert test data
s23_df = pd.read_csv('cbb23.csv')
s23_df['POSTSEASON'] = s23_df['POSTSEASON'].map(convert)
s23_df.fillna({'POSTSEASON': 9}, inplace = True)
X_new = s23_df[['G','W','ADJOE','ADJDE','BARTHAG','EFG_O','EFG_D','TOR','TORD','ORB','DRB','FTR','FTRD','2P_O','2P_D','3P_O',
        '3P_D','ADJ_T']]
y_new = s23_df['POSTSEASON']

# Predict POSTSEASON ranking for test data                  
svm_predictions = svm_model.predict(X_new)
svm_probs= svm_predictions.astype(int)
teams = s23_df.loc[y_new.index, 'TEAM']
real_rankings = s23_df.loc[y_new.index, 'POSTSEASON'].astype(int)
svm_predictions_df = pd.DataFrame({'TEAM': teams, 'Predictions': svm_probs, 'Actual': real_rankings})
svm_predictions_df['Predictions'] = svm_predictions_df['Predictions']
svm_predictions_df.to_csv('svm_pred.csv', mode='w', index=False)

# Calculate accuracy and confusion matrix of predictions
accuracy_final_predictions = accuracy_score(y_new, svm_predictions)
print("\nAccuracy of 2023 predictions:", accuracy_final_predictions)
cm_final_predictions = confusion_matrix(y_new, svm_predictions)
print("Confusion Matrix for 2023 predictions:")
print(cm_final_predictions)


Accuracy of train:  0.9843881856540084
Accuracy of test :  0.8075949367088607
Confusion Matrix:
[[  1   0   1   0   0   0   0   0   0]
 [  0   1   0   0   0   1   0   0   0]
 [  1   2   0   0   1   1   0   0   0]
 [  0   0   0   1   5   0   1   0   2]
 [  0   0   0   4   3  10   1   0   0]
 [  0   0   0   5   4   9  14   1   3]
 [  0   0   0   2   3   8  31   2  26]
 [  0   0   0   0   0   0   2   1   6]
 [  0   0   0   0   0  13  29   4 591]]

Accuracy of 2023 predictions: 0.8181818181818182
Confusion Matrix for 2023 predictions:
[[  0   0   1   0   0   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0]
 [  0   1   0   0   0   0   1   0   0]
 [  0   0   0   1   2   1   0   0   0]
 [  0   0   0   3   3   0   1   0   1]
 [  0   0   0   0   1   5   6   0   4]
 [  0   0   0   1   1   5  15   0  10]
 [  0   0   0   0   0   0   0   1   3]
 [  0   0   1   0   1   4  16   1 272]]


In [12]:
# Neural Network Model