In [2]:
# AML Project 3 Notebook

In [3]:
# Gradient Boosting

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from scipy.stats import entropy

# Read and clean data set
df = pd.read_csv("cbb.csv")
df = df[df['YEAR'] != '2023']

# Convert non-numeric values in column "POSTSEASON"
convert = {"Champions": 1, "2ND": 2, "F4": 3, "E8": 4, "S16": 5, "R32": 6, "R64": 7, "R68": 8}
df['POSTSEASON'] = df['POSTSEASON'].map(convert)
df.fillna({'POSTSEASON': 9}, inplace = True)

# Define features and target variable
features = ['G', 'W', 'ADJDE', 'BARTHAG', 'TOR', 'TORD',
            'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D', '3P_O', '3P_D', 'ADJ_T', 'WAB', 'SEED']
target = 'POSTSEASON'

# Drop rows with missing values in the target variable
df.dropna(subset=[target], inplace=True)

# Split the data into features and target variable
X = df[features]
y = df[target]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Building the pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))  # Gradient Boosting classifier
])

# Training the model
pipeline.fit(X_train, y_train)

# Predicting probabilities and classes for the test data
test_data_df = pd.read_csv("cbb23.csv")

# Ensure consistency of features in test data
test_data_df = test_data_df[X_train.columns]

# Handle missing values in test data
test_data_df = pipeline.named_steps['imputer'].transform(test_data_df)

y_test_pred = pipeline.predict(test_data_df)
y_test_pred_prob = pipeline.predict_proba(test_data_df)

# Output classification report
predicted_classes = pipeline.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predicted_classes))

# Output confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_classes))

Classification Report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         4
         2.0       0.00      0.00      0.00         3
         3.0       0.33      0.38      0.35         8
         4.0       0.12      0.15      0.13        13
         5.0       0.37      0.24      0.29        29
         6.0       0.34      0.35      0.34        43
         7.0       0.68      0.75      0.71        93
         8.0       0.43      0.38      0.40         8
         9.0       1.00      1.00      1.00       856

    accuracy                           0.90      1057
   macro avg       0.36      0.36      0.36      1057
weighted avg       0.90      0.90      0.90      1057



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
# SVM Model

import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix

# Read Dataset
df = pd.read_csv('cbb.csv')

# Remove rows with year 2023
df = df[df['YEAR'] != 2023]

# Convert non-numeric values in column "POSTSEASON"
convert = {"Champions" : 1, "2ND" : 2, "F4" : 3, "E8" : 4, "S16" : 5, "R32" : 6, "R64" : 7, "R68" : 8}
df['POSTSEASON'] = df['POSTSEASON'].map(convert)
df.fillna({'POSTSEASON': 9}, inplace = True)

# Split dataframe into testing and training data
X = df[['G','W','ADJOE','ADJDE','BARTHAG','EFG_O','EFG_D','TOR','TORD','ORB','DRB','FTR','FTRD','2P_O','2P_D','3P_O',
        '3P_D','ADJ_T']]
y = df['POSTSEASON']
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.25, stratify=y)

# Train and fit SVM model        
svm_m = svm.SVC(kernel='rbf', C=100, probability = True, class_weight="balanced", gamma='scale', tol=0.0001)
svm_model = Pipeline ([
    ('scaler', RobustScaler()),
    ('svm', svm_m)
])
svm_model.fit(X_train, y_train)

# Accuracy of model using score and confusion matrix
print("Accuracy of train: ", svm_model.score(X_train, y_train))
print("Accuracy of test : ", svm_model.score(X_test, y_test))

y_pred = svm_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Read and convert test data
s23_df = pd.read_csv('cbb23.csv')
s23_df['POSTSEASON'] = s23_df['POSTSEASON'].map(convert)
s23_df.fillna({'POSTSEASON': 9}, inplace = True)
X_new = s23_df[['G','W','ADJOE','ADJDE','BARTHAG','EFG_O','EFG_D','TOR','TORD','ORB','DRB','FTR','FTRD','2P_O','2P_D','3P_O',
        '3P_D','ADJ_T']]
y_new = s23_df['POSTSEASON']

# Predict POSTSEASON ranking for test data                  
svm_predictions = svm_model.predict(X_new)
svm_probs= svm_predictions.astype(int)
teams = s23_df.loc[y_new.index, 'TEAM']
real_rankings = s23_df.loc[y_new.index, 'POSTSEASON'].astype(int)
svm_predictions_df = pd.DataFrame({'TEAM': teams, 'Predictions': svm_probs, 'Actual': real_rankings})
svm_predictions_df['Predictions'] = svm_predictions_df['Predictions']
svm_predictions_df.to_csv('svm_pred.csv', mode='w', index=False)

# Calculate accuracy and confusion matrix of predictions
accuracy_final_predictions = accuracy_score(y_new, svm_predictions)
print("\nAccuracy of 2023 predictions:", accuracy_final_predictions)
cm_final_predictions = confusion_matrix(y_new, svm_predictions)
print("Confusion Matrix for 2023 predictions:")
print(cm_final_predictions)


Accuracy of train:  0.9873417721518988
Accuracy of test :  0.7835443037974683
Confusion Matrix:
[[  0   0   1   0   0   1   0   0   0]
 [  0   1   0   0   1   0   0   0   0]
 [  1   1   1   0   0   2   0   0   0]
 [  0   0   0   4   3   2   0   0   0]
 [  0   0   0   2   3  12   0   0   1]
 [  0   0   0   3   7   9  12   1   4]
 [  0   0   1   0   4  18  19   0  30]
 [  0   0   0   0   0   0   2   0   7]
 [  0   0   1   1   2   8  38   5 582]]

Accuracy of 2023 predictions: 0.8292011019283747
Confusion Matrix for 2023 predictions:
[[  0   0   1   0   0   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0]
 [  0   0   1   0   0   1   0   0   0]
 [  0   0   0   1   3   0   0   0   0]
 [  0   0   0   2   3   2   0   0   1]
 [  0   0   0   1   2   8   1   0   4]
 [  0   0   0   0   1   4  15   1  11]
 [  0   0   0   0   0   0   0   0   4]
 [  0   0   0   0   3   4  12   3 273]]


In [5]:
# Neural Network Model