In [None]:
import pandas as pd
import numpy as np
import random
import scipy

import matplotlib.pyplot as plt
from matplotlib import pyplot

import sklearn
from sklearn.metrics import plot_confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_curve, f1_score, precision_score, recall_score


In [None]:
# Load the prepared game-results table; the first CSV column is used as the row index.
# NOTE(review): this is an absolute path on one machine -- the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
GAME_RESULTS_CSV = '/Users/harry/CP/Study/MachineLearning/NBA-prediction/base_data/gameResults2015-20withMisPer100Player_for_use.csv'

df = pd.read_csv(GAME_RESULTS_CSV, index_col=0)

In [None]:
# Prepare the data: split features/target, hold out 20% for testing, and
# standardise the features.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 'result' is the target column; every other column is used as a feature.
X = df.drop('result', axis=1)
y = df.result

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sc = StandardScaler()

# Learn the mean/variance on the training split only, then apply that same
# transformation to the test split. Calling fit_transform on the test split
# would re-fit the scaler on test data, so train and test would be scaled with
# different statistics and test-set information would leak into preprocessing.
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Train and test stacked back together (train rows first), used to score a
# model on all available rows.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

## KNN

In [None]:
# Train a 10-nearest-neighbours classifier on the standardised training split.
from sklearn.neighbors import KNeighborsClassifier

N_NEIGHBORS = 10
neigh = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
neigh.fit(X_train_std, y_train)

In [None]:
# KNN performance: accuracy on train and test, then precision / recall / F1 on test.
for features, labels in ((X_train_std, y_train), (X_test_std, y_test)):
    print(neigh.score(features, labels))

y_pred = neigh.predict(X_test_std)
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# KNN confusion matrices on the test split: raw counts first, then normalised
# over the true labels (normalize='true').
for norm_mode in (None, 'true'):
    plot_confusion_matrix(neigh, X_test_std, y_test, normalize=norm_mode, cmap=plt.cm.Blues)
plt.show()

In [None]:
# ROC curve and AUC for the KNN classifier, compared with a constant-score baseline.

# Column 1 of predict_proba: the probability of the second class in neigh.classes_.
neigh_probs = neigh.predict_proba(X_test_std)[:, 1]

# "No skill" baseline: the same score (0) for every test sample.
ns_probs = [0] * len(y_test)

# Area under each ROC curve.
ns_auc = roc_auc_score(y_test, ns_probs)
neigh_auc = roc_auc_score(y_test, neigh_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('KNN: ROC AUC=%.3f' % (neigh_auc))

# Points of each ROC curve (the thresholds are not needed).
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
neigh_fpr, neigh_tpr, _ = roc_curve(y_test, neigh_probs)

# Baseline as a dashed line, the model with point markers.
pyplot.plot(ns_fpr, ns_tpr, label='No Skill', linestyle='--')
pyplot.plot(neigh_fpr, neigh_tpr, label='KNN', marker='.')

pyplot.title('ROC curve of KNN')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend(loc='lower right')
pyplot.show()

## Logistic Regression

In [None]:
# Train a logistic-regression classifier on the standardised training split.
from sklearn.linear_model import LogisticRegression

lr_options = dict(random_state=33, max_iter=1000)
lr = LogisticRegression(**lr_options)
lr.fit(X_train_std, y_train)

In [None]:
# Logistic-regression performance: accuracy on train, test and the stacked
# train+test data, then precision / recall / F1 on the test split.
scoring_sets = (
    (X_train_std, y_train),
    (X_test_std, y_test),
    (X_combined_std, y_combined),
)
for features, labels in scoring_sets:
    print(lr.score(features, labels))

y_pred = lr.predict(X_test_std)
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# Logistic-regression confusion matrices on the test split: raw counts first,
# then normalised over the true labels (normalize='true').
for norm_mode in (None, 'true'):
    # plot_confusion_matrix draws on a brand-new figure unless it is handed an
    # Axes, so a bare plt.figure(...) beforehand only produces an empty figure.
    # Create the sized Axes explicitly and pass it in so the size/dpi apply.
    fig, ax = plt.subplots(figsize=[8,6], dpi=150)
    plot_confusion_matrix(lr, X_test_std, y_test, normalize=norm_mode,
                          cmap=plt.cm.Blues, ax=ax)
plt.show()

In [None]:
# ROC curve and AUC for logistic regression, compared with a constant-score baseline.

# Column 1 of predict_proba: the probability of the second class in lr.classes_.
lr_probs = lr.predict_proba(X_test_std)[:, 1]

# "No skill" baseline: the same score (0) for every test sample.
ns_probs = [0] * len(y_test)

# Area under each ROC curve.
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# Points of each ROC curve (the thresholds are not needed).
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

# Baseline as a dashed line, the model with point markers.
pyplot.plot(ns_fpr, ns_tpr, label='No Skill', linestyle='--')
pyplot.plot(lr_fpr, lr_tpr, label='Logistic', marker='.')

pyplot.title('ROC curve of logistic regression')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend(loc='lower right')
pyplot.show()

In [None]:
# Precision-recall curve for logistic regression.

# Column 1 of predict_proba: the probability of the second class in lr.classes_.
lr_probs = lr.predict_proba(X_test_std)[:, 1]

y_hat = lr.predict(X_test_std)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
lr_f1 = f1_score(y_test, y_hat)

# Area under the precision-recall curve. Kept under its own name: `lr_auc`
# holds the ROC AUC used by the ROC plots, and reassigning it here would make
# those plots report the PR area as if it were the ROC area.
lr_pr_auc = auc(lr_recall, lr_precision)

print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_pr_auc))

# Baseline precision of a classifier with no skill: the share of test samples
# labelled 1.
no_skill = len(y_test[y_test==1]) / len(y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')

pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.title('Precision-recall curve of logistic regression')
pyplot.legend()
pyplot.show()

## Random Forest

Tentative model, not for final result.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters taken from the randomized search (its best_params_), written
# out so this cell can run without re-running the search.
# Equivalent: RandomForestClassifier(random_state=33, **random_search.best_params_)
rfc_params = {
    'max_depth': 6,
    'max_features': 'log2',
    'n_estimators': 150,
}
rfc = RandomForestClassifier(random_state=33, **rfc_params)
rfc.fit(X_train_std, y_train)

In [None]:
# Random-forest performance: accuracy on train and test, then precision /
# recall / F1 on the test split.
for features, labels in ((X_train_std, y_train), (X_test_std, y_test)):
    print(rfc.score(features, labels))

y_pred = rfc.predict(X_test_std)
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# Random-forest confusion matrix on the test split, normalised over the true
# labels (normalize='true'). The raw-count version is intentionally not drawn.
rfc_cm_options = dict(
    normalize='true',
    display_labels=['Loss','Win'],
    cmap=plt.cm.Blues,
)
plot_confusion_matrix(rfc, X_test_std, y_test, **rfc_cm_options)
plt.show()

In [None]:
# Plot the most important random-forest features as a horizontal bar chart.

# Impurity-based importances scaled by 100 so the axis reads as a percentage.
importances_rfc = rfc.feature_importances_*100
# Feature positions ordered from most to least important.
indices_rfc = np.argsort(importances_rfc)[::-1]

# Show at most 20 bars; cap at the number of features so the slice below can
# never index past the end when the table has fewer than 20 feature columns.
num_bars = min(20, len(indices_rfc))
# Top `num_bars` features, reversed so the most important one gets the highest
# y position and therefore appears at the top of the chart.
top_indices = indices_rfc[:num_bars][::-1]

plt.figure(figsize=[8,6],dpi=150)
plt.title("Important features That Determines the Game Results")
plt.barh(range(num_bars), importances_rfc[top_indices])
plt.yticks(range(num_bars), X.columns[top_indices])
plt.ylim([-1, num_bars])
plt.xlabel("Relative Feature importance (%)")
plt.ylabel("Feature")
plt.show()

## Random Forest: Randomized Search Cross Validation with Time-series split 

In [None]:
# Tune the random forest with a randomized hyper-parameter search, scored by F1.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

from scipy.stats import randint, uniform

# Search space: tree count on a fixed grid, depth drawn from randint(5, 10),
# and the two feature-subsampling rules.
parameters = {
    'n_estimators': np.arange(30,211, 30),
    'max_depth': randint(5,10),
    'max_features': ['sqrt','log2'],
    # further candidates, currently disabled:
    #'min_impurity_decrease': uniform(0,1),
    #'min_samples_leaf': randint(1,1000)
}

# NOTE(review): X_train_std comes from train_test_split, which shuffles rows by
# default, so these "time-series" folds are presumably not in chronological
# order -- confirm whether an ordered split was intended.
cv_splitter = TimeSeriesSplit(n_splits=10)

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=33),
    param_distributions=parameters,
    n_iter=50,
    scoring='f1',  # 'precision' was tried as an alternative
    cv=cv_splitter,
    random_state=33,
    return_train_score=True,
)
random_search.fit(X_train_std, y_train)

print(random_search.best_score_)
print(random_search.best_params_)

## SVM

### linear kernel

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel; fit() returns the fitted
# estimator, so construction and training can be chained.
svc = SVC(kernel='linear', C=1, random_state=33).fit(X_train_std, y_train)

In [None]:
# Performance of the current `svc`: accuracy on train and test, then
# precision / recall / F1 on the test split.
for features, labels in ((X_train_std, y_train), (X_test_std, y_test)):
    print(svc.score(features, labels))

y_pred = svc.predict(X_test_std)
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# Confusion matrix of the current `svc` on the test split, normalised over the
# true labels (normalize='true').
svc_cm_options = dict(
    normalize='true',
    display_labels=['Loss','Win'],
    cmap=plt.cm.Blues,
)
plot_confusion_matrix(svc, X_test_std, y_test, **svc_cm_options)
plt.show()

### nonlinear kernel

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with an RBF kernel. probability=True enables
# predict_proba, which the ROC cell needs. This rebinds `svc`, so any
# previously fitted model under that name is replaced from here on.
svc = SVC(kernel='rbf', C=0.1, probability=True, random_state=33).fit(X_train_std, y_train)

In [None]:
# Performance of the current `svc`: accuracy on train and test, then
# precision / recall / F1 on the test split.
for features, labels in ((X_train_std, y_train), (X_test_std, y_test)):
    print(svc.score(features, labels))

y_pred = svc.predict(X_test_std)
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred))

In [None]:
# Confusion matrices of the current `svc` on the test split.
blues = plt.cm.Blues

# 1) raw counts, default tick labels
plot_confusion_matrix(svc, X_test_std, y_test, cmap=blues)

# 2) normalised over the true labels, with named tick labels
plot_confusion_matrix(
    svc, X_test_std, y_test,
    normalize='true',
    display_labels=['Loss','Win'],
    cmap=blues,
)
plt.show()

In [None]:
# ROC curve and AUC for the current `svc`, compared with a constant-score baseline.

# Column 1 of predict_proba: the probability of the second class in svc.classes_.
svc_probs = svc.predict_proba(X_test_std)[:, 1]

# "No skill" baseline: the same score (0) for every test sample.
ns_probs = [0] * len(y_test)

# Area under each ROC curve.
ns_auc = roc_auc_score(y_test, ns_probs)
svc_auc = roc_auc_score(y_test, svc_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('SVM: ROC AUC=%.3f' % (svc_auc))

# Points of each ROC curve (the thresholds are not needed).
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
svc_fpr, svc_tpr, _ = roc_curve(y_test, svc_probs)

# Baseline as a dashed line, the model with point markers.
pyplot.plot(ns_fpr, ns_tpr, label='No Skill', linestyle='--')
pyplot.plot(svc_fpr, svc_tpr, label='SVM', marker='.')

pyplot.title('ROC curve of SVM')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend(loc='lower right')
pyplot.show()

## SVM: Grid Search Cross Validation with Time-series split

In [None]:
# Tune the SVM with an exhaustive grid search (default scoring of the estimator).
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

C_VALUES = [0.1, 0.5, 1, 5]
GAMMA_VALUES = [1,0.1,0.01]

# Two sub-grids instead of one full cross product: the linear kernel does not
# use `gamma`, so crossing it with three gamma values would fit every linear
# candidate three times over for identical scores. When a linear model wins,
# best_params_ therefore contains no 'gamma' entry.
parameters = [
    {'kernel': ['linear'], 'C': C_VALUES},
    {'kernel': ['rbf','sigmoid'], 'C': C_VALUES, 'gamma': GAMMA_VALUES},
]

# NOTE(review): X_train_std comes from train_test_split, which shuffles rows by
# default, so these "time-series" folds are presumably not in chronological
# order -- confirm whether an ordered split was intended.
grid_search = GridSearchCV(
    SVC(random_state=33),
    parameters,
    cv=TimeSeriesSplit(n_splits=10),
    return_train_score=True
)

grid_search.fit(X_train_std, y_train)

print(grid_search.best_score_)
print(grid_search.best_params_)

## Aggregate model graphs

In [None]:
# Overlay the ROC curves of all models on a single figure.
# Requires the KNN, logistic-regression and SVM ROC cells (for the *_fpr/*_tpr
# arrays) and the random-forest fit cell (for `rfc`) to have been run.

# No earlier cell computes the random forest's ROC curve, so derive it here
# from the fitted model; column 1 of predict_proba is the second class in
# rfc.classes_.
rfc_probs = rfc.predict_proba(X_test_std)[:, 1]
rfc_fpr, rfc_tpr, _ = roc_curve(y_test, rfc_probs)
rfc_auc = auc(rfc_fpr, rfc_tpr)

# (legend name, false-positive rates, true-positive rates, line style)
roc_curves = [
    ('No Skill', ns_fpr, ns_tpr, '--'),
    ('KNN', neigh_fpr, neigh_tpr, '-'),
    ('Logistic', lr_fpr, lr_tpr, '-'),
    ('Random Forest', rfc_fpr, rfc_tpr, '-'),
    ('SVM', svc_fpr, svc_tpr, '-'),
]

for curve_name, fpr, tpr, line_style in roc_curves:
    # Compute the area from the very points being drawn, so the legend cannot
    # show a stale or reassigned *_auc variable (e.g. a precision-recall area
    # stored under the same name by another cell).
    curve_area = auc(fpr, tpr)
    pyplot.plot(fpr, tpr, linestyle=line_style,
                label='%s (area = %.2f)' % (curve_name, curve_area))

pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.title('ROC curves')
pyplot.legend(loc='lower right')
pyplot.show()