In [1]:
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

In [2]:
import os
import sys

# The project-local `databall` package is not installed, so the parent of the
# current working directory is appended to the import path before importing it.
# NOTE(review): this only resolves when the kernel's working directory sits
# directly below the directory that contains `databall` — confirm.
module_path = os.path.abspath(os.pardir)

if module_path not in sys.path:
    sys.path.append(module_path)

from databall.database import Database
from databall.plotting import format_538, plot_confusion_matrix
from databall.profit import profit
from databall.simulate import simulate

ModuleNotFoundError: No module named 'databall'

In [None]:
# Apply matplotlib's built-in 'fivethirtyeight' style sheet to every figure created below.
plt.style.use('fivethirtyeight')

In [None]:
# NOTE(review): `stats` (list of feature column names) and `var_predict` (name of
# the target column) are not defined in any cell visible above — they must be
# set before this cell runs on a fresh kernel.
database = Database('../data/nba.db')

# Pull betting stats computed with window=10, keep seasons from 2006 onward,
# restrict to the feature/metadata/target columns, and drop rows with any
# missing value.
games = (
    database.betting_stats(window=10)
    .loc[lambda frame: frame.SEASON >= 2006,
         stats + ['SEASON', 'GAME_DATE', var_predict]]
    .dropna()
)

In [None]:
# Season passed to `simulate`; it is also formatted into the figure subtitle so
# the caption cannot drift from the season that was actually simulated.
season = 2016

# Hard-coded hyperparameters — presumably the result of an earlier search; TODO confirm.
model = LogisticRegression(C=0.0001943920615336294, penalty='l2',
                           fit_intercept=True, intercept_scaling=134496.71823111628)
output_log = simulate(model, games, season, stats, var_predict)

# days_log / per_log / profit_log are reused by the comparison plots further down.
days_log, per_log, profit_log = profit(output_log, var_predict=var_predict, bet_amount=100)
cm = confusion_matrix(output_log.HOME_SPREAD_WL, output_log.HOME_SPREAD_WL_PRED)

fig = plt.figure(figsize=(8, 8))
plot_confusion_matrix(cm, ['Loss', 'Win'], fig=fig)

title = 'Logistic regression struggles with losses'
subtitle = '''Logistic regression confusion matrix for predicting home
team results against the spread for the {} season'''.format(season)
format_538(fig, 'NBA Stats & Covers.com', title=title, subtitle=subtitle, ax=fig.axes,
           xoff=(-0.18, 1.02), yoff=(-0.17, -0.22), toff=(-.15, 1.32), soff=(-0.15, 1.2),
           n=50, bottomtick=np.nan)
plt.show()

# Text summary: per-class precision/recall/F1 followed by overall accuracy.
print('_' * 52)
print(classification_report(output_log.HOME_SPREAD_WL, output_log.HOME_SPREAD_WL_PRED,
                            target_names=['Loss', 'Win'], digits=3))
print('Correctly predicted {:.2f}% of games'
      .format(accuracy_score(output_log.HOME_SPREAD_WL, output_log.HOME_SPREAD_WL_PRED) * 100))

In [None]:
# Season passed to `simulate`; it is also formatted into the figure subtitle so
# the caption cannot drift from the season that was actually simulated.
season = 2016

# Hard-coded hyperparameters — presumably the result of an earlier search; TODO confirm.
model = LinearSVC(C=3.2563857398383885e-06, loss='squared_hinge',
                  fit_intercept=True, intercept_scaling=242.79319791592195)
output_svm = simulate(model, games, season, stats, var_predict)

# days_svm / per_svm / profit_svm are reused by the comparison plot further down.
days_svm, per_svm, profit_svm = profit(output_svm, var_predict=var_predict, bet_amount=100)
cm = confusion_matrix(output_svm.HOME_SPREAD_WL, output_svm.HOME_SPREAD_WL_PRED)

fig = plt.figure(figsize=(8, 8))
plot_confusion_matrix(cm, ['Loss', 'Win'], fig=fig)

title = 'SVM struggles with predicting wins'
subtitle = '''Support vector machine confusion matrix for predicting
home team results against the spread for the {} season'''.format(season)
format_538(fig, 'NBA Stats & Covers.com', title=title, subtitle=subtitle, ax=fig.axes,
           xoff=(-0.18, 1.02), yoff=(-0.17, -0.22), toff=(-.15, 1.32), soff=(-0.15, 1.2),
           n=50, bottomtick=np.nan)
plt.show()

# Text summary: per-class precision/recall/F1 followed by overall accuracy.
print('_' * 52)
print(classification_report(output_svm.HOME_SPREAD_WL, output_svm.HOME_SPREAD_WL_PRED,
                            target_names=['Loss', 'Win'], digits=3))
print('Correctly predicted {:.2f}% of games'
      .format(accuracy_score(output_svm.HOME_SPREAD_WL, output_svm.HOME_SPREAD_WL_PRED) * 100))

In [3]:
# Season passed to `simulate`; it is also formatted into the figure subtitle so
# the caption cannot drift from the season that was actually simulated.
season = 2016

# random_state is fixed so the forest is reproducible between runs.
model = RandomForestClassifier(n_estimators=34, criterion='entropy', class_weight='balanced',
                               bootstrap=True, random_state=8)
output_rf = simulate(model, games, season, stats, var_predict)

# days_rf / per_rf / profit_rf are reused by the comparison plot further down.
days_rf, per_rf, profit_rf = profit(output_rf, var_predict=var_predict, bet_amount=100)
cm = confusion_matrix(output_rf.HOME_SPREAD_WL, output_rf.HOME_SPREAD_WL_PRED)

fig = plt.figure(figsize=(8, 8))
plot_confusion_matrix(cm, ['Loss', 'Win'], fig=fig)

title = 'Random forest guesses on wins'
subtitle = '''Random forest confusion matrix for predicting home
team results against the spread for the {} season'''.format(season)
format_538(fig, 'NBA Stats & Covers.com', title=title, subtitle=subtitle, ax=fig.axes,
           xoff=(-0.18, 1.02), yoff=(-0.17, -0.22), toff=(-.15, 1.32), soff=(-0.15, 1.2),
           n=50, bottomtick=np.nan)
plt.show()

# Text summary: per-class precision/recall/F1 followed by overall accuracy.
print('_' * 52)
print(classification_report(output_rf.HOME_SPREAD_WL, output_rf.HOME_SPREAD_WL_PRED,
                            target_names=['Loss', 'Win'], digits=3))
print('Correctly predicted {:.2f}% of games'
      .format(accuracy_score(output_rf.HOME_SPREAD_WL, output_rf.HOME_SPREAD_WL_PRED) * 100))

NameError: name 'simulate' is not defined

In [None]:
# Simulate the same season as the other models (2016) so the results plotted
# together in the comparison figure are comparable; the value is also formatted
# into the subtitle so the caption cannot drift from the simulated season.
season = 2016

# random_state is fixed so the network's initialization is reproducible.
model = MLPClassifier(alpha=5.700733605522687e-06, hidden_layer_sizes=49,
                      solver='lbfgs', activation='relu', max_iter=500, random_state=8)
output_mlp = simulate(model, games, season, stats, var_predict)

# days_mlp / per_mlp / profit_mlp are reused by the comparison plot further down.
days_mlp, per_mlp, profit_mlp = profit(output_mlp, var_predict=var_predict, bet_amount=100)
cm = confusion_matrix(output_mlp.HOME_SPREAD_WL, output_mlp.HOME_SPREAD_WL_PRED)

fig = plt.figure(figsize=(8, 8))
plot_confusion_matrix(cm, ['Loss', 'Win'], fig=fig)

title = 'MLP performs the best by a slim margin'
subtitle = '''Neural network confusion matrix for predicting home
team results against the spread for the {} season'''.format(season)
format_538(fig, 'NBA Stats & Covers.com', title=title, subtitle=subtitle, ax=fig.axes,
           xoff=(-0.18, 1.02), yoff=(-0.17, -0.22), toff=(-.15, 1.32), soff=(-0.15, 1.2),
           n=50, bottomtick=np.nan)
plt.show()

# Text summary: per-class precision/recall/F1 followed by overall accuracy.
print('_' * 52)
print(classification_report(output_mlp.HOME_SPREAD_WL, output_mlp.HOME_SPREAD_WL_PRED,
                            target_names=['Loss', 'Win'], digits=3))
print('Correctly predicted {:.2f}% of games'
      .format(accuracy_score(output_mlp.HOME_SPREAD_WL, output_mlp.HOME_SPREAD_WL_PRED) * 100))

In [None]:
# Two stacked panels comparing the four tuned models fitted in the cells above.
fig = plt.figure(figsize=(12, 10))

# Top panel: cumulative accuracy, scaled by 100 to match the '%' tick suffix.
ax1 = plt.subplot(211)
ax1.plot_date(days_log, per_log*100, label='Logistic Regression')
ax1.plot_date(days_svm, per_svm*100, label='Support Vector Machine')
ax1.plot_date(days_rf, per_rf*100, label='Random Forest')
ax1.plot_date(days_mlp, per_mlp*100, label='Neural Network', color='green')
ax1.set_ylabel('Cumulative Accuracy')
ax1.set_ylim(-5)  # only the lower limit is pinned; the upper limit stays automatic
ax1.legend(fontsize=16, bbox_to_anchor=(1.01, 1), borderaxespad=0)

# Bottom panel: cumulative profit, divided by 1000 to match the '$'/'k' tick labels.
# Series are drawn in the same order as above so colors line up with the legend.
ax2 = plt.subplot(212)
ax2.plot_date(days_log, profit_log/1000)
ax2.plot_date(days_svm, profit_svm/1000)
ax2.plot_date(days_rf, profit_rf/1000)
ax2.plot_date(days_mlp, profit_mlp/1000, color='green')
ax2.set_xlabel('Date')
ax2.set_ylabel('Cumulative Profit')
ax2.set_ylim(-3, 13)

# The caption names the 2016 season, matching the season the models are simulated on.
title = 'Not all of the models generalize well'
subtitle = '''Cumulative prediction accuracy and subsequent profit from placing $100 bets
on every game throughout the 2016 season using different betting strategies'''
format_538(fig, 'NBA Stats & Covers.com', ax=(ax1, ax2), title=title, subtitle=subtitle,
           xoff=(-0.1, 1.01), yoff=(-1.38, -1.45), toff=(-.09, 1.28), soff=(-0.09, 1.08),
           prefix = (' ', '$'), suffix=('%', 'k'), suffix_offset=(3, 1), n=80)
plt.show()

In [None]:
# Re-run the 2016 simulation with each estimator left at its default
# hyperparameters (only random_state is fixed where the estimator accepts one).
# The *_log_df results are reused by the evolution comparison cell below.
profit_options = {'var_predict': var_predict, 'bet_amount': 100}

model = LogisticRegression()
output_log_df = simulate(model, games, 2016, stats, var_predict)
days_log_df, per_log_df, profit_log_df = profit(output_log_df, **profit_options)

model = LinearSVC()
output_svm_df = simulate(model, games, 2016, stats, var_predict)
days_svm_df, per_svm_df, profit_svm_df = profit(output_svm_df, **profit_options)

model = RandomForestClassifier(random_state=8)
output_rf_df = simulate(model, games, 2016, stats, var_predict)
days_rf_df, per_rf_df, profit_rf_df = profit(output_rf_df, **profit_options)

model = MLPClassifier(random_state=8)
output_mlp_df = simulate(model, games, 2016, stats, var_predict)
days_mlp_df, per_mlp_df, profit_mlp_df = profit(output_mlp_df, **profit_options)

fig = plt.figure(figsize=(12, 10))

# Top panel: cumulative accuracy in percent. The first three series take their
# colors from the style's color cycle; the last one is forced to green.
ax1 = fig.add_subplot(2, 1, 1)
for series_days, series_accuracy, series_label, series_style in (
    (days_log_df, per_log_df, 'Logistic Regression', {}),
    (days_svm_df, per_svm_df, 'Support Vector Machine', {}),
    (days_rf_df, per_rf_df, 'Random Forest', {}),
    (days_mlp_df, per_mlp_df, 'Neural Network', {'color': 'green'}),
):
    ax1.plot_date(series_days, series_accuracy*100, label=series_label, **series_style)
ax1.set_ylabel('Cumulative Accuracy')
ax1.set_ylim(-5)
ax1.legend(fontsize=16, bbox_to_anchor=(1.01, 1), borderaxespad=0)

# Bottom panel: cumulative profit in thousands, drawn in the same model order.
ax2 = fig.add_subplot(2, 1, 2)
for series_days, series_profit, series_style in (
    (days_log_df, profit_log_df, {}),
    (days_svm_df, profit_svm_df, {}),
    (days_rf_df, profit_rf_df, {}),
    (days_mlp_df, profit_mlp_df, {'color': 'green'}),
):
    ax2.plot_date(series_days, series_profit/1000, **series_style)
ax2.set_xlabel('Date')
ax2.set_ylabel('Cumulative Profit')

title = 'Optimizing parameters hurt logistic regression accuracy'
subtitle = '''Cumulative prediction accuracy and subsequent profit from placing $100 bets
on every game throughout the 2016 season using different betting strategies'''
format_538(fig, 'NBA Stats & Covers.com', ax=(ax1, ax2), title=title, subtitle=subtitle,
           xoff=(-0.12, 1.01), yoff=(-1.38, -1.45), toff=(-.09, 1.28), soff=(-0.09, 1.08),
           prefix = (' ', '$'), suffix=('%', 'k'), suffix_offset=(3, 1), n=80)
plt.show()

In [None]:
# Season passed to both `simulate` calls; it is also formatted into the subtitle
# so the caption cannot drift from the season that was actually simulated.
season = 2016

# Tuned logistic regression, simulated with evolve=True.
model = LogisticRegression(C=0.0001943920615336294, penalty='l2',
                           fit_intercept=True, intercept_scaling=134496.71823111628)
output_log_evo = simulate(model, games, season, stats, var_predict, evolve=True)
days_log_evo, per_log_evo, profit_log_evo = profit(output_log_evo, var_predict=var_predict, bet_amount=100)

# Default logistic regression, simulated with evolve=True.
model = LogisticRegression()
output_log_df_evo = simulate(model, games, season, stats, var_predict, evolve=True)
days_log_df_evo, per_log_df_evo, profit_log_df_evo = profit(output_log_df_evo, var_predict=var_predict,
                                                            bet_amount=100)

# The non-evolving curves (days_log/per_log/profit_log and the *_log_df
# variants) come from earlier cells and must already exist in the kernel.
fig = plt.figure(figsize=(12, 10))

# Top panel: cumulative accuracy, scaled by 100 to match the '%' tick suffix.
ax1 = plt.subplot(211)
ax1.plot_date(days_log, per_log*100, label='Optimized Parameters')
ax1.plot_date(days_log_evo, per_log_evo*100, label='Optimized Parameters\nw/ Evolution')
ax1.plot_date(days_log_df, per_log_df*100, label='Default Model')
ax1.plot_date(days_log_df_evo, per_log_df_evo*100, label='Default Model\nw/ Evolution', color='green')
ax1.set_ylabel('Cumulative Accuracy')
ax1.set_ylim(-5)  # only the lower limit is pinned; the upper limit stays automatic
ax1.legend(fontsize=16, bbox_to_anchor=(1.01, 1), borderaxespad=0)

# Bottom panel: cumulative profit, divided by 1000 to match the '$'/'k' tick labels.
# Series are drawn in the same order as above so colors line up with the legend.
ax2 = plt.subplot(212)
ax2.plot_date(days_log, profit_log/1000)
ax2.plot_date(days_log_evo, profit_log_evo/1000)
ax2.plot_date(days_log_df, profit_log_df/1000)
ax2.plot_date(days_log_df_evo, profit_log_df_evo/1000, color='green')
ax2.set_xlabel('Date')
ax2.set_ylabel('Cumulative Profit')

title = 'Retraining models in-season can be detrimental'
subtitle = '''Cumulative prediction accuracy and subsequent profit from placing $100 bets
on every game throughout the {} season using different betting strategies'''.format(season)
format_538(fig, 'NBA Stats & Covers.com', ax=(ax1, ax2), title=title, subtitle=subtitle,
           xoff=(-0.12, 1.01), yoff=(-1.38, -1.45), toff=(-.09, 1.28), soff=(-0.09, 1.08),
           prefix = (' ', '$'), suffix=('%', 'k'), suffix_offset=(3, 1), n=80)
plt.show()