### Import Libraries and Mount Drive

In [None]:
!pip install xlsxwriter
import xlsxwriter
import xlrd
import openpyxl
from openpyxl import load_workbook
import os
import os.path
from os import path
import datetime as dt
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import csv
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import plot_confusion_matrix
pd.options.mode.chained_assignment = None
from google.colab import drive 
drive.mount('/content/drive') 

In [None]:
# Work from the mounted Drive folder so the relative "SML_Term_Project/..." paths resolve.
# The relative target is only visible before the first chdir; on a re-run the working
# directory has already moved, so the guard keeps this cell safe to run more than once.
if os.path.isdir("drive/My Drive"):
  os.chdir("drive/My Drive")

# SET PARAMETERS AND CHOOSE ANALYSES

In [None]:
# --- Data source -------------------------------------------------------
# True: rebuild the labelled data from the raw odds files and team statistics.
# False: reload the spreadsheets exported by an earlier run with the same thresholds.
create_new_threshold = False

# --- Betting rule ------------------------------------------------------
# Spread-difference cut-offs used to label a game "Bet Favorite" / "Bet Underdog",
# and the stake used when simulating payouts.
favorite_threshold, underdog_threshold = 4, -4
bet_amount = 10

# --- Optional month filter (applied only when restrict_months is True) --
restrict_months = False
start_month, end_month = 2, 10

# --- Plot switches -----------------------------------------------------
plot_histograms = True
plot_confusions = True
plot_feature_importance = True

## CHOOSE DATA TO EXPORT
# analysis = 'offensive'
# analysis = 'defensive'
analysis = 'differential'

# --- Console output switches -------------------------------------------
print_training_output = True
print_test_output = True
print_cv_output = True

## Choose Models to Run
run_logistic = True
run_rf_random_search = True
run_svc = True
run_extra_trees = True
run_xgb = True

In [None]:
# Reuse the spreadsheets exported by an earlier run instead of rebuilding them.
if not create_new_threshold:
  data_dir = 'SML_Term_Project/NBA_Data/Data_Files'
  threshold_tag = '+{}{}'.format(favorite_threshold, underdog_threshold)

  full_box_scores = pd.read_excel('{}/Game_Outcomes_and_Odds{}.xlsx'.format(data_dir, threshold_tag))
  game_outcomes = full_box_scores

  # Load the statistics file that matches `analysis`. The file names mirror the ones
  # written by the export step, so the data and the 'Analysis' label stay in agreement.
  stats_file_labels = {'offensive': 'Offensive', 'defensive': 'Defensive', 'differential': 'Differentials'}
  if analysis not in stats_file_labels:
    raise ValueError("analysis must be one of {}, got {!r}".format(sorted(stats_file_labels), analysis))

  combined = pd.read_excel('{}/Outcomes_Bets_Team_{}_{}.xlsx'.format(data_dir, stats_file_labels[analysis],
                                                                   threshold_tag))

# EXTRACT ODDS AND SCORES FOR GAMES / LABEL BETTING THRESHOLD

In [None]:
if create_new_threshold:

  seasons = ['2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19',
             '2019-20', '2020-21']

  # Cut-off date per season, in the same order as `seasons`. Only games dated strictly
  # before the cut-off are kept, which is how playoff games are removed.
  season_end_dates = ['"2011-04-15"', '"2012-04-27"', '"2013-04-19"', '"2014-04-18"',
                      '"2015-04-17"', '"2016-04-15"', '"2017-04-14"', '"2018-04-13"', '"2019-04-12"',
                      '"2020-08-17"', '"2021-05-21"']
  season_end_dates = [dt.datetime.strptime(date, '"%Y-%m-%d"').date() for date in season_end_dates]

  # Each game is two lines in the odds file, one for the home team and one for the away team.
  # The loop splits them and merges them side by side so there is a single line per game.
  # Frames are collected in a list and concatenated once (DataFrame.append no longer exists
  # in pandas 2.x). A season is labelled with the year in which it ends, starting at 2011.
  yearly_frames = []
  for year, (season, season_end) in enumerate(zip(seasons, season_end_dates), start=2011):

    yearly_odds = pd.read_excel("SML_Term_Project/NBA_Data/Yearly_Odds/{}.xlsx".format(season))
    visiting_teams = yearly_odds[yearly_odds['VH'] == 'V']
    home_teams = yearly_odds[yearly_odds['VH'] == 'H']
    yearly_box_scores = home_teams.reset_index().merge(visiting_teams.reset_index(),
                                                       left_index=True, right_index=True, how='left')

    yearly_box_scores['Season'] = year
    yearly_box_scores['Date_x'] = pd.to_datetime(yearly_box_scores['Date_x']).dt.date
    yearly_frames.append(yearly_box_scores[yearly_box_scores['Date_x'] < season_end])

  full_box_scores = pd.concat(yearly_frames)

  # Rename columns
  full_box_scores.columns = ['index_H','Date','Rot_H','VH_H','Team_H','1st_H','2nd_H','3rd_H', '4th_H','Final_H',
                            'Open_H','Close_H','ML_H','2H_H','index_V','Date_V','Rot_V','VH_V','Team_V','1st_V',
                            '2nd_V','3rd_V','4th_V','Final_V','Open_V','Close_V','ML_V','2H_V', 'Season']

  # Reorder columns. Selecting this subset is also what drops the unneeded
  # index_H / index_V / Date_V / VH_H / VH_V columns.
  full_box_scores = full_box_scores[['Season', 'Date','Rot_H', 'Team_H','Team_V', '1st_H','2nd_H','3rd_H', '4th_H',
                            'Open_H','Close_H','ML_H','2H_H', 'Rot_V','1st_V',
                            '2nd_V','3rd_V','4th_V','Open_V','Close_V','ML_V','2H_V', 'Final_H', 'Final_V']]

  # Some games indicate a tie. Removing them all for now.
  full_box_scores = full_box_scores[full_box_scores['Final_H'] != full_box_scores['Final_V']]

  ## Optionally restrict the months of the year that the model predicts the outcome for.
  ## 'Date' holds datetime.date objects at this point, so the month is read through
  ## pd.to_datetime rather than by slicing the value as if it were a string.
  if restrict_months:
    game_month = pd.to_datetime(full_box_scores['Date']).dt.month
    full_box_scores = full_box_scores[(game_month >= start_month) & (game_month < end_month)]

  # pk / PK represents 'pick em' which you typically can't bet on; mark those lines with 0
  betting_lines = full_box_scores[['Open_H', 'Open_V']].replace(['pk', 'PK'], 0)

  # The vegas spread and vegas total points are initially in the same column.
  # One is on the line for the home team and the other for the away team. However
  # the order is not always consistent. Now that home and away are side by side,
  # the lesser of the two values is taken as the spread and the greater as the
  # total points.
  open_line_pairs = list(zip(betting_lines['Open_H'], betting_lines['Open_V']))
  total_points = [max(home_line, away_line) for home_line, away_line in open_line_pairs]
  point_spreads = [min(home_line, away_line) for home_line, away_line in open_line_pairs]

  full_box_scores['Vegas Total Points'] = total_points
  full_box_scores['Vegas Spread'] = point_spreads
  full_box_scores['Real Total Points'] = full_box_scores['Final_H'] + full_box_scores['Final_V']

  # Pick 'em games were given a spread of 0 above; remove them from the data
  full_box_scores = full_box_scores[full_box_scores['Vegas Spread'] != 0]

  # Drop the now unnecessary columns
  full_box_scores = full_box_scores[['Season', 'Date', 'Team_H', 'Team_V', 'ML_H', 'ML_V', 'Final_H', 'Final_V', 'Vegas Total Points',
                                     'Real Total Points', 'Vegas Spread']]

  # Change values from type 'object' to int
  full_box_scores['ML_H'] = full_box_scores['ML_H'].astype(int)
  full_box_scores['ML_V'] = full_box_scores['ML_V'].astype(int)

  # if the money lines are the same sign, an error has occurred. Remove them
  full_box_scores = full_box_scores[full_box_scores['ML_V']*full_box_scores['ML_H'] < 0]

  # A negative home money line marks the home team as the favorite. After the filter above
  # ML_H is never 0, so "ML_H < 0" and "ML_H > 0" are exact complements.
  # 'Favorite Home?' is stored as the strings '1' / '0'.
  home_is_favorite = full_box_scores['ML_H'] < 0
  home_is_underdog = full_box_scores['ML_H'] > 0
  full_box_scores['Favorite Home?'] = np.where(home_is_favorite, '1', '0')

  full_box_scores['Favorite_Team'] = np.where(home_is_favorite, full_box_scores['Team_H'], full_box_scores['Team_V'])
  full_box_scores['Underdog_Team'] = np.where(home_is_underdog, full_box_scores['Team_H'], full_box_scores['Team_V'])

  full_box_scores['Favorite_ML'] = np.where(home_is_favorite, full_box_scores['ML_H'], full_box_scores['ML_V'])
  full_box_scores['Underdog_ML'] = np.where(home_is_underdog, full_box_scores['ML_H'], full_box_scores['ML_V'])

  full_box_scores['Favorite_Score'] = np.where(home_is_favorite, full_box_scores['Final_H'], full_box_scores['Final_V'])
  full_box_scores['Underdog_Score'] = np.where(home_is_favorite, full_box_scores['Final_V'], full_box_scores['Final_H'])

  # Drop the now unneeded columns
  full_box_scores = full_box_scores.drop(['Team_H', 'Team_V', 'ML_H', 'ML_V', 'Final_H', 'Final_V'], axis=1)

  # Calculate the real spread from the game scores and the difference from the vegas predictions
  full_box_scores['Real Spread'] = full_box_scores['Favorite_Score'] - full_box_scores['Underdog_Score']
  full_box_scores['Spread Difference'] = full_box_scores['Real Spread'] - full_box_scores['Vegas Spread']

  # Remove outliers/incorrect observations
  full_box_scores = full_box_scores[full_box_scores['Vegas Spread'] > -50]
  full_box_scores = full_box_scores[full_box_scores['Vegas Spread'] < 50]

  def que(x):
      """Label one game row by which side scored more (returns None if the scores are not comparable)."""
      if x['Favorite_Score'] > x['Underdog_Score']:
          return 'Favorite won'
      if x['Favorite_Score'] < x['Underdog_Score']:
          return 'Underdog won'

  full_box_scores['Game Outcome'] = full_box_scores.apply(que, axis=1)

  # The favorite covers when it wins by more than the Vegas spread; every other game counts as
  # the underdog covering. The two groups are taken as a boolean mask and its complement, so
  # each game lands in exactly one group even if some unrelated column contains a missing value.
  favorite_covered = (full_box_scores['Game Outcome'] == 'Favorite won') & (full_box_scores['Spread Difference'] > 0)
  fav_covered = full_box_scores[favorite_covered].copy()
  underdog_covered = full_box_scores[~favorite_covered].copy()

  fav_covered['Status'] = 'Favorite covered'
  underdog_covered['Status'] = 'Underdog covered'

  # Target label: bet only when the result beat the spread by at least the chosen threshold
  fav_covered.loc[fav_covered['Spread Difference'] >= favorite_threshold, 'Decision'] = 'Bet Favorite'
  fav_covered.loc[fav_covered['Spread Difference'] < favorite_threshold, 'Decision'] = "Don't Bet"

  underdog_covered.loc[underdog_covered['Spread Difference'] <= underdog_threshold, 'Decision'] = 'Bet Underdog'
  underdog_covered.loc[underdog_covered['Spread Difference'] > underdog_threshold, 'Decision'] = "Don't Bet"

  # Recombine all the data with the target labels
  full_box_scores = pd.concat([fav_covered, underdog_covered])

  full_box_scores = full_box_scores[['Season', 'Date', 'Favorite_Team', 'Underdog_Team', 'Favorite_ML',
        'Underdog_ML', 'Favorite_Score', 'Underdog_Score', 'Vegas Total Points', 'Real Total Points',
        'Vegas Spread', 'Real Spread', 'Spread Difference',
        'Game Outcome', 'Status', 'Decision',
        'Favorite Home?']]

  full_box_scores.to_excel('SML_Term_Project/NBA_Data/Data_Files/Game_Outcomes_and_Odds+{}{}.xlsx'.format(favorite_threshold,
                                                                                                          underdog_threshold), index = False)

# Exploratory Data Analysis and Visualization

In [None]:
# Sanity-check the range of the betting lines. The extremes are printed explicitly because
# a bare expression that is not the last statement of the cell produces no output.
print("Vegas Spread range: {} to {}".format(full_box_scores['Vegas Spread'].min(),
                                            full_box_scores['Vegas Spread'].max()))
print("Vegas Total Points range: {} to {}".format(full_box_scores['Vegas Total Points'].min(),
                                                  full_box_scores['Vegas Total Points'].max()))

full_box_scores.sort_values(by='Vegas Spread', ascending=False) # We should remove Vegas spread < -50
full_box_scores.sort_values(by='Vegas Total Points', ascending=False) # We should remove values greater than 250

In [None]:
if plot_histograms:

  # Vegas lines on the left, actual results on the right; spreads on top, totals below.
  x, y, z, p = (full_box_scores[column] for column in
                ('Vegas Spread', 'Real Spread', 'Vegas Total Points', 'Real Total Points'))

  plt.rcParams["figure.figsize"] = (14, 12)
  fig, axs = plt.subplots(2, 2, sharey=False, tight_layout=True)

  # (axes, data, number of bins, x-limits or None, title)
  histogram_panels = [
      (axs[0][0], x, 50, (0, 25), "Distribution of Vegas Point Spread"),
      (axs[0][1], y, 60, None, "Distribution of Actual Point Spread"),
      (axs[1][0], z, 1000, (140, 280), "Distribution of Vegas Total Points"),
      (axs[1][1], p, 100, None, "Distribution of Actual Total Points"),
  ]

  for panel_axes, panel_data, panel_bins, panel_xlim, panel_title in histogram_panels:
    panel_axes.hist(panel_data, bins=panel_bins)
    if panel_xlim is not None:
      panel_axes.set_xlim(*panel_xlim)
    panel_axes.set_title(panel_title, size=16)

  plt.show()

In [None]:
if plot_histograms:

  # Single histogram of how far the actual spread landed from the Vegas line
  plt.rcParams["figure.figsize"] = (12,8)

  spread_difference = full_box_scores['Spread Difference']
  plt.hist(spread_difference, bins=100)
  plt.title(
      "Distribution of Differences Between Actual Point Spreads and Vegas Predictions ",
      size=16,
  )
  plt.show()

In [None]:
if plot_histograms:

  plt.rcParams["figure.figsize"] = (8,6)

  n, bins, patches = plt.hist(full_box_scores['Spread Difference'], bins =100, color = 'darkgray')

  # Highlight both tails of the distribution: bins 0-45 and 61-99 of the 100 bins
  for i in (*range(0, 46), *range(61, 100)):
    patches[i].set_fc('orange')

  # 'Spread Difference' is computed as Real Spread minus Vegas Spread, so the axis label
  # states the operands in that order.
  plt.xlabel("Actual Point Spread Minus the Vegas Predicted Spread", size=12)
  plt.title("How Close is the Predicted Spread to the Actual Point Spread?", size=14)
  plt.show()

In [None]:
if plot_histograms:

  z = full_box_scores['Spread Difference']

  plt.rcParams["figure.figsize"] = (5, 10)
  fig, axs = plt.subplots(3, 1, sharey=False, tight_layout=True)

  # One panel per threshold: (points shown in the title, end of the left highlighted run,
  # start of the right highlighted run). Bin indices refer to the 100-bin histogram.
  threshold_panels = [(6, 45, 59), (4, 47, 57), (2, 50, 54)]

  for panel_axes, (points, left_end, right_start) in zip(axs, threshold_panels):
    n, bins, patches = panel_axes.hist(z, bins=100)
    panel_axes.set_title("Games Labeled to Bet with {} Point Threshold".format(points), size=14)

    highlighted_bins = list(range(0, left_end)) + list(range(right_start, 100))
    for i in highlighted_bins:
      patches[i].set_fc('orange')

  axs[2].set_xlabel("Difference Between Predicted and Actual Spread", size =12)

  plt.show()

In [None]:
# Who won each game outright
print("Frequencies of real game outcomes:")
print("----------------------------------------")
print(full_box_scores['Game Outcome'].value_counts())
print("\n")

# Which side covered the spread ('Status' column); this section previously reused the
# heading of the section above even though it reports a different column.
print("Frequencies of spread outcomes (who covered):")
print("----------------------------------------")
print(full_box_scores['Status'].value_counts())
print("\n")

# How often does the labelling rule indicate we should bet?
print("The outcome of our calculated decisions:")
print("----------------------------------------")
print(full_box_scores.Decision.value_counts())

# EXTRACT YEARLY SEASON STATS FOR TEAMS

### Merge each team's season statistics into the game outcomes and odds data

In [None]:
if create_new_threshold:

  game_outcomes = full_box_scores

  seasons = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']

  # For each supported analysis: the ESPN page to scrape and the label used in the export file name
  stats_sources = {
      'offensive': ('https://www.espn.com/nba/stats/team/_/season/{}/seasontype/2', 'Offensive'),
      'defensive': ('https://www.espn.com/nba/stats/team/_/view/opponent/season/{}/seasontype/2', 'Defensive'),
      'differential': ('https://www.espn.com/nba/stats/team/_/view/differential/season/{}/seasontype/2', 'Differentials'),
  }
  if analysis not in stats_sources:
    raise ValueError("analysis must be one of {}, got {!r}".format(sorted(stats_sources), analysis))
  stats_url, export_label = stats_sources[analysis]

  # The page is read as two tables (team names, then the statistics); they are stitched
  # together row by row through a temporary positional key.
  yearly_frames = []
  for season in seasons:

    all_team_data = pd.read_html(stats_url.format(season), header=0)
    teams, team_data = all_team_data[0], all_team_data[1]

    teams['row_id'] = np.arange(teams.shape[0])
    team_data['row_id'] = np.arange(team_data.shape[0])

    yearly_stats = pd.merge(teams, team_data, on='row_id', how='outer')
    yearly_stats['Season'] = season
    yearly_frames.append(yearly_stats)

  # Concatenate once (DataFrame.append no longer exists in pandas 2.x) and drop the helper columns
  full_team_statistics = pd.concat(yearly_frames).drop(['row_id', 'RK', 'GP'], axis=1)

  # Update the names of the teams. Needed for future merging with other data files:
  # the two Los Angeles teams get fixed names, every other team loses its last word,
  # and remaining spaces are removed.
  full_team_statistics = full_team_statistics.replace('Los Angeles Lakers', 'LALakers')
  full_team_statistics = full_team_statistics.replace('LA Clippers', 'LAClippers')
  # n must be passed by keyword: pandas 2.x no longer accepts it positionally
  full_team_statistics['Team'] = full_team_statistics['Team'].str.rsplit(' ', n=1).str[0]
  full_team_statistics['Team'] = full_team_statistics['Team'].str.replace(' ', '')

  full_team_statistics['Season'] = full_team_statistics['Season'].astype(int)

  ## Attach the season statistics of the favorite (suffix _F), then of the underdog (suffix _U)
  stat_columns = ['PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%',
                  'OR', 'DR', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF']

  combined = game_outcomes
  for team_column, suffix in (('Favorite_Team', '_F'), ('Underdog_Team', '_U')):
    combined = pd.merge(combined, full_team_statistics, how='inner',
                        left_on=['Season', team_column], right_on=['Season', 'Team'])
    combined = combined.rename(columns={stat: stat + suffix for stat in stat_columns})
    combined = combined.drop(['Team'], axis=1)

  # Export to excel based on the statistics retrieved
  combined.to_excel('SML_Term_Project/NBA_Data/Data_Files/Outcomes_Bets_Team_{}_+{}{}.xlsx'.format(export_label,
                                                                                                  favorite_threshold,
                                                                                                  underdog_threshold), index = False)

# PREPARE THE DATA

## Run a Lasso Regression to determine which features to keep

In [None]:
# Lasso CV
# Fit a Lasso regression of 'Spread Difference' on the standardized features and print the
# coefficients whose magnitude exceeds 0.01, as a guide to which features to keep.

df = combined
df = df.drop_duplicates()

df = df.drop(['Season', 'Date', 'Favorite_Team', 'Underdog_Team',
              'Favorite_Score', 'Underdog_Score', 'Favorite_ML', 'Underdog_ML',
              'Vegas Total Points', 'Real Total Points', 'Real Spread',
              'Game Outcome', 'Status', 'Decision'], axis=1)

df = df.rename(columns={"Favorite Home?": "Home_F"})

x = df.drop(['Spread Difference'],axis=1)
y = df['Spread Difference']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                    random_state=42)

scaled_features = ['Vegas Spread', 'PTS_F', 'FGM_F', 'FGA_F', 'FG%_F',
       '3PM_F', '3PA_F', '3P%_F', 'FTM_F', 'FTA_F', 'FT%_F', 'OR_F', 'DR_F',
       'REB_F', 'AST_F', 'STL_F', 'BLK_F', 'TO_F', 'PF_F', 'PTS_U', 'FGM_U',
       'FGA_U', 'FG%_U', '3PM_U', '3PA_U', '3P%_U', 'FTM_U', 'FTA_U', 'FT%_U',
       'OR_U', 'DR_U', 'REB_U', 'AST_U', 'STL_U', 'BLK_U', 'TO_U', 'PF_U']

ct = ColumnTransformer([
        ('somename', StandardScaler(), scaled_features)
    ], remainder='passthrough')


x_scaled = ct.fit_transform(x_train)
x_test_scaled = ct.transform(x_test)

# ColumnTransformer returns the transformed columns first and appends the passthrough
# columns on the right, which is generally NOT the column order of x_train. The labels are
# rebuilt in that output order so every coefficient is reported under its own feature name.
# NOTE(review): feature subsets chosen from output printed before this fix should be re-checked.
transformed_columns = scaled_features + [column for column in x_train.columns
                                         if column not in scaled_features]

x_scaled = pd.DataFrame(x_scaled, columns = transformed_columns)
x_test_scaled =  pd.DataFrame(x_test_scaled, columns = transformed_columns)

from sklearn import linear_model

clf = linear_model.Lasso(alpha=0.1).fit(x_scaled, y_train)

# Format the number itself; applying a precision to str(coef) would only truncate the text
for feature_name, coefficient in zip(transformed_columns, clf.coef_):
  if abs(coefficient) > 0.01:
    print("{} = {:.4f} ".format(feature_name, coefficient))

print("\nLasso Intercept: {:.4}".format(clf.intercept_))

### From Lasso - 
# df = df[['Spread Difference', 'Vegas Spread', 'Home_F', '3PA_F', 'OR_F', 'DR_F', 'PF_F', 
#         'DR_U', 'STL_U', 'PF_U']]


# df.to_excel('SML_Term_Project/NBA_Data/Data_Files/Lasso_Dataframe.xlsx', index = False)

In [None]:
# Start again from the full merged table; the Lasso step worked on a reduced copy of it.
df = combined

# Identification, money-line and label columns kept aside; they are joined back onto the
# test predictions when the betting results are computed.
betting_reference_columns = ['Season', 'Date', 'Favorite_Team', 'Underdog_Team',
                             'Favorite_ML', 'Underdog_ML', 'Status', 'Decision']
df_copy = df[betting_reference_columns]

In [None]:
# Remove identifiers, scores, money lines and outcome columns so that only the
# 'Decision' label and the candidate features remain.
non_feature_columns = [
    'Season', 'Date', 'Favorite_Team', 'Underdog_Team',
    'Favorite_Score', 'Underdog_Score', 'Favorite_ML', 'Underdog_ML',
    'Vegas Total Points', 'Real Total Points', 'Real Spread',
    'Spread Difference', 'Game Outcome', 'Status',
]
df = df.drop(non_feature_columns, axis=1)

In [None]:
# Fix the column order: label first, then the spread, the favorite's statistics (_F),
# the underdog's statistics (_U), and the home indicator last.
team_stat_names = ['PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%',
                   'OR', 'DR', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF']
ordered_columns = (['Decision', 'Vegas Spread']
                   + [stat + '_F' for stat in team_stat_names]
                   + [stat + '_U' for stat in team_stat_names]
                   + ['Favorite Home?'])

df = df[ordered_columns].rename(columns={"Favorite Home?": "Home_F"})

## Label the Data

In [None]:
# Encode the three decision labels as integer classes
decision_codes = {"Don't Bet": 0, 'Bet Underdog': 1, 'Bet Favorite': 2}
df = df.replace(decision_codes)

### Split the Data: Train Test Split

In [None]:
# Target is the encoded 'Decision'; every remaining column is a feature.
y = df['Decision']
x = df.drop(columns=['Decision'])

# Hold out 25% of the games for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.25,
    random_state=42,
)

### Standardize the features (except for the binary feature "Favorite Home?")

In [None]:
# Columns to standardize: the spread plus every favorite (_F) and underdog (_U) statistic.
# Anything not listed (the binary home indicator) is passed through unscaled.
stat_names_to_scale = ['PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%',
                       'OR', 'DR', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF']
columns_to_scale = (['Vegas Spread']
                    + [stat + '_F' for stat in stat_names_to_scale]
                    + [stat + '_U' for stat in stat_names_to_scale])

ct = ColumnTransformer(
    [('somename', StandardScaler(), columns_to_scale)],
    remainder='passthrough',
)

# Fit the scaler on the training split only, then apply the same scaling to the test split
x_scaled = ct.fit_transform(x_train)
x_test_scaled = ct.transform(x_test)


In [None]:
# Wrap the scaled arrays in dataframes again so columns can be selected by name
feature_labels = x_train.columns
x_scaled = pd.DataFrame(x_scaled, columns=feature_labels)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=feature_labels)

# Keep only the features retained from the Lasso regression, for both splits
lasso_selected_features = ['Vegas Spread', 'Home_F', '3PA_F', 'OR_F', 'DR_F', 'PF_F',
                           'DR_U', 'STL_U', 'PF_U']

x_scaled = x_scaled[lasso_selected_features]
x_test_scaled = x_test_scaled[lasso_selected_features]

# Print/Save/Create Outputs

In [None]:
# Scoring metrics passed to cross_validate; print_and_save_output reads the matching
# 'test_<metric>' entries of the returned results.
cv_scoring = ('balanced_accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted')


def _weighted_scores(y_true, y_pred):
  """Return (accuracy, weighted precision, weighted recall, weighted F1) for one set of predictions."""
  return (accuracy_score(y_true, y_pred, normalize=True),
          metrics.precision_score(y_true, y_pred, average = 'weighted'),
          metrics.recall_score(y_true, y_pred, average = 'weighted'),
          metrics.f1_score(y_true, y_pred, average = 'weighted'))


def _cv_mean(cv_results, key):
  """Return the mean of one cross-validation score array, or NaN when no CV results were supplied."""
  if cv_results is None:
    return np.nan
  return cv_results[key].mean()


def _excel_cell(value):
  """Convert a dataframe value to something openpyxl can store (native scalar, NaN as empty cell)."""
  if hasattr(value, 'item'):
    value = value.item()
  if isinstance(value, float) and value != value:  # NaN has no Excel representation
    return None
  return value


def _append_rows_to_workbook(frame, workbook_path, sheet_name='Sheet1'):
  """Append the rows of `frame` (without a header) below the existing rows of the workbook sheet."""
  workbook = load_workbook(workbook_path)
  if sheet_name in workbook.sheetnames:
    sheet = workbook[sheet_name]
  else:
    sheet = workbook.create_sheet(sheet_name)
  for row in frame.itertuples(index=False, name=None):
    sheet.append([_excel_cell(value) for value in row])
  workbook.save(workbook_path)


def print_and_save_output(which_samples, model_type, x_train, y_train_true, y_train_pred, x_test, y_test_true, y_test_pred, cv_results = None):
  """Print a model's metrics, simulate betting on its predictions and log one row to the results workbook.

  Parameters
  ----------
  which_samples : str
      When equal to 'Test', the betting table is rebuilt from `x_test`, `y_test_true` and
      `y_test_pred` and stored in the global `betting_file`; otherwise the table stored by
      an earlier 'Test' call is reused.
  model_type : str
      Label written to the 'Model' / 'Model_Type' columns of the output row.
  x_train : unused; kept so existing calls keep working.
  y_train_true, y_train_pred : true and predicted labels of the training split.
  x_test : test features; only its index is used, to line predictions up with `df_copy`.
  y_test_true, y_test_pred : true and predicted labels of the test split.
  cv_results : dict or None
      Result of `cross_validate` run with `cv_scoring`. When None the CV section is not
      printed and the CV columns of the saved row are left empty.

  Also reads the module-level settings print_training_output, print_cv_output,
  print_test_output, favorite_threshold, underdog_threshold, bet_amount, analysis and the
  `df_copy` dataframe. Returns None.
  """

  training_acc, training_weighted_precision, training_weighted_recall, training_f1 = _weighted_scores(
      y_train_true, y_train_pred)
  testing_acc, testing_weighted_precision, testing_weighted_recall, testing_f1 = _weighted_scores(
      y_test_true, y_test_pred)

  if print_training_output:
    print("TRAINING RESULTS")
    print("-----------------")
    print("Model Accuracy: {:.3}".format(training_acc))
    print("Weighted recall score: {:.3}".format(training_weighted_recall))
    print("Weightedprecision score: {:.3}".format(training_weighted_precision))
    print("Weighted f1 score: {:.3}\n".format(training_f1))

  # The CV section needs the results dict; skip it instead of failing when none was passed
  if print_cv_output and cv_results is not None:
    print('\nCROSS VALIDATION RESULTS')
    print("-------------------------")
    print("CV Test Accuracy ", round(cv_results['test_balanced_accuracy'].mean(), 3))
    print("CV Test Recall Weighted ", round(cv_results['test_recall_weighted'].mean(), 3))
    print("CV Test Precision Weighted ", round(cv_results['test_precision_weighted'].mean(), 3))
    print("CV Test F1 Weighted ", round(cv_results['test_f1_weighted'].mean(), 3))

  if print_test_output:
    print("\n\nTEST RESULTS")
    print("-----------------")
    print("Model Accuracy: {:.4}".format(testing_acc))
    print("Weighted-average recall score: {:.4}".format(testing_weighted_recall))
    print("Weighted-average precision score: {:.4}".format(testing_weighted_precision))
    print("Weighted-average f1 score: {:.4}".format(testing_f1))

  # 'Train F1' holds the F1 of the fitted model on the training split, like the three
  # training columns before it (it used to store the mean CV train-fold F1 instead).
  results_to_save = [model_type, favorite_threshold, underdog_threshold, training_acc, training_weighted_precision, training_weighted_recall,
                     training_f1, _cv_mean(cv_results, 'test_balanced_accuracy'),
                     _cv_mean(cv_results, 'test_precision_weighted'), _cv_mean(cv_results, 'test_recall_weighted'),
                     _cv_mean(cv_results, 'test_f1_weighted'), testing_acc, testing_weighted_precision, testing_weighted_recall, testing_f1]

  output_columns = ['Model', 'Favorite Threshold', 'Underdog Threshold', 'Train Accuracy', 'Train Weighted Precision', 'Train Weighted Recall',
                    'Train F1', 'CV Accuracy', 'CV Precision', 'CV Recall', 'CV F1', 'Test Acc', 'Test Precision', 'Test Recall', 'Test F1']

  model_outputs = pd.DataFrame([results_to_save], columns=output_columns)
  model_outputs['Analysis'] = analysis

  global betting_file

  ## TESTING
  if (which_samples == 'Test'):
    y_test_df = pd.DataFrame(y_test_true)
    y_test_df['Prediction'] = y_test_pred

    # Joining on x_test keeps the original row index, which is what links each
    # prediction back to its game in df_copy
    testing_labels_preds = x_test.join(y_test_df)
    testing_labels_preds = testing_labels_preds[['Decision', 'Prediction']]
    testing_labels_preds = testing_labels_preds.sort_index()

    # Rows outside the test split have no prediction and are removed by dropna
    betting_file_TESTING_DATA = df_copy.join(testing_labels_preds, lsuffix='_l', rsuffix='_r')
    betting_file_TESTING_DATA = betting_file_TESTING_DATA.dropna()
    betting_file_TESTING_DATA['Model'] = model_type
    betting_file = betting_file_TESTING_DATA

  betting_data = betting_file
  betting_data = betting_data.drop(['Date'], axis=1)
  test_set_games = len(betting_data)

  # Keep only the games the model chose to bet on (class 0 is "Don't Bet")
  betting_data = betting_data[betting_data.Prediction != 0.0].copy()
  betting_data['Prediction'] = betting_data['Prediction'].replace({1.0: 'Underdog covered',
                                                                   2.0: 'Favorite covered'})

  # Money-line return on a winning bet, stake included
  betting_data['Favorite_Payout'] = round(bet_amount/(betting_data['Favorite_ML']/-100) + bet_amount, 2)
  betting_data['Underdog_Payout'] = round(bet_amount*(betting_data['Underdog_ML']/100) + bet_amount, 2)

  prediction_correct = betting_data.Status == betting_data.Prediction
  underdog_hit = (betting_data.Status == 'Underdog covered') & prediction_correct
  favorite_hit = (betting_data.Status == 'Favorite covered') & prediction_correct

  betting_data['Underdog_My_Payout'] = np.where(underdog_hit, betting_data['Underdog_Payout'], -bet_amount)
  betting_data['Favorite_My_Payout'] = np.where(favorite_hit, betting_data['Favorite_Payout'], -bet_amount)

  # Winning favorite bet -> favorite payout; otherwise the underdog column, which already
  # holds either the underdog payout or the lost stake. Selecting on the hit mask rather than
  # on "payout > -9" keeps this correct for any bet_amount.
  betting_data['My_Payout'] = np.where(favorite_hit, betting_data['Favorite_My_Payout'],
                                       betting_data['Underdog_My_Payout'])

  betting_columns = ['Model_Type', 'Num_Games_Bet', 'Payout', '% Games Bet On', 'Avg $ Lost per Bet', 'Games in Test Set']
  num_games_bet = len(betting_data)
  total_payout = betting_data['My_Payout'].sum()
  # A model may place no bets at all; report NaN instead of dividing by zero
  percent_bet = num_games_bet/test_set_games*100 if test_set_games else np.nan
  avg_lost = total_payout/num_games_bet if num_games_bet else np.nan
  betting_info = [model_type, num_games_bet, total_payout, percent_bet, avg_lost, test_set_games]

  bet_df = pd.DataFrame([betting_info], columns = betting_columns)

  model_outputs = model_outputs.join(bet_df)
  model_outputs['Date-Time'] = datetime.now().strftime("%d/%m/%Y %H:%M")
  # NOTE(review): winning payouts include the returned stake while losing bets are already
  # recorded as -bet_amount, so subtracting the stake again below appears to count each
  # loss twice - confirm the intended definition before relying on these two columns.
  model_outputs['Avg. Net Profit per Bet'] = (model_outputs['Avg $ Lost per Bet'] - bet_amount)
  model_outputs['Net Payout'] = (model_outputs['Payout'] - model_outputs['Num_Games_Bet']*bet_amount)

  ## Export Data to Excel Files: append to the log when it exists, otherwise create it with a header.
  output_path = "SML_Term_Project/NBA_Data/Model_Outputs/Spread_Model_Outputs_Copy.xlsx"
  if path.exists(output_path):
    # Appending through openpyxl directly avoids ExcelWriter internals (assigning
    # writer.book / writer.sheets) that newer pandas versions no longer allow
    _append_rows_to_workbook(model_outputs, output_path)
  else:
    # The context manager saves and closes the file (ExcelWriter.save() is gone in pandas 2.x)
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
      model_outputs.to_excel(writer, sheet_name='Sheet1', index=False)

# RUN THE MODELS

## Basic Logistic Regression

In [None]:
# Baseline logistic regression with default regularisation, fitted on the selected features

log_reg = LogisticRegression(solver = 'liblinear')
log_reg.fit(x_scaled, y_train)

# Predictions on the data the model was fitted on and on the held-out games
y_train_predict = log_reg.predict(x_scaled)
y_test_predict = log_reg.predict(x_test_scaled)

# 5-fold cross-validation on the training split, scored with the metrics in cv_scoring
log_reg_cv_results = cross_validate(
    log_reg, x_scaled, y_train,
    cv=5,
    return_train_score=True,
    scoring = cv_scoring,
)

print_and_save_output("Test", "Logistic Regression", x_scaled, y_train, y_train_predict, x_test, y_test, y_test_predict, log_reg_cv_results)

## Randomized CV Search for Logistic Regression

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform


if run_logistic:
  # Randomized search over the regularisation strength C (uniform on [0, 4]) and the penalty type
  log_basic = LogisticRegression(solver = 'liblinear')

  distributions = dict(C=uniform(loc=0, scale=4),
                      penalty=['l2', 'l1'])

  log_random = RandomizedSearchCV(log_basic, distributions, random_state=0).fit(x_scaled, y_train)

  # Print the result: a bare expression inside an `if` block produces no output
  print(log_random.best_params_)

## Optimal Logistic Regression

In [None]:
if run_logistic:

  # Logistic regression with fixed, tuned hyperparameters (C=3.77, L1 penalty).
  # This custom logistic does not perform any better than the generic one above.
  log_reg = LogisticRegression(solver='liblinear', C=3.77, penalty='l1')
  log_reg.fit(x_scaled, y_train)

  y_train_predict = log_reg.predict(x_scaled)
  y_test_predict = log_reg.predict(x_test_scaled)

  # 5-fold cross-validation on the training split, scored with the metrics in cv_scoring
  log_reg_cv_results = cross_validate(
      log_reg, x_scaled, y_train,
      cv=5,
      return_train_score=True,
      scoring = cv_scoring,
  )

  print_and_save_output("Test", "Logistic Regression - Optimized", x_scaled, y_train, y_train_predict, x_test, y_test, y_test_predict, log_reg_cv_results)

  # Confusion matrix on the training split, shown as raw counts
  if plot_confusions:
    class_labels = ["Don't Bet", "Bet Underdog", "Bet Favorite"]
    plot_confusion_matrix(log_reg, x_scaled, y_train, display_labels=class_labels, normalize=None, values_format="")
    plt.show()

## Random Forest Randomized Search - Optimal Hyperparameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from pprint import pprint

if run_rf_random_search:

  # ---- Search space for the randomised hyperparameter search ----
  # Ten evenly spaced forest sizes between 10 and 200 trees.
  n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
  # NOTE(review): for classifiers 'auto' behaves like 'sqrt', and recent
  # scikit-learn releases no longer accept it - confirm against the installed
  # version before upgrading.
  max_features = ['auto', 'sqrt']
  # Depths 1..10 plus None (unlimited depth).
  max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  max_depth.append(None)
  min_samples_split = [4, 10, 12]
  min_samples_leaf = [1, 2, 4]
  bootstrap = [True, False]

  random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

  # 100 sampled configurations, each scored with 3-fold CV on all cores.
  rf = RandomForestClassifier()
  rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
  rf_random.fit(x_scaled, y_train)

  # ---- Final random forest ----
  # Refit with EVERY hyperparameter the search selected. Previously only four
  # of the six were forwarded, so the searched `max_features` and `bootstrap`
  # values were silently replaced by the library defaults and the fitted model
  # was not the configuration the search had picked.
  # random_state=0 makes the refit reproducible.
  clf = RandomForestClassifier(**rf_random.best_params_, random_state=0).fit(x_scaled, y_train)

  rf_train_predictions = clf.predict(x_scaled)
  rf_test_predictions = clf.predict(x_test_scaled)

  # 5-fold cross-validation on the training data, keeping train-fold scores.
  cv_results = cross_validate(clf, x_scaled, y_train, cv=5, return_train_score= True, scoring = cv_scoring)

  print_and_save_output("Test", "RF-Optimal", x_scaled, y_train, rf_train_predictions, x_test, y_test, rf_test_predictions, cv_results)

  if plot_confusions:
    # NOTE: this confusion matrix is computed on the training data.
    plot_confusion_matrix(clf, x_scaled, y_train, display_labels=["Don't Bet", "Bet Underdog", "Bet Favorite"], normalize=None, values_format="")
    plt.show()

## Random Forest - Optimal Parameters



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate

# Random forest refit from the randomised-search result, plus a report of the
# eight most important features.
# NOTE(review): run_random_forest_3 is not among the model switches visible in
# the parameters cell at the top of the notebook - confirm it is defined, and
# that the search cell has run (this cell reads rf_random).
if run_random_forest_3:
  # Forward EVERY searched hyperparameter. Passing only four of the six let
  # `max_features` and `bootstrap` fall back to the library defaults, so the
  # fitted forest was not the configuration selected by the search.
  clf_3 = RandomForestClassifier(**rf_random.best_params_, random_state=0).fit(x_scaled, y_train)

  rf_3_train_predictions = clf_3.predict(x_scaled)
  rf_3_test_predictions = clf_3.predict(x_test_scaled)

  # 5-fold cross-validation on the training data, keeping train-fold scores.
  cv_results_3 = cross_validate(clf_3, x_scaled, y_train, cv=5, return_train_score= True, scoring = cv_scoring)

  print_and_save_output("Test", "RF3", x_scaled, y_train, rf_3_train_predictions, x_test, y_test, rf_3_test_predictions, cv_results_3)

  if plot_confusions:
    # NOTE: this confusion matrix is computed on the training data.
    plot_confusion_matrix(clf_3, x_scaled, y_train, display_labels=["Don't Bet", "Bet Underdog", "Bet Favorite"], normalize=None, values_format="")
    plt.show()

  if plot_feature_importance:

    importance = clf_3.feature_importances_
    print(importance)

    # Positions of the eight largest importances, in ascending order of
    # importance (the most important feature comes last).
    top_indeces = sorted(range(len(importance)), key=lambda i: importance[i])[-8:]

    top_vals = [importance[i] for i in top_indeces]
    top_features = [x_scaled.columns[i] for i in top_indeces]

    print(top_vals)
    print(top_features)

## Basic SVC

In [None]:
from sklearn.svm import SVC
from scipy import stats
from sklearn.model_selection import GridSearchCV 

if run_svc:

  # Baseline support-vector classifier with library-default hyperparameters.
  svc_c = SVC().fit(x_scaled, y_train)

  svc_train_predicted = svc_c.predict(x_scaled)
  svc_test_predicted = svc_c.predict(x_test_scaled)

  # 5-fold cross-validation on the training data, keeping train-fold scores.
  svc_cv = cross_validate(svc_c, x_scaled, y_train, cv=5, return_train_score= True, scoring = cv_scoring)

  # Reported as "SVC - Basic": this is the untuned model. It used to be
  # labelled "SVC - Optimal", the same label as the tuned SVC fitted later,
  # which made the two results indistinguishable in the output.
  print_and_save_output("Test", "SVC - Basic", x_scaled, y_train, svc_train_predicted, x_test, y_test, svc_test_predicted, svc_cv)

  if plot_confusions:
    # NOTE: this confusion matrix is computed on the training data.
    plot_confusion_matrix(svc_c, x_scaled, y_train, display_labels=["Don't Bet", "Bet Underdog", "Bet Favorite"], normalize=None, values_format="")
    plt.show()

## SVC Grid Search CV for Optimal Hyperparameters

In [None]:
from scipy import stats
from sklearn.model_selection import GridSearchCV 

if run_svc:

  # Exhaustive grid: the same six powers of ten are tried for both C and
  # gamma, i.e. 36 combinations, each scored with 5-fold cross-validation.
  parameters = {name: [0.001, 0.01, 0.1, 1, 10, 100] for name in ("C", "gamma")}

  search = GridSearchCV(estimator=SVC(), param_grid=parameters, cv=5)
  search.fit(x_scaled, y_train)

  # The winning combination is available afterwards as search.best_params_.

## SVC with Optimal Hyperparameters

In [None]:
if run_svc:

  # Support-vector classifier rebuilt with the C / gamma pair selected by the
  # grid search (`search`), then fitted on the full training set.
  svc_c = SVC(gamma=search.best_params_['gamma'], C=search.best_params_['C'])
  svc_c.fit(x_scaled, y_train)

  # Predictions for the held-out test set and for the training set.
  svc_test_predicted = svc_c.predict(x_test_scaled)
  svc_train_predicted = svc_c.predict(x_scaled)

  # Clear any earlier result first, then run 5-fold cross-validation on the
  # training data (train-fold scores included).
  svc_cv = None
  svc_cv = cross_validate(
      svc_c,
      x_scaled,
      y_train,
      scoring=cv_scoring,
      cv=5,
      return_train_score=True,
  )

  print_and_save_output(
      "Test", "SVC - Optimal",
      x_scaled, y_train, svc_train_predicted,
      x_test, y_test, svc_test_predicted,
      svc_cv,
  )

  if plot_confusions:
    # The matrix is drawn from the training data.
    plot_confusion_matrix(
        svc_c,
        x_scaled,
        y_train,
        display_labels=["Don't Bet", "Bet Underdog", "Bet Favorite"],
        normalize=None,
        values_format="",
    )
    plt.show()

## Extra Trees Classifier

In [None]:
# https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

from sklearn.ensemble import ExtraTreesClassifier

if run_extra_trees:

  # Extremely-randomised-trees classifier with default settings, fitted on
  # the training set.
  extra_class = ExtraTreesClassifier()
  extra_class.fit(x_scaled, y_train)

  # Predictions for the training set and for the held-out test set.
  extra_train_predicted = extra_class.predict(x_scaled)
  extra_test_predicted = extra_class.predict(x_test_scaled)

  # 5-fold cross-validation on the training data (train-fold scores included).
  extra_cv = cross_validate(
      extra_class,
      x_scaled,
      y_train,
      scoring=cv_scoring,
      cv=5,
      return_train_score=True,
  )

  print_and_save_output(
      "Test", "Extra Trees",
      x_scaled, y_train, extra_train_predicted,
      x_test, y_test, extra_test_predicted,
      extra_cv,
  )

  # Bar chart of the eight features the fitted trees rank as most important.
  if plot_feature_importance:
    feat_importances = pd.Series(data=extra_class.feature_importances_, index=x_scaled.columns)
    feat_importances.nlargest(n=8).plot.bar()
    plt.show()

## XGB Classifier

In [None]:
from xgboost import XGBClassifier

if run_xgb:

  from sklearn import preprocessing
  lbl = preprocessing.LabelEncoder()

  # Coerce every non-string column to a float dtype; values that cannot be
  # parsed become NaN (errors='coerce'). Presumably done because XGBoost
  # rejects object-typed columns - TODO confirm.
  # NOTE: this rewrites x_scaled / x_test_scaled in place, so cells run after
  # this one see the converted frames.
  cols = x_scaled.select_dtypes(exclude=['string']).columns

  x_scaled[cols] = x_scaled[cols].apply(pd.to_numeric, downcast='float', errors='coerce')
  x_test_scaled[cols] = x_test_scaled[cols].apply(pd.to_numeric, downcast='float', errors='coerce')

  # Encode Home_F as a boolean flag. The encoder is fitted on the TRAINING
  # column only and the learned mapping is then applied to the test column.
  # Re-fitting on the test column could give the same raw value a different
  # code in the two frames (e.g. when the test split contains a single value).
  # transform() raises ValueError if the test column holds a value never seen
  # in training.
  x_scaled['Home_F'] = lbl.fit_transform(x_scaled['Home_F'].astype(int)).astype(bool)
  x_test_scaled['Home_F'] = lbl.transform(x_test_scaled['Home_F'].astype(int)).astype(bool)

  xgb_classifier = XGBClassifier().fit(x_scaled, y_train)

  xgb_train_predicted = xgb_classifier.predict(x_scaled)
  xgb_test_predicted = xgb_classifier.predict(x_test_scaled)

  # 5-fold cross-validation on the training data, keeping train-fold scores.
  xgb_cv = cross_validate(xgb_classifier, x_scaled, y_train, cv=5, return_train_score= True, scoring = cv_scoring)

  print_and_save_output("Test", "XGB Boost", x_scaled, y_train, xgb_train_predicted, x_test, y_test, xgb_test_predicted, xgb_cv)

  if plot_feature_importance:
    # One bar per feature, in column order of x_scaled.
    importance = xgb_classifier.feature_importances_

    plt.bar(range(len(importance)), importance)
    plt.show()