# NBA Predictor: Basketball Advanced Regression, Kyle’s Loss Estimate Yielder (BARKLEY)

## Importing Modules

In [1781]:
import pandas as pd
pd.options.display.max_columns = None

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score,roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold,f_regression, SelectKBest
import matplotlib.pyplot as plt

## Loading Data

In [1731]:
# 2018 season will be used as validation
games2018 = pd.read_csv('data/games_2018_season.csv')
stats2018 = pd.read_csv('data/stats_2018_season.csv')

In [1669]:
df_games = pd.concat(
    [pd.read_csv('data/games_2008_season.csv'),
     pd.read_csv('data/games_2009_season.csv'),
     pd.read_csv('data/games_2010_season.csv'),
     pd.read_csv('data/games_2011_season.csv'),
     pd.read_csv('data/games_2012_season.csv'),
     pd.read_csv('data/games_2013_season.csv'),
     pd.read_csv('data/games_2014_season.csv'),
     pd.read_csv('data/games_2015_season.csv'),
     pd.read_csv('data/games_2016_season.csv'),
     pd.read_csv('data/games_2017_season.csv')]
)

In [1670]:
df_stats = pd.concat(
    [pd.read_csv('data/stats_2008_season.csv'),
     pd.read_csv('data/stats_2009_season.csv'),
     pd.read_csv('data/stats_2010_season.csv'),
     pd.read_csv('data/stats_2011_season.csv'),
     pd.read_csv('data/stats_2012_season.csv'),
     pd.read_csv('data/stats_2013_season.csv'),
     pd.read_csv('data/stats_2014_season.csv'),
     pd.read_csv('data/stats_2015_season.csv'),
     pd.read_csv('data/stats_2016_season.csv'),
     pd.read_csv('data/stats_2017_season.csv')]
)

I retrieved both games and stats from the Ball Don't Lie API, however I'll be working mainly with df_stats as it has much more info.

In [1050]:
df_games.shape

(12893, 23)

In [1051]:
df_games.columns

Index(['id', 'date', 'home_team_score', 'period', 'postseason', 'season',
       'status', 'time', 'visitor_team_score', 'home_team.id',
       'home_team.abbreviation', 'home_team.city', 'home_team.conference',
       'home_team.division', 'home_team.full_name', 'home_team.name',
       'visitor_team.id', 'visitor_team.abbreviation', 'visitor_team.city',
       'visitor_team.conference', 'visitor_team.division',
       'visitor_team.full_name', 'visitor_team.name'],
      dtype='object')

In [239]:
games_columns = ['id', 'date', 'home_team_score', 'postseason', 'season', 'visitor_team_score', 'home_team.id',
       'home_team.abbreviation', 'home_team.conference',
       'home_team.division',
       'visitor_team.id', 'visitor_team.abbreviation',
       'visitor_team.conference', 'visitor_team.division']

In [660]:
#filter only desired columns and regular season games, and sort by date
df_games = df_games[df_games['postseason'] == False].filter(games_columns).sort_values('date')

In [924]:
df_games.head()

Unnamed: 0,id,date,home_team_score,postseason,season,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division
91,21472,2008-10-28T00:00:00.000Z,90,False,2008,85,2,BOS,East,Atlantic,6,CLE,East,Central
124,21556,2008-10-28T00:00:00.000Z,108,False,2008,95,5,CHI,East,Central,17,MIL,East,Central
92,21473,2008-10-28T00:00:00.000Z,96,False,2008,76,14,LAL,West,Pacific,25,POR,West,Northwest
95,21476,2008-10-29T00:00:00.000Z,98,False,2008,103,27,SAS,West,Southwest,24,PHX,West,Pacific
94,21475,2008-10-29T00:00:00.000Z,100,False,2008,94,9,DET,East,Central,12,IND,East,Central


In [934]:
df_stats.head()

Unnamed: 0,id,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,min,oreb,pf,pts,reb,stl,turnover,game.id,game.date,game.home_team_id,game.home_team_score,game.period,game.postseason,game.season,game.status,game.time,game.visitor_team_id,game.visitor_team_score,player.id,player.first_name,player.height_feet,player.height_inches,player.last_name,player.position,player.team_id,player.weight_pounds,team.id,team.abbreviation,team.city,team.conference,team.division,team.full_name,team.name
0,581359,3.0,0.0,2.0,0.0,0.0,0.0,0.75,4.0,3.0,0.5,2.0,1.0,21:02,0.0,4.0,7.0,2.0,0.0,2.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,735,Shaquille,,,O'Neal,,22,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
1,581358,1.0,1.0,14.0,0.0,0.0,0.0,0.375,8.0,3.0,0.75,4.0,3.0,29:08:00,1.0,3.0,9.0,15.0,2.0,3.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,971,Kevin,,,Garnett,,18,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
2,581360,4.0,0.0,2.0,0.0,5.0,0.0,0.308,13.0,4.0,0.8,5.0,4.0,35:26:00,2.0,3.0,12.0,4.0,0.0,1.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1036,Ray,,,Allen,G,17,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
3,581364,0.0,0.0,2.0,0.0,0.0,0.0,0.5,2.0,1.0,0.0,0.0,0.0,12:22,0.0,6.0,2.0,2.0,0.0,3.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1059,Jermaine,,,O'Neal,,25,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
4,581372,6.0,0.0,2.0,0.5,4.0,2.0,0.333,12.0,4.0,0.0,2.0,0.0,34:54:00,0.0,1.0,10.0,2.0,0.0,2.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1139,Anthony,,,Parker,,23,,6,CLE,Cleveland,East,Central,Cleveland Cavaliers,Cavaliers


In [935]:
df_stats.columns

Index(['id', 'ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga',
       'fgm', 'ft_pct', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl',
       'turnover', 'game.id', 'game.date', 'game.home_team_id',
       'game.home_team_score', 'game.period', 'game.postseason', 'game.season',
       'game.status', 'game.time', 'game.visitor_team_id',
       'game.visitor_team_score', 'player.id', 'player.first_name',
       'player.height_feet', 'player.height_inches', 'player.last_name',
       'player.position', 'player.team_id', 'player.weight_pounds', 'team.id',
       'team.abbreviation', 'team.city', 'team.conference', 'team.division',
       'team.full_name', 'team.name'],
      dtype='object')

## Feature Engineering

In [931]:
df_stats

Unnamed: 0,game.season,game.date,game.id,home,ast,blk,dreb,fg3a,fg3m,fga,fgm,fta,ftm,oreb,pf,pts,reb,stl,turnover,opp.id,team.id_copy
0,2010,2010-10-27T00:00:00.000Z,24362,1,24.0,1.0,32.0,12.0,3.0,72.0,34.0,21.0,16.0,6.0,24.0,87.0,38.0,8.0,19.0,6,2
1,2010,2010-10-27T00:00:00.000Z,24362,0,24.0,4.0,29.0,20.0,6.0,81.0,36.0,21.0,17.0,8.0,20.0,95.0,37.0,5.0,14.0,2,6
2,2010,2010-10-27T00:00:00.000Z,24363,1,20.0,9.0,35.0,16.0,8.0,80.0,42.0,31.0,27.0,9.0,20.0,119.0,44.0,7.0,15.0,15,1
3,2010,2010-10-27T00:00:00.000Z,24363,0,13.0,4.0,30.0,9.0,4.0,86.0,40.0,25.0,20.0,9.0,26.0,104.0,39.0,7.0,14.0,1,15
4,2010,2010-10-27T00:00:00.000Z,24364,1,21.0,6.0,36.0,14.0,2.0,93.0,40.0,22.0,13.0,15.0,28.0,95.0,51.0,6.0,14.0,21,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18719,2017,2018-04-11T00:00:00.000Z,34959,1,22.0,1.0,26.0,24.0,8.0,92.0,37.0,19.0,16.0,12.0,17.0,98.0,38.0,9.0,17.0,19,27
18720,2017,2018-04-11T00:00:00.000Z,34960,1,11.0,6.0,32.0,46.0,13.0,82.0,31.0,12.0,8.0,8.0,24.0,83.0,40.0,2.0,14.0,26,11
18721,2017,2018-04-11T00:00:00.000Z,34960,0,22.0,3.0,42.0,26.0,7.0,80.0,38.0,20.0,13.0,6.0,14.0,96.0,48.0,6.0,11.0,11,26
18722,2017,2018-04-11T00:00:00.000Z,35631,0,26.0,7.0,37.0,26.0,11.0,106.0,46.0,20.0,13.0,19.0,18.0,116.0,56.0,8.0,9.0,28,16


In [942]:
# function to group all stats by game

def group_stats_by_game(df):
    #remove postseason games
    df = df[df['game.postseason'] == False]
    
    #list of columns to keep
    stats_columns_to_keep = ['game.season','game.date','game.id','team.id','game.home_team_id','game.visitor_team_id','ast', 'blk', 'dreb', 'fg3a', 'fg3m', 'fga',
       'fgm', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl', 'turnover']
    
    #keep only specified columns and sort by date
    df = df[stats_columns_to_keep].sort_values('game.date')
    
    # new field indicating whether team is at home
    df['home'] = np.where(df['team.id'] == df['game.home_team_id'], 1, 0)
    
    # group all stats by game/team
    df = df.groupby(['game.season','game.date','game.id','team.id','game.home_team_id','game.visitor_team_id','home']).sum().reset_index()
    
    # create column to indicate opponent id
    df['opp.id'] = np.where(df['team.id'] == df['game.home_team_id'], df['game.visitor_team_id'], df['game.home_team_id'])
    df.drop(['game.home_team_id','game.visitor_team_id'], axis=1,inplace=True)
    
    return df

In [943]:
#function that creates dataframe with opponent stats for each team
def opponent_stats(df):
    df['team.id_copy'] = df['team.id']
    df['home'] = np.where(df['home'] == 1, 0, 1)
    df.drop(['team.id'], axis=1,inplace=True)
    df.rename(columns={'opp.id':'team.id','team.id_copy':'opp.id','ast':'opp_ast', 'blk':'opp_blk',
       'dreb':'opp_dreb', 'fg3a':'opp_fg3a', 'fg3m':'opp_fg3m', 'fga':'opp_fga', 'fgm':'opp_fgm', 'fta':'opp_fta', 'ftm':'opp_ftm', 'oreb':'opp_oreb', 'pf':'opp_pf', 'pts':'opp_pts',
       'reb':'opp_reb', 'stl':'opp_stl', 'turnover':'opp_turnover'},inplace=True)

    return df

In [973]:
#function that creates a dataframe with opponent stats grouped by game
def prepare_stats_df(df):
    df_team = group_stats_by_game(df)
    df_opponent = opponent_stats(group_stats_by_game(df))
    #merge df and df_opponents
    df = pd.merge(df_team, df_opponent, how='outer', on=['game.season', 'game.date','game.id','team.id','opp.id','home'])
    return df

In [1058]:
#creates columns with average team and opponent point totals
def create_average_scoring(df):
    #team stats
    # group by season and team and calculates a rolling mean of last x number of games with period of 1. Shift() to exclude the current row to only look at past rows
    df['score_avg_season'] = df.groupby(['game.season', 'team.id'])['pts'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # last 10 games
    df['score_avg_l10'] = df.groupby(['game.season', 'team.id'])['pts'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # average score for each team for the season by its home/away status(window size 41)
    df['pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['pts'].transform(lambda x: x.rolling(41, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['score_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['pts'].transform(lambda x: x.rolling(8, 1).mean().shift())
    
    #opponent stats
    df['opp_score_avg_season'] = df.groupby(['game.season', 'team.id'])['opp_pts'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # last 10 games
    df['opp_score_avg_l10'] = df.groupby(['game.season', 'team.id'])['opp_pts'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # average score for each team for the season by its home/away status(window size 41)
    df['opp_pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['opp_pts'].transform(lambda x: x.rolling(41, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['opp_score_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['opp_pts'].transform(lambda x: x.rolling(8, 1).mean().shift())
    
    return df

In [1059]:
def create_point_diff(df):
    # calculate point differential for the current game
    df['pt_diff'] = df['pts'] - df['opp_pts']

    # calculate average point differential for each team for the season (window size 82)
    df['pt_diff_avg_season'] = df.groupby(['game.season', 'team.id'])['pt_diff'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # calculate average point differential for each team for the season by its home/away status(window size 41)
    df['pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['pt_diff'].transform(lambda x: x.rolling(41, 1).mean().shift())

    #avg diff over the last 10 games (same season)
    df['pt_diff_avg_l10'] = df.groupby(['game.season', 'team.id'])['pt_diff'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['pt_diff_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['pt_diff'].transform(lambda x: x.rolling(8, 1).mean().shift())

    df.drop(['pt_diff'], axis=1, inplace=True)
    
    return df

In [1060]:
def create_efg_pct(df):

    #Effective field goal percentage = (fgm + (.5 * 3pm)) / fga
    df['fgm_total'] = df.groupby(['game.season', 'team.id'])['fgm'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['fg3m_total'] = df.groupby(['game.season', 'team.id'])['fg3m'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['fga_total'] = df.groupby(['game.season', 'team.id'])['fga'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['efg_pct'] = (df['fgm_total'] + (0.5 * df['fg3m_total'])) / df['fga_total']
    df.drop(['fgm_total','fg3m_total','fga_total'], axis=1, inplace=True)

    df['opp_fgm_total'] = df.groupby(['game.season', 'team.id'])['opp_fgm'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_fg3m_total'] = df.groupby(['game.season', 'team.id'])['opp_fg3m'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_fga_total'] = df.groupby(['game.season', 'team.id'])['opp_fga'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_efg_pct'] = (df['opp_fgm_total'] + (0.5 * df['opp_fg3m_total'])) / df['opp_fga_total']
    df.drop(['opp_fgm_total','opp_fg3m_total','opp_fga_total'], axis=1, inplace=True)
    
    return df


In [1228]:
def create_avg_boxscr_stats(df):
    df['fg3m_avg'] = df.groupby(['game.season', 'team.id'])['fg3m'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_fg3m_avg'] = df.groupby(['game.season', 'team.id'])['opp_fg3m'].transform(lambda x: x.rolling(10,1).mean().shift())

    df['ftm_avg'] = df.groupby(['game.season', 'team.id'])['ftm'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_ftm_avg'] = df.groupby(['game.season', 'team.id'])['opp_ftm'].transform(lambda x: x.rolling(10,1).mean().shift())

    df['ast_avg'] = df.groupby(['game.season', 'team.id'])['ast'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_ast_avg'] = df.groupby(['game.season', 'team.id'])['opp_ast'].transform(lambda x: x.rolling(10,1).mean().shift())

    df['turnover_avg'] = df.groupby(['game.season', 'team.id'])['turnover'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_turnover_avg'] = df.groupby(['game.season', 'team.id'])['opp_turnover'].transform(lambda x: x.rolling(10,1).mean().shift())

    df['reb_avg'] = df.groupby(['game.season', 'team.id'])['reb'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_reb_avg'] = df.groupby(['game.season', 'team.id'])['opp_reb'].transform(lambda x: x.rolling(10,1).mean().shift())

    df['oreb_avg'] = df.groupby(['game.season', 'team.id'])['oreb'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_reb_avg'] = df.groupby(['game.season', 'team.id'])['opp_oreb'].transform(lambda x: x.rolling(10,1).mean().shift())

    df['stl_avg'] = df.groupby(['game.season', 'team.id'])['stl'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_stl_avg'] = df.groupby(['game.season', 'team.id'])['opp_stl'].transform(lambda x: x.rolling(10,1).mean().shift())

    df['blk_avg'] = df.groupby(['game.season', 'team.id'])['blk'].transform(lambda x: x.rolling(10,1).mean().shift())
    df['opp_blk_avg'] = df.groupby(['game.season', 'team.id'])['opp_blk'].transform(lambda x: x.rolling(10,1).mean().shift())

    return df

In [1062]:
def feat_engineering(df):
    df = create_average_scoring(df)
    df = create_point_diff(df)
    df = create_efg_pct(df)
    df = create_avg_boxscr_stats(df)
    
    #drop some of the raw non-aggregated stats
    df.drop(['ast', 'blk', 'dreb', 'fg3a', 'fg3m', 'fga', 'fgm', 'fta', 'ftm', 'oreb', 'pf', 'reb', 'stl', 'turnover',
               'opp_ast', 'opp_blk', 'opp_dreb', 'opp.id','opp_fg3a', 'opp_fg3m', 'opp_fga', 'opp_fgm', 'opp_fta', 'opp_ftm', 'opp_oreb', 'opp_pf', 'opp_reb', 'opp_stl', 'opp_turnover','opp_pts'],
              axis=1, inplace=True)
    return df

In [1063]:
# puts the stats for each team in a game onto the same row
def combine_games(df):
    df_home = df[df['home'] == 1]
    df_away = df[df['home'] == 0]
    df = pd.merge(df_home, df_away, how='outer', on=['game.season', 'game.date','game.id'])
    return df

In [1064]:
def preprocess_df(df):
    df = prepare_stats_df(df)
    df = feat_engineering(df)
    df = combine_games(df)
    # winner column. 1 is when home team wins
    df['winner'] = np.where(df['pts_x'] > df['pts_y'], 1, 0)
    return df

Apply the above functions to prepare the stats df and the validation dataset

In [1675]:
df_stats_prepped = preprocess_df(df_stats)
df_val = preprocess_df(stats2018)

## Feature Selection

In [1814]:
columns_to_drop_clfX = ['game.season','game.date','game.id','team.id_x','home_x','pts_x','team.id_y','home_y','pts_y','winner']
columns_to_drop_regrX = ['game.season','game.date','game.id','team.id_x','home_x','team.id_y','home_y','winner']

df_stats_prepped.dropna(axis=0, inplace=True)
y = df_stats_prepped['winner']
df_X = df_stats_prepped.drop(columns_to_drop_clfX, axis =1)

val_X = df_val.drop(columns_to_drop_clfX, axis=1)
val_X = val_X.apply(lambda x: x.fillna(x.mean()),axis=0)

df_X.dropna(axis=0, inplace=True)

In [1815]:
val_X = df_val.drop(columns_to_drop_clfX, axis=1)
val_X = val_X.apply(lambda x: x.fillna(x.mean()),axis=0)
val_y = df_val['winner']

In [1816]:
val_X

Unnamed: 0,score_avg_season_x,score_avg_l10_x,pt_diff_avg_home_x,score_avg_h2hl8_x,opp_score_avg_season_x,opp_score_avg_l10_x,opp_pt_diff_avg_home_x,opp_score_avg_h2hl8_x,pt_diff_avg_season_x,pt_diff_avg_l10_x,pt_diff_avg_h2hl8_x,efg_pct_x,opp_efg_pct_x,fg3m_avg_x,opp_fg3m_avg_x,ftm_avg_x,opp_ftm_avg_x,ast_avg_x,opp_ast_avg_x,turnover_avg_x,opp_turnover_avg_x,reb_avg_x,opp_reb_avg_x,oreb_avg_x,stl_avg_x,opp_stl_avg_x,blk_avg_x,opp_blk_avg_x,score_avg_season_y,score_avg_l10_y,pt_diff_avg_home_y,score_avg_h2hl8_y,opp_score_avg_season_y,opp_score_avg_l10_y,opp_pt_diff_avg_home_y,opp_score_avg_h2hl8_y,pt_diff_avg_season_y,pt_diff_avg_l10_y,pt_diff_avg_h2hl8_y,efg_pct_y,opp_efg_pct_y,fg3m_avg_y,opp_fg3m_avg_y,ftm_avg_y,opp_ftm_avg_y,ast_avg_y,opp_ast_avg_y,turnover_avg_y,opp_turnover_avg_y,reb_avg_y,opp_reb_avg_y,oreb_avg_y,stl_avg_y,opp_stl_avg_y,blk_avg_y,opp_blk_avg_y
0,110.714475,111.000549,2.863388,109.958071,110.803359,111.153215,109.370002,111.117191,-0.088884,-0.152667,-1.159119,0.520975,0.521747,11.288164,11.287688,17.861697,17.822141,24.434994,24.485238,13.656271,13.616294,45.162752,10.358863,10.37066,7.624199,7.626149,4.984673,5.005519,110.744857,111.195023,-2.925914,111.117191,110.654779,111.047181,112.219095,109.958071,0.090079,0.147842,1.159119,0.521475,0.521175,11.282728,11.280612,17.809874,17.857247,24.485557,24.443939,13.604756,13.65047,45.111345,10.372011,10.356496,7.632212,7.629202,5.018427,4.979619
1,110.714475,111.000549,2.863388,109.958071,110.803359,111.153215,109.370002,111.117191,-0.088884,-0.152667,-1.159119,0.520975,0.521747,11.288164,11.287688,17.861697,17.822141,24.434994,24.485238,13.656271,13.616294,45.162752,10.358863,10.37066,7.624199,7.626149,4.984673,5.005519,110.744857,111.195023,-2.925914,111.117191,110.654779,111.047181,112.219095,109.958071,0.090079,0.147842,1.159119,0.521475,0.521175,11.282728,11.280612,17.809874,17.857247,24.485557,24.443939,13.604756,13.65047,45.111345,10.372011,10.356496,7.632212,7.629202,5.018427,4.979619
2,110.714475,111.000549,2.863388,109.958071,110.803359,111.153215,109.370002,111.117191,-0.088884,-0.152667,-1.159119,0.520975,0.521747,11.288164,11.287688,17.861697,17.822141,24.434994,24.485238,13.656271,13.616294,45.162752,10.358863,10.37066,7.624199,7.626149,4.984673,5.005519,110.744857,111.195023,-2.925914,111.117191,110.654779,111.047181,112.219095,109.958071,0.090079,0.147842,1.159119,0.521475,0.521175,11.282728,11.280612,17.809874,17.857247,24.485557,24.443939,13.604756,13.65047,45.111345,10.372011,10.356496,7.632212,7.629202,5.018427,4.979619
3,110.714475,111.000549,2.863388,109.958071,110.803359,111.153215,109.370002,111.117191,-0.088884,-0.152667,-1.159119,0.520975,0.521747,11.288164,11.287688,17.861697,17.822141,24.434994,24.485238,13.656271,13.616294,45.162752,10.358863,10.37066,7.624199,7.626149,4.984673,5.005519,110.744857,111.195023,-2.925914,111.117191,110.654779,111.047181,112.219095,109.958071,0.090079,0.147842,1.159119,0.521475,0.521175,11.282728,11.280612,17.809874,17.857247,24.485557,24.443939,13.604756,13.65047,45.111345,10.372011,10.356496,7.632212,7.629202,5.018427,4.979619
4,110.714475,111.000549,2.863388,109.958071,110.803359,111.153215,109.370002,111.117191,-0.088884,-0.152667,-1.159119,0.520975,0.521747,11.288164,11.287688,17.861697,17.822141,24.434994,24.485238,13.656271,13.616294,45.162752,10.358863,10.37066,7.624199,7.626149,4.984673,5.005519,110.744857,111.195023,-2.925914,111.117191,110.654779,111.047181,112.219095,109.958071,0.090079,0.147842,1.159119,0.521475,0.521175,11.282728,11.280612,17.809874,17.857247,24.485557,24.443939,13.604756,13.65047,45.111345,10.372011,10.356496,7.632212,7.629202,5.018427,4.979619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,118.148148,121.800000,12.650000,112.000000,109.037037,113.000000,107.300000,118.000000,9.111111,8.800000,-6.000000,0.550475,0.501783,14.000000,15.800000,19.800000,14.400000,26.300000,24.500000,12.300000,12.100000,52.100000,10.700000,9.00000,7.700000,6.000000,6.000000,4.700000,114.074074,113.400000,1.725000,118.000000,110.654321,111.700000,112.425000,112.000000,3.419753,1.700000,6.000000,0.512842,0.523070,13.200000,14.000000,14.600000,16.100000,24.600000,24.800000,12.000000,14.40000,46.700000,7.800000,14.500000,7.700000,7.200000,4.700000,4.600000
1226,111.740741,110.600000,6.750000,110.000000,110.172840,109.100000,106.250000,104.666667,1.567901,1.500000,5.333333,0.534627,0.529055,10.200000,11.400000,14.600000,16.300000,26.300000,23.100000,11.400000,11.000000,45.400000,8.800000,10.40000,5.600000,7.300000,4.900000,2.900000,109.049383,114.300000,-4.750000,104.666667,110.209877,111.000000,112.500000,110.000000,-1.160494,3.300000,-5.333333,0.518979,0.521577,14.400000,10.900000,18.300000,19.700000,26.900000,24.500000,10.800000,9.80000,49.200000,10.100000,10.600000,4.100000,6.200000,4.100000,4.700000
1227,110.814815,102.300000,10.675000,114.333333,106.864198,107.200000,103.825000,104.666667,3.950617,-4.900000,9.666667,0.527207,0.521093,9.000000,9.500000,14.700000,15.300000,25.100000,23.900000,12.300000,13.000000,43.100000,12.000000,10.00000,8.000000,7.200000,3.600000,5.100000,112.691358,113.800000,-5.925000,104.666667,114.160494,118.300000,116.625000,114.333333,-1.469136,-4.500000,-9.666667,0.511361,0.538224,10.800000,13.000000,19.000000,15.900000,25.300000,26.900000,12.700000,14.30000,42.100000,11.800000,8.600000,8.200000,7.100000,4.900000,5.000000
1228,114.790123,115.800000,3.200000,107.000000,114.012346,116.100000,114.050000,120.000000,0.777778,-0.300000,-13.000000,0.528218,0.513676,11.400000,10.800000,21.200000,19.100000,27.000000,23.700000,14.100000,12.800000,43.400000,10.200000,7.60000,8.000000,9.100000,4.300000,5.200000,111.407407,117.100000,2.875000,120.000000,106.012346,103.400000,107.275000,107.000000,5.395062,13.700000,13.000000,0.538489,0.506295,12.400000,8.800000,18.500000,16.000000,28.100000,21.200000,14.800000,12.70000,47.000000,9.800000,9.000000,7.400000,8.200000,5.400000,3.400000


In [1817]:
# remove features with small variance
vt = VarianceThreshold(0.1)
df_transformed = vt.fit_transform(df_X)
selected_columns = df_X.columns[vt.get_support()]
# transforming an array back to a data-frame preserves column labels
df_transformed = pd.DataFrame(df_transformed, columns = selected_columns)

# remove highly correlated pairs
df_corr = df_transformed.corr().abs()
indices = np.where(df_corr > 0.9)
indices = [(df_corr.index[x], df_corr.columns[y]) for x, y in zip(*indices)
              if x != y and x < y]
for idx in indices:
    try:
        df_transformed.drop(idx[1], axis = 1, inplace=True)
    except KeyError:
        pass

#select kbest
skb = SelectKBest(f_regression, k=30)
X = skb.fit_transform(df_transformed, y)
X = pd.DataFrame(X,columns=df_transformed.columns[skb.get_support()])

In [1818]:
feats = X.columns.to_list()
val_X = val_X[feats]

In [1819]:
feats

['score_avg_season_x',
 'pt_diff_avg_home_x',
 'opp_score_avg_season_x',
 'opp_score_avg_h2hl8_x',
 'pt_diff_avg_season_x',
 'pt_diff_avg_l10_x',
 'pt_diff_avg_h2hl8_x',
 'opp_ast_avg_x',
 'pt_diff_avg_home_y',
 'pt_diff_avg_season_y',
 'pt_diff_avg_l10_y',
 'opp_ast_avg_y']

In [1820]:
def standard_scale(df):
    scaler = StandardScaler()
    X = scaler.fit_transform(df)
    return X

In [1821]:
X = standard_scale(X)
val_X = standard_scale(val_X)


## Classification Model

In [1822]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.80,random_state=1,stratify=y)

#### Random Forest Classifier

In [1823]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
grid_search_rf = GridSearchCV(
    clf,
    {
        'n_estimators': [10,20,40,60],
        'max_depth': [4,6,8,10],
        'min_samples_leaf': [10,30,50,70],
    },
    cv=5,
    n_jobs=-1,
    scoring='precision' #['precision', 'recall', 'f1']
)  
grid_search_rf.fit(X_train,y_train)
print('Best params:',grid_search_rf.best_params_)
print('Best score:',grid_search_rf.best_score_)

Best params: {'max_depth': 10, 'min_samples_leaf': 50, 'n_estimators': 10}
Best score: 0.6941299521753164


In [1824]:
y_pred=grid_search_rf.predict(X_test)

In [1825]:
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'Precision score is: {precision_score(y_test, y_pred)}')
print(f'Recall score is: {recall_score(y_test, y_pred)}')
print(f'f1 score is: {f1_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6534608378870674
Precision score is: 0.6816632583503749
Recall score is: 0.7727975270479135
f1 score is: 0.7243752263672584
ROC AUC score is: 0.6275295839230699
[[ 435  467]
 [ 294 1000]]


#### SVM Classifier

In [1801]:
from sklearn.svm import SVC

In [1774]:
clf2 = SVC(probability=True)

grid_search_svm = GridSearchCV(
    clf2,
    {
        'C': [.1, .5, 1, 1.5, 2],
        'kernel': ['poly', 'rbf'],
    },
    cv=5,
    n_jobs=-1,
    scoring='precision' #['precision', 'recall', 'f1']
)  
grid_search_svm.fit(X_train,y_train)
print('Best params:',grid_search_svm.best_params_)
print('Best score:',grid_search_svm.best_score_)
clf2.fit(X_train,y_train)

Best params: {'C': 2, 'kernel': 'rbf'}
Best score: 0.6807671364389476


SVC(probability=True)

In [1782]:
y_pred=grid_search_svm.predict(X_test)

In [1783]:
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'Precision score is: {precision_score(y_test, y_pred)}')
print(f'Recall score is: {recall_score(y_test, y_pred)}')
print(f'f1 score is: {f1_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6434426229508197
Precision score is: 0.6679815910585142
Recall score is: 0.7851622874806801
f1 score is: 0.7218472468916519
ROC AUC score is: 0.6126476625873467
[[ 397  505]
 [ 278 1016]]


## Predicting 2018 Season

The SVM Classifier seems to be outperforming the randomforests slightly, so let's use that to predict the 2018 season

In [1757]:
pred_y = grid_search_svm.predict(val_X)
pred_y_proba = grid_search_svm.predict_proba(val_X) 

In [1763]:
print(f'Accuracy score is: {accuracy_score(val_y, pred_y)}')
print(f'Precision score is: {precision_score(val_y, pred_y)}')
print(f'Recall score is: {recall_score(val_y, pred_y)}')
print(f'ROC AUC score is: {roc_auc_score(val_y, pred_y)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6593495934959349
Precision score is: 0.6781609195402298
Recall score is: 0.8093278463648834
ROC AUC score is: 0.6252228054179707
[[ 432  489]
 [ 273 1002]]


## Exploring Results

In [1835]:
df2018 = df_val[['game.date','game.id','team.id_x','team.id_y','pts_x','pts_y','winner']]

In [1745]:
predicted_2018 = pd.DataFrame(pred_y).rename(columns={0: 'predicted_winner'})
predicted_2018_proba = pd.DataFrame(pred_y_proba).rename(columns={0: 'away_team_win_proba', 1: 'home_team_win_proba'})

In [1836]:
df2018 = pd.concat([df2018, predicted_2018, predicted_2018_proba], axis=1)
df2018['proba_diff'] = abs(df2018['home_team_win_proba'] - df2018['away_team_win_proba'])

dfgames2018 = games2018[games2018['postseason'] == False]
dfgames2018 = dfgames2018[['id','home_team.abbreviation','visitor_team.abbreviation','home_team.conference','visitor_team.conference']]
dfgames2018 = dfgames2018.rename(columns = {'id':'game.id'})

df2018 = pd.merge(df2018, dfgames2018, how='outer', on=['game.id'])
df2018['pts_diff'] = abs(df2018['pts_x'] - df2018['pts_y'])
df2018['correct'] = np.where(df2018['winner'] == df2018['predicted_winner'], True, False)

In [1837]:
df2018

Unnamed: 0,game.date,game.id,team.id_x,team.id_y,pts_x,pts_y,winner,predicted_winner,away_team_win_proba,home_team_win_proba,proba_diff,home_team.abbreviation,visitor_team.abbreviation,home_team.conference,visitor_team.conference,pts_diff,correct
0,2018-10-16T00:00:00.000Z,1,2,23,103.0,87.0,1,1,0.333552,0.666448,0.332896,BOS,PHI,East,East,16.0,True
1,2018-10-16T00:00:00.000Z,2,10,21,108.0,100.0,1,1,0.333552,0.666448,0.332896,GSW,OKC,West,West,8.0,True
2,2018-10-17T00:00:00.000Z,3,4,17,112.0,113.0,0,1,0.333552,0.666448,0.332896,CHA,MIL,East,East,1.0,False
3,2018-10-17T00:00:00.000Z,4,9,3,103.0,100.0,1,1,0.333552,0.666448,0.332896,DET,BKN,East,East,3.0,True
4,2018-10-17T00:00:00.000Z,5,12,15,109.0,83.0,1,1,0.333552,0.666448,0.332896,IND,MEM,East,West,26.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2019-04-10T00:00:00.000Z,49149,17,21,116.0,127.0,0,1,0.234703,0.765297,0.530595,MIL,OKC,East,West,11.0,False
1226,2019-04-10T00:00:00.000Z,49150,27,7,105.0,94.0,1,1,0.305074,0.694926,0.389851,SAS,DAL,West,West,11.0,True
1227,2019-04-10T00:00:00.000Z,49151,8,18,99.0,95.0,1,1,0.253196,0.746804,0.493607,DEN,MIN,West,West,4.0,True
1228,2019-04-10T00:00:00.000Z,49152,13,29,143.0,137.0,1,0,0.622002,0.377998,0.244005,LAC,UTA,West,West,6.0,False


In [1748]:
df = df2018[['correct','proba_diff','pts_diff']]
df.groupby('correct').mean()

Unnamed: 0_level_0,proba_diff,pts_diff
correct,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.296079,9.613365
True,0.354003,12.843403


In [1749]:
df = df2018
df['month_year'] = pd.to_datetime(df['game.date']).dt.to_period('M')
df = df[['month_year','correct','game.id']]
df = df.groupby(['month_year','correct']).count().unstack().reset_index().values
df = pd.DataFrame(df).rename(columns={0: 'month_year', 1:'false', 2:'true'})
df['pct'] = df['true'] / (df['true'] + df['false'])
df



Unnamed: 0,month_year,false,true,pct
0,2018-10,34,76,0.690909
1,2018-11,86,133,0.607306
2,2018-12,75,144,0.657534
3,2019-01,64,157,0.710407
4,2019-02,63,95,0.601266
5,2019-03,69,155,0.691964
6,2019-04,28,51,0.64557


In [1851]:
df_home = df2018.copy()
df_home = df_home[['home_team.abbreviation','winner','predicted_winner','home_team.conference']]
df_home = df_home.groupby(['home_team.abbreviation','home_team.conference'])[['winner','predicted_winner']].sum().reset_index()
df_home['BARKLEY ']
df_home = df_home.rename(columns={'home_team.abbreviation':"Team","home_team.conference":"Conference","winner": "Home Wins", "predicted_winner":"Pred Home Wins"})

In [1852]:
df_away = df2018.copy()
df_away = df_away[['visitor_team.abbreviation','winner','predicted_winner','visitor_team.conference']]
df_away['winner'] = np.where(df_away['winner'] == 0, 1, 0)
df_away['predicted_winner'] = np.where(df_away['predicted_winner'] == 0, 1, 0)
df_away = df_away.groupby(['visitor_team.abbreviation','visitor_team.conference'])[['winner','predicted_winner']].sum().reset_index()
df_away = df_away.rename(columns={'visitor_team.abbreviation':"Team",'visitor_team.conference':'Conference',"winner": "Away Wins", "predicted_winner":"Pred Away Wins"})

In [1900]:
df_predicted2018 = pd.merge(df_home, df_away, on=['Team','Conference'])
df_predicted2018['Wins'] = df_predicted2018['Home Wins'] + df_predicted2018['Away Wins']
df_predicted2018['BARKLEY Wins'] = df_predicted2018['Pred Home Wins'] + df_predicted2018['Pred Away Wins']
df_predicted2018['Wins Diff'] = df_predicted2018['BARKLEY Wins'] - df_predicted2018['Wins']

df_predicted2018East = df_predicted2018.copy()
df_predicted2018East = df_predicted2018East[df_predicted2018East['Conference'] == "East"]
df_predicted2018East.drop(['Conference'], axis=1, inplace=True)

df_predicted2018West = df_predicted2018.copy()
df_predicted2018West = df_predicted2018West[df_predicted2018West['Conference'] == "West"]
df_predicted2018West.drop(['Conference'], axis=1, inplace=True)

In [1901]:
df_predicted2018East.sort_values('BARKLEY Wins', ascending=False, inplace = True)
df_predicted2018East['2018 Standings'] = df_predicted2018East['Wins'].rank(ascending=False)
df_predicted2018East['BARKLEY Standings'] = df_predicted2018East['BARKLEY Wins'].rank(ascending=False)
df_predicted2018East['Standings Diff'] = df_predicted2018East['BARKLEY Standings'] - df_predicted2018East['2018 Standings']
df_predicted2018East


Unnamed: 0,Team,Home Wins,Pred Home Wins,Away Wins,Pred Away Wins,Wins,BARKLEY Wins,Wins Diff,2018 Standings,BARKLEY Standings,Standings Diff
16,MIL,33,41,27,31,60,72,12,1.0,1.0,0.0
27,TOR,32,41,26,26,58,67,9,2.0,2.0,0.0
11,IND,29,39,19,21,48,60,12,5.0,3.0,-2.0
2,BOS,28,39,21,19,49,58,9,4.0,4.0,0.0
22,PHI,31,38,20,19,51,57,6,3.0,5.0,2.0
3,CHA,25,35,14,12,39,47,8,9.5,6.0,-3.5
1,BKN,23,32,19,12,42,44,2,6.5,7.0,0.5
15,MIA,19,29,20,12,39,41,2,9.5,8.0,-1.5
8,DET,26,28,15,9,41,37,-4,8.0,9.0,1.0
29,WAS,22,20,10,6,32,26,-6,11.0,10.0,-1.0


In [1902]:
df_predicted2018West.sort_values('BARKLEY Wins', ascending=False, inplace = True)
df_predicted2018West['2018 Standings'] = df_predicted2018West['Wins'].rank(ascending=False)
df_predicted2018West['BARKLEY Standings'] = df_predicted2018West['BARKLEY Wins'].rank(ascending=False)
df_predicted2018West['Standings Diff'] = df_predicted2018West['BARKLEY Standings'] - df_predicted2018West['2018 Standings']

df_predicted2018West


Unnamed: 0,Team,Home Wins,Pred Home Wins,Away Wins,Pred Away Wins,Wins,BARKLEY Wins,Wins Diff,2018 Standings,BARKLEY Standings,Standings Diff
9,GSW,30,41,27,29,57,70,13,1.0,1.0,0.0
7,DEN,34,41,20,18,54,59,5,2.0,2.5,0.5
28,UTA,29,38,21,21,50,59,9,5.0,2.5,-2.5
20,OKC,27,37,22,15,49,52,3,6.0,4.0,-2.0
10,HOU,31,37,22,14,53,51,-2,3.5,5.0,1.5
24,POR,32,35,21,15,53,50,-3,3.5,6.0,2.5
26,SAS,32,34,16,11,48,45,-3,7.5,7.0,-0.5
18,NOP,19,34,14,10,33,44,11,13.0,8.0,-5.0
12,LAC,26,30,22,11,48,41,-7,7.5,9.0,1.5
14,MEM,21,34,12,4,33,38,5,13.0,10.0,-3.0
