# NBA Predictor: Basketball Advanced Regression, Kyle’s Loss Estimate Yielder (BARKLEY)

## Importing Modules

In [871]:
import pandas as pd
pd.options.display.max_columns = None

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

## Loading Data

In [234]:
# 2018 season will be used as validation
games2018 = pd.read_csv('data/games_2018_season.csv')
stats2018 = pd.read_csv('data/stats_2018_season.csv')

In [235]:
df_games = pd.concat(
    [pd.read_csv('data/games_2008_season.csv'),
     pd.read_csv('data/games_2009_season.csv'),
     pd.read_csv('data/games_2010_season.csv'),
     pd.read_csv('data/games_2011_season.csv'),
     pd.read_csv('data/games_2012_season.csv'),
     pd.read_csv('data/games_2013_season.csv'),
     pd.read_csv('data/games_2014_season.csv'),
     pd.read_csv('data/games_2015_season.csv'),
     pd.read_csv('data/games_2016_season.csv'),
     pd.read_csv('data/games_2017_season.csv')]
)

In [962]:
df_stats = pd.concat(
    [pd.read_csv('data/stats_2010_season.csv'),
     pd.read_csv('data/stats_2011_season.csv'),
     pd.read_csv('data/stats_2012_season.csv'),
     pd.read_csv('data/stats_2013_season.csv'),
     pd.read_csv('data/stats_2014_season.csv'),
     pd.read_csv('data/stats_2015_season.csv'),
     pd.read_csv('data/stats_2016_season.csv'),
     pd.read_csv('data/stats_2017_season.csv')]
)

I retrieved both games and stats from the Ball Don't Lie API, however I'll be working mainly with df_stats as it has much more info.

In [659]:
df_games.shape

(12059, 14)

In [238]:
df_games.columns

Index(['id', 'date', 'home_team_score', 'period', 'postseason', 'season',
       'status', 'time', 'visitor_team_score', 'home_team.id',
       'home_team.abbreviation', 'home_team.city', 'home_team.conference',
       'home_team.division', 'home_team.full_name', 'home_team.name',
       'visitor_team.id', 'visitor_team.abbreviation', 'visitor_team.city',
       'visitor_team.conference', 'visitor_team.division',
       'visitor_team.full_name', 'visitor_team.name'],
      dtype='object')

In [239]:
games_columns = ['id', 'date', 'home_team_score', 'postseason', 'season', 'visitor_team_score', 'home_team.id',
       'home_team.abbreviation', 'home_team.conference',
       'home_team.division',
       'visitor_team.id', 'visitor_team.abbreviation',
       'visitor_team.conference', 'visitor_team.division']

In [660]:
#filter only desired columns and regular season games, and sort by date
df_games = df_games[df_games['postseason'] == False].filter(games_columns).sort_values('date')

In [924]:
df_games.head()

Unnamed: 0,id,date,home_team_score,postseason,season,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division
91,21472,2008-10-28T00:00:00.000Z,90,False,2008,85,2,BOS,East,Atlantic,6,CLE,East,Central
124,21556,2008-10-28T00:00:00.000Z,108,False,2008,95,5,CHI,East,Central,17,MIL,East,Central
92,21473,2008-10-28T00:00:00.000Z,96,False,2008,76,14,LAL,West,Pacific,25,POR,West,Northwest
95,21476,2008-10-29T00:00:00.000Z,98,False,2008,103,27,SAS,West,Southwest,24,PHX,West,Pacific
94,21475,2008-10-29T00:00:00.000Z,100,False,2008,94,9,DET,East,Central,12,IND,East,Central


In [934]:
df_stats.head()

Unnamed: 0,id,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,min,oreb,pf,pts,reb,stl,turnover,game.id,game.date,game.home_team_id,game.home_team_score,game.period,game.postseason,game.season,game.status,game.time,game.visitor_team_id,game.visitor_team_score,player.id,player.first_name,player.height_feet,player.height_inches,player.last_name,player.position,player.team_id,player.weight_pounds,team.id,team.abbreviation,team.city,team.conference,team.division,team.full_name,team.name
0,581359,3.0,0.0,2.0,0.0,0.0,0.0,0.75,4.0,3.0,0.5,2.0,1.0,21:02,0.0,4.0,7.0,2.0,0.0,2.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,735,Shaquille,,,O'Neal,,22,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
1,581358,1.0,1.0,14.0,0.0,0.0,0.0,0.375,8.0,3.0,0.75,4.0,3.0,29:08:00,1.0,3.0,9.0,15.0,2.0,3.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,971,Kevin,,,Garnett,,18,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
2,581360,4.0,0.0,2.0,0.0,5.0,0.0,0.308,13.0,4.0,0.8,5.0,4.0,35:26:00,2.0,3.0,12.0,4.0,0.0,1.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1036,Ray,,,Allen,G,17,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
3,581364,0.0,0.0,2.0,0.0,0.0,0.0,0.5,2.0,1.0,0.0,0.0,0.0,12:22,0.0,6.0,2.0,2.0,0.0,3.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1059,Jermaine,,,O'Neal,,25,,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics
4,581372,6.0,0.0,2.0,0.5,4.0,2.0,0.333,12.0,4.0,0.0,2.0,0.0,34:54:00,0.0,1.0,10.0,2.0,0.0,2.0,24362,2010-10-27T00:00:00.000Z,6,95,4,False,2010,Final,,2,87,1139,Anthony,,,Parker,,23,,6,CLE,Cleveland,East,Central,Cleveland Cavaliers,Cavaliers


In [935]:
df_stats.columns

Index(['id', 'ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga',
       'fgm', 'ft_pct', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl',
       'turnover', 'game.id', 'game.date', 'game.home_team_id',
       'game.home_team_score', 'game.period', 'game.postseason', 'game.season',
       'game.status', 'game.time', 'game.visitor_team_id',
       'game.visitor_team_score', 'player.id', 'player.first_name',
       'player.height_feet', 'player.height_inches', 'player.last_name',
       'player.position', 'player.team_id', 'player.weight_pounds', 'team.id',
       'team.abbreviation', 'team.city', 'team.conference', 'team.division',
       'team.full_name', 'team.name'],
      dtype='object')

## Feature Engineering

In [931]:
df_stats

Unnamed: 0,game.season,game.date,game.id,home,ast,blk,dreb,fg3a,fg3m,fga,fgm,fta,ftm,oreb,pf,pts,reb,stl,turnover,opp.id,team.id_copy
0,2010,2010-10-27T00:00:00.000Z,24362,1,24.0,1.0,32.0,12.0,3.0,72.0,34.0,21.0,16.0,6.0,24.0,87.0,38.0,8.0,19.0,6,2
1,2010,2010-10-27T00:00:00.000Z,24362,0,24.0,4.0,29.0,20.0,6.0,81.0,36.0,21.0,17.0,8.0,20.0,95.0,37.0,5.0,14.0,2,6
2,2010,2010-10-27T00:00:00.000Z,24363,1,20.0,9.0,35.0,16.0,8.0,80.0,42.0,31.0,27.0,9.0,20.0,119.0,44.0,7.0,15.0,15,1
3,2010,2010-10-27T00:00:00.000Z,24363,0,13.0,4.0,30.0,9.0,4.0,86.0,40.0,25.0,20.0,9.0,26.0,104.0,39.0,7.0,14.0,1,15
4,2010,2010-10-27T00:00:00.000Z,24364,1,21.0,6.0,36.0,14.0,2.0,93.0,40.0,22.0,13.0,15.0,28.0,95.0,51.0,6.0,14.0,21,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18719,2017,2018-04-11T00:00:00.000Z,34959,1,22.0,1.0,26.0,24.0,8.0,92.0,37.0,19.0,16.0,12.0,17.0,98.0,38.0,9.0,17.0,19,27
18720,2017,2018-04-11T00:00:00.000Z,34960,1,11.0,6.0,32.0,46.0,13.0,82.0,31.0,12.0,8.0,8.0,24.0,83.0,40.0,2.0,14.0,26,11
18721,2017,2018-04-11T00:00:00.000Z,34960,0,22.0,3.0,42.0,26.0,7.0,80.0,38.0,20.0,13.0,6.0,14.0,96.0,48.0,6.0,11.0,11,26
18722,2017,2018-04-11T00:00:00.000Z,35631,0,26.0,7.0,37.0,26.0,11.0,106.0,46.0,20.0,13.0,19.0,18.0,116.0,56.0,8.0,9.0,28,16


In [942]:
# function to group all stats by game

def group_stats_by_game(df):
    #remove postseason games
    df = df[df['game.postseason'] == False]
    
    #list of columns to keep
    stats_columns_to_keep = ['game.season','game.date','game.id','team.id','game.home_team_id','game.visitor_team_id','ast', 'blk', 'dreb', 'fg3a', 'fg3m', 'fga',
       'fgm', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl', 'turnover']
    
    #keep only specified columns and sort by date
    df = df[stats_columns_to_keep].sort_values('game.date')
    
    # new field indicating whether team is at home
    df['home'] = np.where(df['team.id'] == df['game.home_team_id'], 1, 0)
    
    # group all stats by game/team
    df = df.groupby(['game.season','game.date','game.id','team.id','game.home_team_id','game.visitor_team_id','home']).sum().reset_index()
    
    # create column to indicate opponent id
    df['opp.id'] = np.where(df['team.id'] == df['game.home_team_id'], df['game.visitor_team_id'], df['game.home_team_id'])
    df.drop(['game.home_team_id','game.visitor_team_id'], axis=1,inplace=True)
    
    return df

In [943]:
#function that creates dataframe with opponent stats for each team
def opponent_stats(df):
    df['team.id_copy'] = df['team.id']
    df['home'] = np.where(df['home'] == 1, 0, 1)
    df.drop(['team.id'], axis=1,inplace=True)
    df.rename(columns={'opp.id':'team.id','team.id_copy':'opp.id','ast':'opp_ast', 'blk':'opp_blk',
       'dreb':'opp_dreb', 'fg3a':'opp_fg3a', 'fg3m':'opp_fg3m', 'fga':'opp_fga', 'fgm':'opp_fgm', 'fta':'opp_fta', 'ftm':'opp_ftm', 'oreb':'opp_oreb', 'pf':'opp_pf', 'pts':'opp_pts',
       'reb':'opp_reb', 'stl':'opp_stl', 'turnover':'opp_turnover'},inplace=True)

    return df

In [973]:
#function that creates a dataframe with opponent stats grouped by game
def prepare_stats_df(df):
    df_team = group_stats_by_game(df)
    df_opponent = opponent_stats(group_stats_by_game(df))
    #merge df and df_opponents
    df = pd.merge(df_team, df_opponent, how='outer', on=['game.season', 'game.date','game.id','team.id','opp.id','home'])
    return df

In [980]:
#creates columns with average team and opponent point totals
def create_average_scoring(df):
    #team stats
    # group by season and team and calculates a rolling mean of last x number of games with period of 1. Shift() to exclude the current row to only look at past rows
    df['score_avg_season'] = df.groupby(['game.season', 'team.id'])['pts'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # last 10 games
    df['score_avg_l10'] = df.groupby(['game.season', 'team.id'])['pts'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # average score for each team for the season by its home/away status(window size 41)
    df['pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['pts'].transform(lambda x: x.rolling(41, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['score_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['pts'].transform(lambda x: x.rolling(8, 1).mean().shift())

    #opponent stats
    df['opp_score_avg_season'] = df.groupby(['game.season', 'team.id'])['opp_pts'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # last 10 games
    df['opp_score_avg_l10'] = df.groupby(['game.season', 'team.id'])['opp_pts'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # average score for each team for the season by its home/away status(window size 41)
    df['opp_pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['opp_pts'].transform(lambda x: x.rolling(41, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['opp_score_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['opp_pts'].transform(lambda x: x.rolling(8, 1).mean().shift())
    
    return df

In [981]:
def create_point_diff(df):
    # calculate point differential for the current game
    df['pt_diff'] = df['pts'] - df['opp_pts']

    # calculate average point differential for each team for the season (window size 82)
    df['pt_diff_avg_season'] = df.groupby(['game.season', 'team.id'])['pt_diff'].transform(lambda x: x.rolling(82, 1).mean().shift())

    # calculate average point differential for each team for the season by its home/away status(window size 41)
    df['pt_diff_avg_home'] = df.groupby(['game.season', 'team.id','home'])['pt_diff'].transform(lambda x: x.rolling(41, 1).mean().shift())

    #avg diff over the last 10 games (same season)
    df['pt_diff_avg_l10'] = df.groupby(['game.season', 'team.id'])['pt_diff'].transform(lambda x: x.rolling(10, 1).mean().shift())

    # vs same opponent same team last 8 games (not limited by season)
    df['pt_diff_avg_h2hl8'] = df.groupby(['team.id','opp.id'])['pt_diff'].transform(lambda x: x.rolling(8, 1).mean().shift())
    df.drop(['pt_diff'], axis=1, inplace=True)
    
    return df

In [993]:
def create_efg_pct(df):

    #Effective field goal percentage = (fgm + (.5 * 3pm)) / fga
    df['fgm_total'] = df.groupby(['game.season', 'team.id'])['fgm'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['fg3m_total'] = df.groupby(['game.season', 'team.id'])['fg3m'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['fga_total'] = df.groupby(['game.season', 'team.id'])['fga'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['efg_pct'] = (df['fgm_total'] + (0.5 * df['fg3m_total'])) / df['fga_total']
    df.drop(['fgm_total','fg3m_total','fga_total'], axis=1, inplace=True)

    df['opp_fgm_total'] = df.groupby(['game.season', 'team.id'])['opp_fgm'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_fg3m_total'] = df.groupby(['game.season', 'team.id'])['opp_fg3m'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_fga_total'] = df.groupby(['game.season', 'team.id'])['opp_fga'].transform(lambda x: x.rolling(82,1).sum().shift())
    df['opp_efg_pct'] = (df['opp_fgm_total'] + (0.5 * df['opp_fg3m_total'])) / df['opp_fga_total']
    df.drop(['opp_fgm_total','opp_fg3m_total','opp_fga_total'], axis=1, inplace=True)
    
    return df


In [983]:
def create_avg_boxscr_stats(df):
    df['fg3m_avg'] = df.groupby(['game.season', 'team.id'])['fg3m'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_fg3m_avg'] = df.groupby(['game.season', 'team.id'])['opp_fg3m'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['ftm_avg'] = df.groupby(['game.season', 'team.id'])['ftm'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_ftm_avg'] = df.groupby(['game.season', 'team.id'])['opp_ftm'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['ast_avg'] = df.groupby(['game.season', 'team.id'])['ast'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_ast_avg'] = df.groupby(['game.season', 'team.id'])['opp_ast'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['turnover_avg'] = df.groupby(['game.season', 'team.id'])['turnover'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_turnover_avg'] = df.groupby(['game.season', 'team.id'])['opp_turnover'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['reb_avg'] = df.groupby(['game.season', 'team.id'])['reb'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_reb_avg'] = df.groupby(['game.season', 'team.id'])['opp_reb'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['oreb_avg'] = df.groupby(['game.season', 'team.id'])['oreb'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_reb_avg'] = df.groupby(['game.season', 'team.id'])['opp_oreb'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['stl_avg'] = df.groupby(['game.season', 'team.id'])['stl'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_stl_avg'] = df.groupby(['game.season', 'team.id'])['opp_stl'].transform(lambda x: x.rolling(82,1).mean().shift())

    df['blk_avg'] = df.groupby(['game.season', 'team.id'])['blk'].transform(lambda x: x.rolling(82,1).mean().shift())
    df['opp_blk_avg'] = df.groupby(['game.season', 'team.id'])['opp_blk'].transform(lambda x: x.rolling(82,1).mean().shift())

    return df

In [996]:
def feat_engineering(df):
    df = create_average_scoring(df)
    df = create_point_diff(df)
    df = create_efg_pct(df)
    df = create_avg_boxscr_stats(df)
    
    #drop some of the raw non-aggregated stats
    df.drop(['ast', 'blk', 'dreb', 'fg3a', 'fg3m', 'fga', 'fgm', 'fta', 'ftm', 'oreb', 'pf', 'reb', 'stl', 'turnover',
               'opp_ast', 'opp_blk', 'opp_dreb', 'opp.id','opp_fg3a', 'opp_fg3m', 'opp_fga', 'opp_fgm', 'opp_fta', 'opp_ftm', 'opp_oreb', 'opp_pf', 'opp_reb', 'opp_stl', 'opp_turnover','opp_pts'],
              axis=1, inplace=True)
    return df

In [1000]:
# puts the stats for each team in a game onto the same row
def combine_games(df):
    df_home = df[df['home'] == 1]
    df_away = df[df['home'] == 0]
    df = pd.merge(df_home, df_away, how='outer', on=['game.season', 'game.date','game.id'])
    return df

In [997]:
test = prepare_stats_df(df_stats)

In [998]:
test = feat_engineering(test)

In [1001]:
test = combine_games(test)
test

Unnamed: 0,game.season,game.date,game.id,team.id_x,home_x,pts_x,score_avg_season_x,score_avg_l10_x,pt_diff_avg_home_x,score_avg_h2hl8_x,opp_score_avg_season_x,opp_score_avg_l10_x,opp_pt_diff_avg_home_x,opp_score_avg_h2hl8_x,pt_diff_avg_season_x,pt_diff_avg_l10_x,pt_diff_avg_h2hl8_x,efg_pct_x,opp_efg_pct_x,fg3m_avg_x,opp_fg3m_avg_x,ftm_avg_x,opp_ftm_avg_x,ast_avg_x,opp_ast_avg_x,turnover_avg_x,opp_turnover_avg_x,reb_avg_x,opp_reb_avg_x,oreb_avg_x,stl_avg_x,opp_stl_avg_x,blk_avg_x,opp_blk_avg_x,team.id_y,home_y,pts_y,score_avg_season_y,score_avg_l10_y,pt_diff_avg_home_y,score_avg_h2hl8_y,opp_score_avg_season_y,opp_score_avg_l10_y,opp_pt_diff_avg_home_y,opp_score_avg_h2hl8_y,pt_diff_avg_season_y,pt_diff_avg_l10_y,pt_diff_avg_h2hl8_y,efg_pct_y,opp_efg_pct_y,fg3m_avg_y,opp_fg3m_avg_y,ftm_avg_y,opp_ftm_avg_y,ast_avg_y,opp_ast_avg_y,turnover_avg_y,opp_turnover_avg_y,reb_avg_y,opp_reb_avg_y,oreb_avg_y,stl_avg_y,opp_stl_avg_y,blk_avg_y,opp_blk_avg_y
0,2010,2010-10-27T00:00:00.000Z,24362,6,1,95.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,87.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2010,2010-10-27T00:00:00.000Z,24363,15,1,104.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,119.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2010,2010-10-27T00:00:00.000Z,24364,21,1,106.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,0,95.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2010,2010-10-27T00:00:00.000Z,24365,27,1,122.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,0,109.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2010,2010-10-27T00:00:00.000Z,24442,8,1,110.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29,0,88.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9357,2017,2018-04-11T00:00:00.000Z,34542,18,1,112.0,116.405063,107.0,6.153846,104.375,114.253165,105.1,111.153846,102.625,2.151899,1.9,1.750,0.524314,0.539683,8.594937,11.645570,20.594937,16.987342,24.151899,25.683544,12.607595,15.443038,44.544304,10.455696,10.974684,8.822785,6.873418,4.493671,4.759494,8,0,106.0,115.275000,114.0,-3.051282,102.625,113.675000,108.6,116.897436,104.375,1.600000,5.4,-1.750,0.535173,0.539584,11.937500,11.575000,18.087500,15.700000,26.300000,25.887500,15.087500,14.312500,46.712500,10.237500,11.625000,8.175000,8.350000,5.187500,5.000000
9358,2017,2018-04-11T00:00:00.000Z,34958,6,1,98.0,114.000000,112.7,1.512821,109.500,112.587500,106.3,113.512821,100.375,1.412500,6.4,9.125,0.550873,0.540174,12.425000,11.975000,18.575000,14.937500,24.100000,26.500000,13.462500,13.200000,42.925000,10.137500,8.425000,7.300000,7.525000,3.925000,4.275000,20,0,110.0,108.300000,104.7,-8.974359,100.375,111.950000,112.7,113.512821,109.500,-3.650000,-8.0,-9.125,0.508654,0.523679,8.500000,11.737500,15.725000,18.512500,23.887500,24.862500,14.525000,13.175000,45.850000,10.362500,10.962500,6.912500,8.337500,5.325000,5.012500
9359,2017,2018-04-11T00:00:00.000Z,34959,19,1,122.0,117.141026,110.8,0.512821,102.000,116.115385,107.2,120.487179,101.875,1.025641,3.6,0.125,0.540434,0.514722,10.756410,11.397436,16.897436,17.051282,28.589744,24.974359,15.230769,14.448718,46.371795,11.153846,8.961538,8.333333,8.692308,6.064103,4.217949,27,0,98.0,108.000000,107.0,-0.794872,101.875,104.632911,103.9,108.948718,102.000,3.367089,3.1,-0.125,0.508563,0.505156,9.037975,9.594937,17.037975,15.341772,23.898734,22.556962,13.392405,14.101266,46.556962,9.658228,10.873418,7.974684,7.962025,5.987342,3.974684
9360,2017,2018-04-11T00:00:00.000Z,34960,26,1,96.0,100.265823,94.1,-5.948718,100.625,107.784810,99.2,109.487179,115.250,-7.518987,-5.1,-14.625,0.502250,0.545117,9.113924,12.063291,12.670886,16.632911,21.924051,24.202532,13.329114,13.822785,41.025316,9.215190,9.493671,7.987342,7.898734,4.240506,4.759494,11,0,83.0,120.101266,106.0,8.736842,115.250,110.518987,98.0,105.000000,100.625,9.582278,8.0,14.625,0.554436,0.520245,16.481013,10.860759,21.215190,15.860759,23.063291,23.594937,14.215190,14.556962,46.544304,9.253165,9.607595,9.012658,8.101266,5.101266,4.645570


In [666]:
df = pd.merge(df_home, df_away, how='outer', on=['game.season', 'game.date','game.id'])

In [667]:
# winner column. 1 is when home team wins
df['winner'] = np.where(df['pts_x'] > df['pts_y'], 1, 0)

In [675]:
df.columns

Index(['game.season', 'game.date', 'game.id', 'team.id_x', 'home_x', 'pts_x',
       'opp.id_x', 'score_avg_season_x', 'score_avg_l10_x',
       'pt_diff_avg_home_x', 'score_avg_h2hl8_x', 'opp_score_avg_season_x',
       'opp_score_avg_l10_x', 'opp_pt_diff_avg_home_x',
       'opp_score_avg_h2hl8_x', 'pt_diff_avg_season_x', 'pt_diff_avg_l10_x',
       'pt_diff_avg_h2hl8_x', 'efg_pct_x', 'opp_efg_pct_x', 'fg3m_avg_x',
       'opp_fg3m_avg_x', 'ftm_avg_x', 'opp_ftm_avg_x', 'ast_avg_x',
       'opp_ast_avg_x', 'turnover_avg_x', 'opp_turnover_avg_x', 'reb_avg_x',
       'opp_reb_avg_x', 'oreb_avg_x', 'stl_avg_x', 'opp_stl_avg_x',
       'blk_avg_x', 'opp_blk_avg_x', 'team.id_y', 'home_y', 'pts_y',
       'opp.id_y', 'score_avg_season_y', 'score_avg_l10_y',
       'pt_diff_avg_home_y', 'score_avg_h2hl8_y', 'opp_score_avg_season_y',
       'opp_score_avg_l10_y', 'opp_pt_diff_avg_home_y',
       'opp_score_avg_h2hl8_y', 'pt_diff_avg_season_y', 'pt_diff_avg_l10_y',
       'pt_diff_avg_h2hl8_y'

In [669]:
features_columns = ['opp_ast_x', 'opp_blk_x', 'opp_dreb_x', 'opp_fg3a_x',
       'opp_fg3m_x', 'opp_fga_x', 'opp_fgm_x', 'opp_fta_x', 'opp_ftm_x',
       'opp_oreb_x', 'opp_pf_x', 'opp_reb_x', 'opp_stl_x',
       'opp_turnover_x', 'score_avg_season_x', 'score_avg_l10_x',
       'pt_diff_avg_home_x', 'score_avg_h2hl8_x', 'opp_score_avg_season_x',
       'opp_score_avg_l10_x', 'opp_pt_diff_avg_home_x',
       'opp_score_avg_h2hl8_x', 'pt_diff_avg_season_x', 'pt_diff_avg_l10_x',
       'pt_diff_avg_h2hl8_x', 'efg_pct_x', 'opp_efg_pct_x', 'fg3m_avg_x',
       'opp_fg3m_avg_x', 'ftm_avg_x', 'opp_ftm_avg_x', 'ast_avg_x',
       'opp_ast_avg_x', 'turnover_avg_x', 'opp_turnover_avg_x', 'reb_avg_x',
       'opp_reb_avg_x', 'oreb_avg_x', 'stl_avg_x', 'opp_stl_avg_x',
       'blk_avg_x', 'opp_blk_avg_x', 'opp_ast_y', 'opp_blk_y', 'opp_dreb_y', 'opp_fg3a_y',
       'opp_fg3m_y', 'opp_fga_y', 'opp_fgm_y', 'opp_fta_y', 'opp_ftm_y',
       'opp_oreb_y', 'opp_pf_y', 'opp_reb_y', 'opp_stl_y',
       'opp_turnover_y', 'score_avg_season_y', 'score_avg_l10_y',
       'pt_diff_avg_home_y', 'score_avg_h2hl8_y', 'opp_score_avg_season_y',
       'opp_score_avg_l10_y', 'opp_pt_diff_avg_home_y',
       'opp_score_avg_h2hl8_y', 'pt_diff_avg_season_y', 'pt_diff_avg_l10_y',
       'pt_diff_avg_h2hl8_y', 'efg_pct_y', 'opp_efg_pct_y', 'fg3m_avg_y',
       'opp_fg3m_avg_y', 'ftm_avg_y', 'opp_ftm_avg_y', 'ast_avg_y',
       'opp_ast_avg_y', 'turnover_avg_y', 'opp_turnover_avg_y', 'reb_avg_y',
       'opp_reb_avg_y', 'oreb_avg_y', 'stl_avg_y', 'opp_stl_avg_y',
       'blk_avg_y', 'opp_blk_avg_y']

## Feature Selection

In [892]:
df_X = df.copy()
df_X.dropna(axis=0, inplace=True)

# create X, y
y = df_X['winner']
df_X = df_X.filter(features_columns)

In [895]:
df_X

Unnamed: 0,score_avg_season_x,score_avg_l10_x,pt_diff_avg_home_x,score_avg_h2hl8_x,opp_score_avg_season_x,opp_score_avg_l10_x,opp_pt_diff_avg_home_x,opp_score_avg_h2hl8_x,pt_diff_avg_season_x,pt_diff_avg_l10_x,pt_diff_avg_h2hl8_x,efg_pct_x,opp_efg_pct_x,fg3m_avg_x,opp_fg3m_avg_x,ftm_avg_x,opp_ftm_avg_x,ast_avg_x,opp_ast_avg_x,turnover_avg_x,opp_turnover_avg_x,reb_avg_x,opp_reb_avg_x,oreb_avg_x,stl_avg_x,opp_stl_avg_x,blk_avg_x,opp_blk_avg_x,score_avg_season_y,score_avg_l10_y,pt_diff_avg_home_y,score_avg_h2hl8_y,opp_score_avg_season_y,opp_score_avg_l10_y,opp_pt_diff_avg_home_y,opp_score_avg_h2hl8_y,pt_diff_avg_season_y,pt_diff_avg_l10_y,pt_diff_avg_h2hl8_y,efg_pct_y,opp_efg_pct_y,fg3m_avg_y,opp_fg3m_avg_y,ftm_avg_y,opp_ftm_avg_y,ast_avg_y,opp_ast_avg_y,turnover_avg_y,opp_turnover_avg_y,reb_avg_y,opp_reb_avg_y,oreb_avg_y,stl_avg_y,opp_stl_avg_y,blk_avg_y,opp_blk_avg_y
54,150.666667,150.666667,64.000000,101.000,122.666667,122.666667,194.000000,78.000,28.000000,28.000000,23.000,0.587459,0.423193,11.333333,6.333333,32.000000,29.000000,36.666667,23.333333,16.333333,21.333333,56.666667,18.333333,9.666667,11.333333,11.333333,8.333333,1.666667,83.666667,83.666667,-15.000000,78.000,97.000000,97.0,105.000000,101.000,-13.333333,-13.333333,-23.000,0.438525,0.530952,6.000000,7.000000,12.333333,22.666667,19.000000,22.000000,13.666667,13.666667,36.666667,9.666667,11.333333,5.333333,7.333333,3.333333,4.000000
70,89.000000,89.000000,-15.000000,118.000,114.400000,114.400000,119.000000,123.000,-25.400000,-25.400000,-5.000,0.505540,0.520642,4.200000,10.800000,16.000000,23.600000,15.400000,21.400000,15.200000,20.200000,34.800000,16.800000,10.000000,8.400000,9.400000,3.800000,6.600000,114.250000,114.250000,4.000000,123.000,114.500000,114.5,114.000000,118.000,-0.250000,-0.250000,5.000,0.528610,0.551282,10.500000,7.750000,17.250000,17.750000,23.000000,23.250000,16.250000,18.250000,43.250000,13.250000,15.000000,8.250000,9.500000,4.250000,4.500000
83,101.666667,101.666667,-2.333333,93.000,100.500000,100.500000,98.000000,91.000,1.166667,1.166667,2.000,0.509494,0.510309,6.166667,6.166667,21.166667,18.000000,21.500000,24.000000,13.000000,13.333333,39.666667,10.833333,9.666667,5.000000,7.500000,2.500000,5.500000,86.200000,86.200000,-13.500000,91.000,97.000000,97.0,103.000000,93.000,-10.800000,-10.800000,-2.000,0.443902,0.521858,6.400000,7.200000,13.400000,20.600000,18.400000,22.400000,13.200000,12.200000,39.200000,9.400000,11.600000,5.200000,6.200000,4.000000,4.200000
97,114.666667,114.666667,-3.250000,107.000,121.666667,121.666667,102.500000,106.000,-7.000000,-7.000000,1.000,0.446208,0.521368,4.333333,6.333333,30.333333,20.000000,17.833333,28.500000,16.333333,15.000000,49.500000,12.000000,12.833333,8.500000,7.833333,5.500000,4.500000,91.833333,91.833333,-19.666667,106.000,96.000000,96.0,102.333333,107.000,-4.166667,-4.166667,-1.000,0.481441,0.500000,4.500000,4.333333,18.333333,18.666667,21.500000,19.166667,12.833333,16.166667,35.833333,12.000000,11.166667,8.333333,5.666667,3.833333,3.833333
103,87.666667,87.666667,-9.000000,90.000,95.333333,95.333333,93.000000,105.000,-7.666667,-7.666667,-15.000,0.447154,0.511287,6.666667,6.500000,14.333333,19.833333,18.833333,22.166667,12.333333,12.000000,40.666667,9.000000,12.166667,5.000000,5.666667,4.166667,4.000000,102.833333,102.833333,3.000000,105.000,94.500000,94.5,88.000000,90.000,8.333333,8.333333,15.000,0.528926,0.489429,10.666667,6.166667,17.500000,17.333333,21.166667,19.166667,14.000000,14.666667,43.500000,8.333333,12.166667,6.833333,6.333333,6.000000,3.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9357,116.405063,107.000000,6.153846,104.375,114.253165,105.100000,111.153846,102.625,2.151899,1.900000,1.750,0.524314,0.539683,8.594937,11.645570,20.594937,16.987342,24.151899,25.683544,12.607595,15.443038,44.544304,10.455696,10.974684,8.822785,6.873418,4.493671,4.759494,115.275000,114.000000,-3.051282,102.625,113.675000,108.6,116.897436,104.375,1.600000,5.400000,-1.750,0.535173,0.539584,11.937500,11.575000,18.087500,15.700000,26.300000,25.887500,15.087500,14.312500,46.712500,10.237500,11.625000,8.175000,8.350000,5.187500,5.000000
9358,114.000000,112.700000,1.512821,109.500,112.587500,106.300000,113.512821,100.375,1.412500,6.400000,9.125,0.550873,0.540174,12.425000,11.975000,18.575000,14.937500,24.100000,26.500000,13.462500,13.200000,42.925000,10.137500,8.425000,7.300000,7.525000,3.925000,4.275000,108.300000,104.700000,-8.974359,100.375,111.950000,112.7,113.512821,109.500,-3.650000,-8.000000,-9.125,0.508654,0.523679,8.500000,11.737500,15.725000,18.512500,23.887500,24.862500,14.525000,13.175000,45.850000,10.362500,10.962500,6.912500,8.337500,5.325000,5.012500
9359,117.141026,110.800000,0.512821,102.000,116.115385,107.200000,120.487179,101.875,1.025641,3.600000,0.125,0.540434,0.514722,10.756410,11.397436,16.897436,17.051282,28.589744,24.974359,15.230769,14.448718,46.371795,11.153846,8.961538,8.333333,8.692308,6.064103,4.217949,108.000000,107.000000,-0.794872,101.875,104.632911,103.9,108.948718,102.000,3.367089,3.100000,-0.125,0.508563,0.505156,9.037975,9.594937,17.037975,15.341772,23.898734,22.556962,13.392405,14.101266,46.556962,9.658228,10.873418,7.974684,7.962025,5.987342,3.974684
9360,100.265823,94.100000,-5.948718,100.625,107.784810,99.200000,109.487179,115.250,-7.518987,-5.100000,-14.625,0.502250,0.545117,9.113924,12.063291,12.670886,16.632911,21.924051,24.202532,13.329114,13.822785,41.025316,9.215190,9.493671,7.987342,7.898734,4.240506,4.759494,120.101266,106.000000,8.736842,115.250,110.518987,98.0,105.000000,100.625,9.582278,8.000000,14.625,0.554436,0.520245,16.481013,10.860759,21.215190,15.860759,23.063291,23.594937,14.215190,14.556962,46.544304,9.253165,9.607595,9.012658,8.101266,5.101266,4.645570


In [896]:
from sklearn.feature_selection import VarianceThreshold
# remove features with small variance
vt = VarianceThreshold(0.1)
df_transformed = vt.fit_transform(df_X)
selected_columns = df_X.columns[vt.get_support()]
# transforming an array back to a data-frame preserves column labels
df_X = pd.DataFrame(df_transformed, columns = selected_columns)

In [897]:
# remove highly correlated pairs
df_corr = df_X.corr().abs()
indices = np.where(df_corr > 0.9)
indices = [(df_corr.index[x], df_corr.columns[y]) for x, y in zip(*indices)
              if x != y and x < y]
for idx in indices:
    try:
        df_X.drop(idx[1], axis = 1, inplace=True)
    except KeyError:
        pass

In [878]:
# forward regression
from sklearn.feature_selection import f_regression, SelectKBest
skb = SelectKBest(f_regression, k=24)
X = skb.fit_transform(df_X, y)
X = pd.DataFrame(X,columns=df_X.columns[skb.get_support()])
X

Unnamed: 0,score_avg_season_x,score_avg_l10_x,pt_diff_avg_home_x,opp_score_avg_h2hl8_x,pt_diff_avg_season_x,pt_diff_avg_l10_x,pt_diff_avg_h2hl8_x,fg3m_avg_x,ftm_avg_x,ast_avg_x,opp_ast_avg_x,stl_avg_x,blk_avg_x,opp_blk_avg_x,score_avg_season_y,score_avg_l10_y,pt_diff_avg_home_y,pt_diff_avg_season_y,pt_diff_avg_l10_y,fg3m_avg_y,ftm_avg_y,ast_avg_y,opp_ast_avg_y,opp_blk_avg_y
0,150.666667,150.666667,64.000000,78.000,28.000000,28.000000,23.000,11.333333,32.000000,36.666667,23.333333,11.333333,8.333333,1.666667,83.666667,83.666667,-15.000000,-13.333333,-13.333333,6.000000,12.333333,19.000000,22.000000,4.000000
1,89.000000,89.000000,-15.000000,123.000,-25.400000,-25.400000,-5.000,4.200000,16.000000,15.400000,21.400000,8.400000,3.800000,6.600000,114.250000,114.250000,4.000000,-0.250000,-0.250000,10.500000,17.250000,23.000000,23.250000,4.500000
2,101.666667,101.666667,-2.333333,91.000,1.166667,1.166667,2.000,6.166667,21.166667,21.500000,24.000000,5.000000,2.500000,5.500000,86.200000,86.200000,-13.500000,-10.800000,-10.800000,6.400000,13.400000,18.400000,22.400000,4.200000
3,114.666667,114.666667,-3.250000,106.000,-7.000000,-7.000000,1.000,4.333333,30.333333,17.833333,28.500000,8.500000,5.500000,4.500000,91.833333,91.833333,-19.666667,-4.166667,-4.166667,4.500000,18.333333,21.500000,19.166667,3.833333
4,87.666667,87.666667,-9.000000,105.000,-7.666667,-7.666667,-15.000,6.666667,14.333333,18.833333,22.166667,5.000000,4.166667,4.000000,102.833333,102.833333,3.000000,8.333333,8.333333,10.666667,17.500000,21.166667,19.166667,3.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8633,116.405063,107.000000,6.153846,102.625,2.151899,1.900000,1.750,8.594937,20.594937,24.151899,25.683544,8.822785,4.493671,4.759494,115.275000,114.000000,-3.051282,1.600000,5.400000,11.937500,18.087500,26.300000,25.887500,5.000000
8634,114.000000,112.700000,1.512821,100.375,1.412500,6.400000,9.125,12.425000,18.575000,24.100000,26.500000,7.300000,3.925000,4.275000,108.300000,104.700000,-8.974359,-3.650000,-8.000000,8.500000,15.725000,23.887500,24.862500,5.012500
8635,117.141026,110.800000,0.512821,101.875,1.025641,3.600000,0.125,10.756410,16.897436,28.589744,24.974359,8.333333,6.064103,4.217949,108.000000,107.000000,-0.794872,3.367089,3.100000,9.037975,17.037975,23.898734,22.556962,3.974684
8636,100.265823,94.100000,-5.948718,115.250,-7.518987,-5.100000,-14.625,9.113924,12.670886,21.924051,24.202532,7.987342,4.240506,4.759494,120.101266,106.000000,8.736842,9.582278,8.000000,16.481013,21.215190,23.063291,23.594937,4.645570


In [911]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(df_X)

## Train Model

In [912]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.80,random_state=1)

In [913]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()

In [914]:
# Train the model using the training sets
model.fit(X_train,y_train)


GaussianNB()

In [915]:
y_pred=model.predict(X_test)

In [916]:
#print(f'MSE is: {mean_squared_error(y_test, y_pred)}')
#print(f'R2 score is: {r2_score(y_test, y_pred)}')
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6591435185185185
ROC AUC score is: 0.6513035019455252
[[427 273]
 [316 712]]


In [917]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=0)
clf.fit(X_train,y_train)

RandomForestClassifier(max_depth=7, random_state=0)

In [918]:
y_pred=clf.predict(X_test)

In [919]:
print(f'MSE is: {mean_squared_error(y_test, y_pred)}')
print(f'R2 score is: {r2_score(y_test, y_pred)}')
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

MSE is: 0.33217592592592593
R2 score is: -0.37836575875486367
Accuracy score is: 0.6678240740740741
ROC AUC score is: 0.6374041133963313
[[334 366]
 [208 820]]


In [920]:
from sklearn.svm import SVC

In [921]:
clf = SVC(kernel='rbf')
clf.fit(X_train,y_train)

SVC()

In [922]:
y_pred=clf.predict(X_test)

In [923]:
print(f'Accuracy score is: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC score is: {roc_auc_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy score is: 0.6608796296296297
ROC AUC score is: 0.6297443023902167
[[326 374]
 [212 816]]
