In [78]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

from tqdm import tqdm
from nba_api.stats.static import players, teams
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.endpoints import leaguegamelog

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error

import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

pd.options.display.max_columns=200

In [18]:
seasons = ["20{}-{}".format(x, x+1) for x in range(10, 21)]
print(seasons)
def gather_data(seasons = seasons):
    full_player_data = []
    for season in seasons:
        player_gls = pd.read_csv('./data/player_gamelogs_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_adv = pd.read_csv('./data/player_advanced_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_scoring = pd.read_csv('./data/player_scoring_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
#         player_gls_tracking = pd.read_csv('./data/player_tracking_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'object'})
    
        print("player_gls_shape:", player_gls.shape,
             "player_gls_adv_shape:", player_gls_adv.shape,
             "player_gls_scoring_shape:", player_gls_scoring.shape)
    
        player_gls = player_gls[['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID',
       'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK',
       'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']]

        player_full = pd.merge(player_gls, player_gls_adv, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
        player_full = pd.merge(player_full, player_gls_scoring, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
    
        print("after_merging shape", player_full.shape)
        
        # drop unnecessary columns
        player_full = player_full.drop(columns = ['TEAM_ID', 'TEAM_NAME', 'TEAM_ID_x',
                         'TEAM_ABBREVIATION_x','TEAM_CITY','PLAYER_NAME_x','START_POSITION',
                          'COMMENT','MIN_x','TEAM_ID_x', 'TEAM_ABBREVIATION_x','TEAM_CITY_x', 
                          'PLAYER_NAME_x', 'START_POSITION_x', 'COMMENT_x', 'MIN_x', 
                         'TEAM_ID_x', 'TEAM_ABBREVIATION_x', 'TEAM_CITY_x', 'PLAYER_NAME_x', 
                          'START_POSITION_x', 'COMMENT_x', 'MIN_x', 'AST_x', 'FG_PCT_x',
                         'FG_PCT', 'FG3_PCT', 'FT_PCT', 'E_NET_RATING', 'NET_RATING', 
                         'AST_PCT', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'EFG_PCT', 'TS_PCT', 
                         'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_FT', 'CFG_PCT',
                        'UFG_PCT', 'DFG_PCT'], 
                       errors= 'ignore')

        # Convert date to datetime
        player_full['GAME_DATE'] = pd.to_datetime(player_full['GAME_DATE'])
        
        # Convert W/L to a binary 1/0 for win/loss
        player_full['WL'] = (player_full['WL'].str[0] == 'W').astype(int)
        
        # Add a binary home_game feature
        player_full['HOME_GAME'] = 0 
        player_full['HOME_GAME'] = (player_full['MATCHUP'].str[:6] == player_full['TEAM_ABBREVIATION'].str[:] + ' vs').astype(int)
        
        # convert the percentages into raw numbers (because we will be averaging them, we don't want to average percentages)
        # for example, percentage of points from midrange will be points scored from midrange
        player_full['FG2M'] = player_full['FGM'] - player_full['FG3M']
        player_full['FG2A'] = player_full['FGA'] - player_full['FG3A']
        player_full['PTS_2PT'] = (player_full['PTS'] * player_full['PCT_PTS_2PT']).astype('int8')
        player_full['PTS_2PT_MR'] = (player_full['PTS'] * player_full['PCT_PTS_2PT_MR']).astype('int8')
        player_full['PTS_3PT'] = (player_full['PTS'] * player_full['PCT_PTS_3PT']).astype('int8')
        player_full['PTS_FB'] = (player_full['PTS'] * player_full['PCT_PTS_FB']).astype('int8')
        player_full['PTS_OFF_TOV'] = (player_full['PTS'] * player_full['PCT_PTS_OFF_TOV']).astype('int8')
        player_full['PTS_PAINT'] = (player_full['PTS'] * player_full['PCT_PTS_PAINT']).astype('int8')
        player_full['AST_2PM'] = (player_full['FG2M'] * player_full['PCT_AST_2PM']).astype('int8')
        player_full['UAST_2PM'] = (player_full['FG2M'] * player_full['PCT_UAST_2PM']).astype('int8')
        player_full['AST_3PM'] = (player_full['FG3M'] * player_full['PCT_AST_3PM']).astype('int8')
        player_full['UAST_3PM'] = (player_full['FG3M'] * player_full['PCT_UAST_3PM']).astype('int8')



        player_full = player_full.drop(columns = ['PCT_PTS_2PT', 'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 
                                                  'PCT_PTS_FB', 'PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT',
                                                  'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM',
                                                  'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM', 
                                                  'FGM', 'FGA', 'AST_TOV', 'USG_PCT', 'PACE',
                                                  'PACE_PER40', ], errors='ignore')
        
        
        full_player_data.append(player_full)
        
        
    
    player_df = pd.concat(full_player_data)
    
    
    player_df['home_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 1 else row['MATCHUP'][-3:], axis=1)
    player_df['away_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 0 else row['MATCHUP'][-3:], axis=1)
    
    player_df[['home_team_abbr', 'away_team_abbr']] = player_df[['home_team_abbr', 'away_team_abbr']].replace({'NOH':'NOP',
                                                                                                               'NJN':'BKN'})
    
    # Reorder columns
    player_df = player_df[['SEASON_YEAR', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'PLAYER_ID', 
                           'home_team_abbr', 'away_team_abbr', 'GAME_ID', 
                           'GAME_DATE', 'MATCHUP', 'WL', 'HOME_GAME', 'MIN',
                           'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB',
                           'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD',
                           'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'E_DEF_RATING',
                           'AST_RATIO', 'TM_TOV_PCT', 'E_USG_PCT',
                           'E_PACE', 'POSS', 'PIE',
                           'FG2M', 'FG2A', 'PTS_2PT_MR', 'PTS_FB', 
                           'PTS_OFF_TOV', 'PTS_PAINT', 'AST_2PM',
                           'UAST_2PM', 'AST_3PM', 'UAST_3PM']]
    
    return player_df

player_df = gather_data(seasons)

['2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21']
player_gls_shape: (25153, 64) player_gls_adv_shape: (31419, 31) player_gls_scoring_shape: (29475, 24)
after_merging shape (25153, 83)
player_gls_shape: (20758, 64) player_gls_adv_shape: (27638, 31) player_gls_scoring_shape: (25455, 24)
after_merging shape (20758, 83)
player_gls_shape: (25757, 64) player_gls_adv_shape: (33542, 31) player_gls_scoring_shape: (31338, 24)
after_merging shape (25757, 83)
player_gls_shape: (25618, 64) player_gls_adv_shape: (33681, 31) player_gls_scoring_shape: (31378, 24)
after_merging shape (25618, 83)
player_gls_shape: (25981, 64) player_gls_adv_shape: (33522, 31) player_gls_scoring_shape: (31412, 24)
after_merging shape (25981, 83)
player_gls_shape: (26078, 64) player_gls_adv_shape: (33659, 31) player_gls_scoring_shape: (31423, 24)
after_merging shape (26078, 83)
player_gls_shape: (26139, 64) player_gls_adv_shape: (33610, 31) player_gls

In [88]:
player_df

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
0,2010-11,MIL,Corey Maggette,1894,OKC,MIL,21001225,2011-04-13,MIL @ OKC,1,0,18.776667,0,0,6,6,0,1,2,2,1,0,0,2,3,12,2,111.0,105.4,16.7,16.7,0.239,96.89,39,0.139,3,5,3,2,0,2,3,0,0,0
1,2010-11,NOH,Jarrett Jack,101127,DAL,NOP,21001223,2011-04-13,NOH @ DAL,0,0,28.658333,1,1,5,6,0,0,7,1,0,1,0,2,4,22,-22,100.3,137.0,28.0,4.0,0.296,95.91,56,0.144,7,14,6,2,0,8,2,4,1,0
2,2010-11,NJN,Jordan Farmar,200770,CHI,BKN,21001222,2011-04-13,NJN @ CHI,0,0,38.166667,2,6,5,6,0,4,12,4,0,2,0,1,4,21,7,107.9,102.8,35.3,11.8,0.274,91.28,71,0.198,5,9,3,1,6,6,0,5,1,1
3,2010-11,NYK,Amar'e Stoudemire,2405,BOS,NYK,21001216,2011-04-13,NYK @ BOS,0,0,20.450000,0,0,2,3,2,2,1,2,0,1,0,1,4,14,-3,107.6,107.5,5.3,10.5,0.384,101.49,40,0.072,6,15,4,0,0,7,1,4,0,0
4,2010-11,OKC,Byron Mullens,201957,OKC,MIL,21001225,2011-04-13,OKC vs. MIL,0,1,29.178333,0,0,2,4,1,4,0,2,1,0,1,2,2,10,-22,90.1,130.8,0.0,15.4,0.191,97.12,60,0.041,4,9,0,1,4,8,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8689,2020-21,GSW,Jordan Poole,1629673,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,17.916667,0,1,0,0,0,2,3,2,0,0,0,2,1,2,-3,83.8,94.8,33.3,22.2,0.135,103.63,38,0.000,1,3,0,0,0,2,0,1,0,0
8690,2020-21,GSW,Andrew Wiggins,203952,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,31.233333,2,6,3,4,0,2,1,4,0,1,2,4,4,13,-28,74.4,115.8,4.3,17.4,0.262,111.88,72,-0.044,2,10,2,0,0,2,1,1,2,0
8691,2020-21,GSW,Juan Toscano-Anderson,1629308,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,13.250000,0,0,0,0,0,4,2,0,0,2,0,3,1,4,-9,70.5,95.5,50.0,0.0,0.058,119.84,33,0.208,2,2,0,2,0,4,2,0,0,0
8692,2020-21,LAC,Luke Kennard,1628379,LAL,LAC,22000002,2020-12-22,LAC @ LAL,1,0,21.055000,0,3,0,0,0,2,2,1,2,1,0,3,0,4,-3,116.9,112.2,22.2,11.1,0.144,104.59,44,0.032,2,3,0,0,0,4,1,1,0,0


In [292]:
def aggregate_player_stats(df = player_df):

    seasons = player_df['SEASON_YEAR'].unique()

    df = df.sort_values('GAME_DATE')

    for season in seasons:
        season_df = df.loc[df['SEASON_YEAR'] == season]
        player_ids = season_df['PLAYER_ID'].unique()
        for p_id in tqdm(player_ids, desc='Progress'):
            player_log = season_df.loc[season_df['PLAYER_ID'] == p_id]

            avg_player_data = player_log.iloc[:, 11:].shift(1).rolling(10, min_periods=5).mean()

            cols = avg_player_data.columns

            df.loc[(df['SEASON_YEAR'] == season) & (df['PLAYER_ID'] == p_id), cols] = avg_player_data

    df.to_csv('player_avg_last10.csv', index=False)
        
    return df
    
player_stats = aggregate_player_stats(df=player_df)

Progress: 100%|██████████████████████████████████████████████████████████████████████| 452/452 [00:47<00:00,  9.52it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 478/478 [00:54<00:00,  8.72it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 469/469 [01:01<00:00,  7.61it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 482/482 [00:53<00:00,  8.97it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 492/492 [00:52<00:00,  9.34it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 476/476 [00:52<00:00,  9.00it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 486/486 [00:53<00:00,  9.07it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 540/540 [00:55<00:00,  9.65it/s]
Progress: 100%|█████████████████████████

In [293]:
player_stats.loc[player_stats['GAME_ID']==21000075].sort_values('MIN', ascending=False)

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
23603,2010-11,MIA,LeBron James,2544,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,34.323333,1.0,3.4,6.2,8.0,0.0,4.6,7.2,5.2,1.4,0.8,0.0,1.8,4.4,20.4,14.0,109.96,83.66,24.92,16.64,0.3224,94.648,66.8,0.1922,5.6,10.6,3.8,4.6,4.8,7.2,1.8,3.4,1.0,0.0
23527,2010-11,MIA,Chris Bosh,2547,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.736667,0.0,0.2,3.8,4.6,1.0,5.4,1.6,0.8,1.0,1.0,0.8,2.2,2.8,13.0,15.0,117.26,87.02,10.52,4.7,0.2164,92.952,60.6,0.1264,4.6,10.6,4.0,0.0,2.0,4.6,2.6,1.6,0.0,0.0
23563,2010-11,MIA,Dwyane Wade,2548,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.570667,1.0,2.6,5.8,8.0,1.2,3.8,4.2,3.0,1.6,1.0,1.0,2.8,5.4,22.4,10.8,113.96,90.06,15.52,10.78,0.3498,93.192,60.0,0.1774,6.8,13.8,1.6,4.8,5.6,11.6,1.8,4.2,0.2,0.8
23730,2010-11,MIA,James Jones,2592,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,27.129,3.6,6.8,0.0,0.0,0.2,2.2,0.8,0.0,0.6,0.4,0.0,2.0,1.2,11.6,6.2,105.4,91.18,10.04,0.0,0.1364,94.588,53.6,0.1214,0.4,1.0,0.8,1.0,2.4,0.0,0.4,0.0,3.6,0.0
23685,2010-11,MIA,Carlos Arroyo,2306,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,23.359667,0.4,0.8,1.2,1.4,0.4,3.2,2.4,1.0,0.6,0.0,0.0,1.4,0.8,7.2,13.4,111.9,82.1,23.26,9.62,0.1394,91.746,43.8,0.1214,2.4,4.6,4.2,0.2,0.8,0.2,1.4,0.6,0.4,0.0
23585,2010-11,MIA,Udonis Haslem,2617,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.872,0.0,0.0,1.6,2.0,1.8,7.0,0.2,0.8,0.4,0.4,0.4,3.6,2.4,7.2,2.8,98.2,91.04,2.22,12.52,0.151,96.178,45.8,0.1274,2.8,5.8,3.6,0.8,1.0,1.6,2.4,0.4,0.0,0.0
23712,2010-11,MIA,Eddie House,2067,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.689667,2.0,3.2,1.0,1.2,0.0,1.8,1.8,0.8,1.4,0.0,0.2,1.4,0.6,9.0,-0.2,98.72,98.4,19.54,7.78,0.1596,93.754,44.4,0.1304,1.0,3.4,1.8,1.0,0.6,0.0,0.6,0.4,1.8,0.2
23518,2010-11,MIA,Joel Anthony,201202,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,20.329333,0.0,0.0,0.6,1.2,1.2,3.2,0.6,0.4,0.4,1.4,0.0,3.4,1.0,1.4,7.8,101.32,76.84,20.0,30.0,0.0406,91.318,37.6,0.054,0.4,0.6,0.0,0.4,0.2,0.8,0.2,0.2,0.0,0.0
23728,2010-11,MIA,Zydrunas Ilgauskas,980,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,14.649667,0.0,0.0,0.8,1.2,2.2,2.0,0.6,0.6,0.6,0.8,0.0,3.6,1.0,6.0,10.2,127.04,85.52,7.3,15.56,0.19,97.444,29.8,0.0874,2.6,4.8,3.6,0.0,0.6,1.4,2.0,0.4,0.0,0.0
23581,2010-11,NOH,Chris Paul,101108,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [294]:
# Drop Null Rows
player_stats = player_stats.dropna(axis=0)

In [390]:
player_stats = player_stats.sort_values('GAME_DATE')

cutoff = int(player_stats['GAME_ID'].nunique()*0.8)
train_ids = player_stats['GAME_ID'].unique()[:cutoff]
test_ids = player_stats['GAME_ID'].unique()[cutoff:]
test_ids

array([21800251, 21800249, 21800253, ..., 22000424, 22000426, 22000425])

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
20709,2018-19,NYK,Mitchell Robinson,1629011,NYK,POR,21800251,2018-11-20,NYK vs. POR,0,1,20.535833,0.0,0.000000,0.900000,1.400000,2.200000,2.6,0.500000,0.400000,0.9,2.400000,0.400000,3.500000,1.000000,4.900000,-2.300000,102.670000,109.730000,5.840000,8.110000,0.094200,102.380000,43.600000,0.041500,2.0,3.600000,0.200000,0.200000,0.9,3.600000,1.200000,0.7,0.0,0.0
20716,2018-19,WAS,Thomas Bryant,1628418,WAS,LAC,21800249,2018-11-20,WAS vs. LAC,1,1,5.291667,0.0,0.666667,1.166667,1.333333,0.666667,1.0,0.666667,0.333333,0.0,0.166667,0.333333,0.666667,0.833333,4.166667,-0.666667,95.066667,102.616667,10.183333,18.516667,0.255833,106.966667,12.666667,0.486167,1.5,2.166667,0.333333,0.166667,0.5,2.333333,1.333333,0.0,0.0,0.0
20710,2018-19,WAS,John Wall,202322,WAS,LAC,21800249,2018-11-20,WAS vs. LAC,1,1,35.409667,2.1,4.800000,3.000000,4.600000,0.500000,3.0,8.300000,3.600000,1.9,0.900000,1.100000,2.300000,3.800000,21.100000,-4.900000,105.090000,110.230000,26.870000,11.850000,0.274200,104.644000,75.600000,0.132000,5.9,12.200000,1.800000,2.100000,3.9,9.400000,0.400000,5.1,1.6,0.3
20714,2018-19,NYK,Tim Hardaway Jr.,203501,NYK,POR,21800251,2018-11-20,NYK vs. POR,0,1,32.392167,3.1,8.600000,6.200000,7.700000,0.500000,3.5,2.700000,2.200000,1.2,0.200000,1.400000,2.500000,5.000000,23.700000,-6.100000,105.960000,115.700000,10.810000,8.880000,0.288400,103.145000,68.300000,0.111500,4.1,8.500000,3.100000,1.900000,4.0,4.300000,0.400000,3.2,2.3,0.6
20717,2018-19,WAS,Tomas Satoransky,203107,WAS,LAC,21800249,2018-11-20,WAS vs. LAC,1,1,11.343500,0.3,0.800000,0.500000,0.600000,0.300000,1.0,2.200000,0.300000,0.6,0.200000,0.200000,1.200000,0.500000,3.800000,0.800000,103.040000,111.810000,25.380000,5.670000,0.132500,100.710000,24.000000,0.101500,1.2,2.200000,0.400000,0.200000,0.6,2.000000,0.600000,0.5,0.3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,2020-21,HOU,Ben McLemore,203463,WAS,HOU,22000421,2021-02-15,HOU @ WAS,0,0,9.688167,0.7,2.900000,0.900000,1.100000,0.200000,0.9,0.600000,0.600000,0.2,0.000000,0.100000,0.900000,0.400000,3.400000,-2.900000,92.930000,112.540000,7.520000,11.070000,0.185900,103.524000,21.400000,0.009800,0.2,0.400000,0.000000,0.900000,0.6,0.300000,0.200000,0.0,0.7,0.0
99,2020-21,UTA,Joe Ingles,204060,UTA,PHI,22000423,2021-02-15,UTA vs. PHI,1,1,29.227000,2.8,5.900000,2.000000,2.200000,0.500000,3.3,5.900000,1.800000,0.4,0.100000,0.400000,1.600000,2.100000,12.800000,7.700000,118.550000,106.390000,35.330000,10.850000,0.163400,100.949000,60.600000,0.119400,1.2,2.300000,0.000000,1.600000,0.4,2.200000,0.600000,0.5,1.9,0.7
119,2020-21,SAC,Glenn Robinson III,203922,SAC,BKN,22000426,2021-02-15,SAC vs. BKN,0,1,12.590000,0.6,1.800000,0.400000,0.500000,0.200000,0.9,0.800000,0.400000,0.3,0.200000,0.400000,1.000000,0.400000,3.800000,-2.200000,108.610000,112.650000,15.510000,3.710000,0.136400,102.622000,27.100000,0.042800,0.8,2.100000,0.200000,0.200000,0.5,1.400000,0.400000,0.4,0.6,0.0
92,2020-21,UTA,Miye Oni,1629671,UTA,PHI,22000423,2021-02-15,UTA vs. PHI,1,1,6.903000,0.2,0.800000,0.400000,0.400000,0.800000,1.4,0.100000,0.500000,0.2,0.000000,0.100000,1.100000,0.300000,1.000000,0.900000,97.970000,100.030000,3.330000,29.760000,0.133300,100.088000,15.200000,-0.018100,0.0,0.100000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.2,0.0


In [632]:

scaler = StandardScaler()

standardized_stats = player_stats
standardized_stats.iloc[:, 11:] = scaler.fit_transform(standardized_stats.iloc[:, 11:])

info, features_standardized, targets = prep_data_for_model(df=standardized_stats)

progress: 100%|█████████████████████████████████████████████████████████████████| 11515/11515 [00:58<00:00, 198.33it/s]


In [607]:
test = player_stats.loc[player_stats['GAME_ID'] ==22000421]
home = test.loc[test['HOME_GAME'] == 1]
away = test.loc[test['HOME_GAME'] == 0]
# home[['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE']].iloc[0].values


In [567]:
def prep_data_for_model(df = player_stats):
    matchup_info = []
    matchups = []
    targets = []
    
    all_team_gamelogs = pd.read_csv('./data/team_gamelogs_2000_present.csv')

    game_results = all_team_gamelogs.loc[all_team_gamelogs['MATCHUP'].str.contains('vs'), ['GAME_ID', 'PLUS_MINUS']]
    game_results

    for game_id in tqdm(df['GAME_ID'].unique(), desc="progress"):
        game_df = df.loc[df['GAME_ID'] == game_id]
        


        home = game_df.loc[game_df['HOME_GAME'] == 1].sort_values('MIN', ascending=False)
        home_stats = home.iloc[:10, 11:].values
        
        away = game_df.loc[game_df['HOME_GAME'] == 0].sort_values('MIN', ascending=False)
        away_stats = away.iloc[:10, 11:].values
                
        # If a team had one of their game's removed (because it was in the first 5 games of the season), remove the entire game
        if (home_stats.shape[0] == 0) or (away_stats.shape[0] == 0):
            continue
        
        if home_stats.shape[0] < 10:
            missing_rows = 10 - home_stats.shape[0]
            padding = np.zeros((missing_rows, home_stats.shape[1]))
            home_stats = np.concatenate([home_stats, padding], axis=0)
        
        if away_stats.shape[0] < 10:
            missing_rows = 10 - away_stats.shape[0]
            padding = np.zeros((missing_rows, away_stats.shape[1]))
            away_stats = np.concatenate([away_stats, padding], axis=0)

        matchup = np.stack([home_stats, away_stats], axis=0)
        matchups.append(matchup)

        point_diff = game_results.loc[game_results['GAME_ID'] == game_id, 'PLUS_MINUS'].values[0]
        targets.append(point_diff)
        
        matchup_info.append(home[['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE']].iloc[0].values)
        
    matchup_features = np.stack(matchups, axis=0)
    matchup_info = pd.DataFrame(matchup_info, columns=['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE'])
    
    return matchup_info, matchup_features, np.array(targets)

matchup_info, matchup_features, targets = prep_data_for_model(df = player_stats)

progress: 100%|█████████████████████████████████████████████████████████████████| 11515/11515 [00:55<00:00, 208.86it/s]


In [568]:
matchup_info.shape, matchup_features.shape, targets.shape

((11423, 5), (11423, 2, 10, 34), (11423,))

In [565]:
pd.DataFrame(matchup_info, columns=['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE'])

Unnamed: 0,SEASON_YEAR,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE
0,2010-11,MIN,ATL,21000074,2010-11-05
1,2010-11,MIL,NOP,21000083,2010-11-06
2,2010-11,UTA,LAC,21000086,2010-11-06
3,2010-11,SAC,MEM,21000088,2010-11-06
4,2010-11,POR,TOR,21000087,2010-11-06
5,2010-11,MIA,BKN,21000082,2010-11-06
6,2010-11,HOU,MIN,21000092,2010-11-07
7,2010-11,LAL,POR,21000094,2010-11-07


In [669]:
home_cols = ["home_"+x for x in player_stats.columns[11:]]
away_cols = ["away_"+x for x in player_stats.columns[11:]]

cols = home_cols
pd.DataFrame(matchup_features[1011][0], columns=cols)


Unnamed: 0,home_MIN,home_FG3M,home_FG3A,home_FTM,home_FTA,home_OREB,home_DREB,home_AST,home_TOV,home_STL,home_BLK,home_BLKA,home_PF,home_PFD,home_PTS,home_PLUS_MINUS,home_E_OFF_RATING,home_E_DEF_RATING,home_AST_RATIO,home_TM_TOV_PCT,home_E_USG_PCT,home_E_PACE,home_POSS,home_PIE,home_FG2M,home_FG2A,home_PTS_2PT_MR,home_PTS_FB,home_PTS_OFF_TOV,home_PTS_PAINT,home_AST_2PM,home_UAST_2PM,home_AST_3PM,home_UAST_3PM
0,37.413167,2.4,5.5,6.8,7.5,0.4,5.1,3.1,1.4,1.3,0.5,0.5,1.8,4.4,25.8,8.5,111.68,100.32,12.45,5.48,0.2677,93.921,72.2,0.1717,5.9,12.2,6.0,4.6,3.7,5.3,3.5,2.3,2.4,0.0
1,33.202833,0.5,1.2,4.6,5.5,1.0,2.1,7.6,4.2,2.1,0.2,1.2,1.8,4.0,21.3,5.3,111.75,103.11,23.86,13.02,0.3322,93.403,64.1,0.1557,7.6,16.0,2.4,5.3,4.1,11.8,1.5,5.6,0.2,0.3
2,30.2245,0.0,0.0,1.1,1.9,2.9,6.6,0.5,0.8,0.7,3.6,0.6,3.5,1.6,10.9,4.1,108.99,101.51,4.16,6.27,0.1743,94.292,58.7,0.1179,4.9,9.7,2.6,1.6,0.9,6.9,3.1,1.4,0.0,0.0
3,28.758,1.8,4.7,3.9,4.5,0.3,2.4,2.5,2.1,1.2,0.3,0.8,2.9,3.3,16.1,10.8,114.58,95.14,13.84,12.36,0.2461,92.805,55.0,0.1219,3.4,6.0,1.5,3.8,3.7,5.1,1.5,1.5,1.5,0.2
4,24.448667,0.0,0.0,2.2,3.8,2.0,5.3,0.9,2.4,0.7,0.9,0.8,3.5,2.6,7.2,3.7,106.35,97.97,8.93,25.43,0.1548,96.336,47.3,0.0788,2.5,4.3,0.2,0.2,1.1,4.6,1.5,1.0,0.0,0.0
5,24.298167,0.3,1.4,0.4,0.6,0.6,2.8,1.2,0.8,1.0,0.8,0.3,1.3,0.7,4.5,1.2,106.48,102.74,16.58,11.08,0.1046,93.919,47.4,0.0719,1.6,2.9,1.1,0.8,1.2,2.0,0.9,0.5,0.3,0.0
6,23.185833,0.0,0.0,0.8,1.4,1.4,2.9,1.5,0.3,0.7,0.2,0.5,2.5,1.7,6.0,10.0,118.14,96.21,21.83,2.82,0.1091,91.556,44.2,0.0797,2.6,4.5,1.6,0.0,0.6,3.4,1.7,0.7,0.0,0.0
7,17.276667,0.0,0.0,0.3,0.6,1.8,2.6,0.5,0.6,0.8,0.4,0.6,2.5,1.2,6.7,4.2,110.1,100.51,6.0,15.84,0.1843,90.138,32.9,0.0845,3.2,5.8,2.0,0.2,0.8,4.1,1.8,1.1,0.0,0.0
8,15.297167,0.3,0.9,0.2,0.2,0.4,1.3,4.4,0.8,0.3,0.1,0.3,0.8,0.5,2.5,4.9,107.95,92.3,52.3,9.0,0.1225,91.763,29.9,0.0985,0.7,2.2,0.2,0.6,0.6,1.1,0.1,0.6,0.3,0.0
9,13.856,1.6,3.7,0.0,0.1,0.2,1.8,0.4,0.2,0.4,0.1,0.1,1.1,0.0,5.4,1.7,101.98,99.47,8.69,4.24,0.1424,90.529,26.4,0.0064,0.3,0.4,0.6,0.0,1.5,0.0,0.3,0.0,1.6,0.0


(9138, 10, 68)

In [642]:
# Split up data - training set and testing set
X_train, X_test, y_train, y_test = train_test_split(matchup_features, targets, test_size=0.2)


print("X_train shape:", X_train.shape,
      "\nX_test shape:", X_test.shape,
      "\ny_test shape:", y_train.shape,
      "\ny_train shape:", y_test.shape)

X_train shape: (9138, 2, 10, 34) 
X_test shape: (2285, 2, 10, 34) 
y_test shape: (9138,) 
y_train shape: (2285,)


In [643]:
class PlayerBoxScoreMatchupsDataset(Dataset):
    """"""
    def __init__(self, matchup_features, targets):
        self.targets = targets
        self.matchup_data = matchup_features
        
    def __len__(self):
        return len(self.matchup_data)
    
    def __getitem__(self, index):
        X = self.matchup_data[index]
        y = self.targets[index]
        
        return X, y
    

In [646]:
# Create DataSet and DataLoaders

training_set = PlayerBoxScoreMatchupsDataset(X_train[:5], y_train[:5])
validation_set = PlayerBoxScoreMatchupsDataset(X_test, y_test)

train_loader = DataLoader(training_set, batch_size=100, shuffle=False)
val_loader = DataLoader(validation_set, batch_size=100, shuffle=False)

print(training_set[4][0].shape, training_set[0][1])

(2, 10, 34) -14


In [647]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv1d(2, 2, kernel_size=(10,1), stride=1)
        self.conv1_bn = nn.BatchNorm2d(2)
        
        self.fc1 = nn.Linear(68, 64)
        self.fc1_bn = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 16)
        self.fc2_bn = nn.BatchNorm1d(16)
        self.fc3 = nn.Linear(16, 1)
        
        self.dropout = nn.Dropout(0.25)
        
        
    def forward(self, x):
        
        x = self.conv1_bn(F.relu(self.conv1(x)))
#         print("shape after conv1:", x.shape)
        x = x.reshape(-1, 68)
        x = self.dropout(x)
        x = self.fc1_bn(F.relu(self.fc1(x)))
        x = self.dropout(x)
        x = self.fc2_bn(F.relu(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)
        
        return x
    
net = Net()
print(net)

Net(
  (conv1): Conv1d(2, 2, kernel_size=(10, 1), stride=(1,))
  (conv1_bn): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=68, out_features=64, bias=True)
  (fc1_bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=64, out_features=16, bias=True)
  (fc2_bn): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


In [651]:
import torch.optim as optim

net = Net()
loss_function = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.03)

epochs = 100

for e in range(epochs):
    train_loss = 0
    val_loss = 0
    net.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        output = net(data.float())
        if e == 99:
            print("first output of batch:", output[0])
            print("first target of batch:", target[0])
        loss = loss_function(output, target.float())
        
        loss.backward()
        
        optimizer.step()
        train_loss += loss.item() * data.size(0)
        
#     net.eval()
#     for data, target in val_loader:
#         output = net(data.float())
#         loss = loss_function(output, target.float())
        
#         val_loss += loss.item()*data.size(0)
        
    train_loss = train_loss / len(train_loader.sampler)
    
#     val_loss = val_loss / len(val_loader.sampler)

    if e % 5 == 0:
        print('Epoch: {} \tTraining Loss: {:.4f} \tValidation Loss: {:.4f}'.format(
        e, train_loss, val_loss))
    

Epoch: 0 	Training Loss: 126.3365 	Validation Loss: 0.0000
Epoch: 5 	Training Loss: 122.8067 	Validation Loss: 0.0000
Epoch: 10 	Training Loss: 120.2158 	Validation Loss: 0.0000
Epoch: 15 	Training Loss: 111.6602 	Validation Loss: 0.0000
Epoch: 20 	Training Loss: 107.2012 	Validation Loss: 0.0000
Epoch: 25 	Training Loss: 108.3113 	Validation Loss: 0.0000
Epoch: 30 	Training Loss: 108.2531 	Validation Loss: 0.0000
Epoch: 35 	Training Loss: 106.8875 	Validation Loss: 0.0000
Epoch: 40 	Training Loss: 108.6361 	Validation Loss: 0.0000
Epoch: 45 	Training Loss: 107.5872 	Validation Loss: 0.0000
Epoch: 50 	Training Loss: 107.4210 	Validation Loss: 0.0000
Epoch: 55 	Training Loss: 108.2122 	Validation Loss: 0.0000
Epoch: 60 	Training Loss: 107.2624 	Validation Loss: 0.0000
Epoch: 65 	Training Loss: 107.0705 	Validation Loss: 0.0000
Epoch: 70 	Training Loss: 106.9041 	Validation Loss: 0.0000
Epoch: 75 	Training Loss: 107.8922 	Validation Loss: 0.0000
Epoch: 80 	Training Loss: 107.5011 	Valida

In [662]:
for data, label, in train_loader:
    print(net(data.float()))
    print(label)

tensor([[-4.0484],
        [-3.5129],
        [-5.0683],
        [-4.3891],
        [-3.1561]], grad_fn=<AddmmBackward>)
tensor([-14, -12,  -7,  15,  -4])


In [None]:
print()

In [352]:
 np.empty((1,0))

array([], shape=(1, 0), dtype=float64)

In [648]:
preds = np.empty((1,0))
for data, target in val_loader:
    
    output = net(data.float())
    preds  = np.append(preds, output.detach().numpy().T)

    
    
preds
    

array([ 0.81842214,  0.4361662 ,  0.91527337, ..., -0.16868234,
        0.06047469,  0.06885672])

In [649]:
pd.DataFrame(preds)

Unnamed: 0,0
0,0.818422
1,0.436166
2,0.915273
3,-0.379051
4,-0.287050
...,...
2280,0.641530
2281,0.032958
2282,-0.168682
2283,0.060475


## Gather Betting Data

In [359]:
# read betting data
betting_data = pd.read_csv('./data/nba_betting_data_2010_present.csv')
betting_data = betting_data.dropna()
    
def clean_bet_data(df = betting_data):

    
    df['date'] = pd.to_datetime(df['date'])
    df['home_team_abbr'] = df['home_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'
                                                                                        }
                                                                             )
    df['away_team_abbr'] = df['away_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'}
                                                                           )

    teams = df['home_team_abbr'].unique()

    df = df.sort_values(['date'])
    df['rest'] = np.nan
    for team in teams:
        team_data = df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team)]
        team_data['rest'] = (team_data['date'] - team_data['date'].shift(1)) / np.timedelta64(1, 'D')
        df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team), 'rest'] = team_data['rest']


    df['point_diff'] = df['home_score'] - df['away_score']
    df['point_total'] = df['home_score'] + df['away_score']
    df['covered_spread'] = (df['home_score'] + df['spread'] > df['away_score']).astype(int)
    df['over'] = (df['point_total'] > df['total']).astype(int)
    
    df['prev_cover'] = df['covered_spread'].shift(1)
    df['prev2_cover'] = df['covered_spread'].shift(2)
    
    relevant_betting = df[['date', 'home_team_abbr',  'away_team_abbr',
                           'home_score', 'away_score', 'point_diff', 
                           'rest', 'point_total', 'spread', 'total',
                          'covered_spread', 'over', 'prev_cover', 'prev2_cover']]

    return relevant_betting

bet_data_clean = clean_bet_data()

array([ 12, -19,  12, ...,   1,  11,   8], dtype=int64)

In [360]:
bet_data_clean[['']]

Unnamed: 0,date,home_team_abbr,away_team_abbr,home_score,away_score,point_diff,rest,point_total,spread,total,covered_spread,over,prev_cover,prev2_cover
1012,2011-12-25,NYK,BOS,106,104,2,,210,-5.0,190.5,0,1,,
1013,2011-12-25,DAL,MIA,94,105,-11,,199,4.5,188.5,0,1,0.0,
1014,2011-12-25,LAL,CHI,87,88,-1,,175,4.5,183.5,1,0,0.0,0.0
1015,2011-12-25,OKC,ORL,97,89,8,,186,-7.5,194.5,1,0,1.0,0.0
1016,2011-12-25,GSW,LAC,86,105,-19,,191,6.0,207.5,0,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11935,2021-02-04,LAL,DEN,114,93,21,4.0,207,-5.0,217.0,1,0,1.0,1.0
11934,2021-02-04,MEM,HOU,103,115,-12,1.0,218,-3.5,226.0,0,0,1.0,1.0
11933,2021-02-04,PHI,POR,105,121,-16,1.0,226,-10.0,223.0,0,1,0.0,1.0
11932,2021-02-04,DAL,GSW,116,147,-31,2.0,263,-4.0,229.5,0,1,0.0,0.0
