In [78]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

from tqdm import tqdm
from nba_api.stats.static import players, teams
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.endpoints import leaguegamelog

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error

import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

pd.options.display.max_columns=200

In [18]:
seasons = ["20{}-{}".format(x, x+1) for x in range(10, 21)]
print(seasons)
def gather_data(seasons = seasons):
    full_player_data = []
    for season in seasons:
        player_gls = pd.read_csv('./data/player_gamelogs_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_adv = pd.read_csv('./data/player_advanced_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_scoring = pd.read_csv('./data/player_scoring_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
#         player_gls_tracking = pd.read_csv('./data/player_tracking_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'object'})
    
        print("player_gls_shape:", player_gls.shape,
             "player_gls_adv_shape:", player_gls_adv.shape,
             "player_gls_scoring_shape:", player_gls_scoring.shape)
    
        player_gls = player_gls[['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID',
       'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK',
       'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']]

        player_full = pd.merge(player_gls, player_gls_adv, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
        player_full = pd.merge(player_full, player_gls_scoring, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
    
        print("after_merging shape", player_full.shape)
        
        # drop unnecessary columns
        player_full = player_full.drop(columns = ['TEAM_ID', 'TEAM_NAME', 'TEAM_ID_x',
                         'TEAM_ABBREVIATION_x','TEAM_CITY','PLAYER_NAME_x','START_POSITION',
                          'COMMENT','MIN_x','TEAM_ID_x', 'TEAM_ABBREVIATION_x','TEAM_CITY_x', 
                          'PLAYER_NAME_x', 'START_POSITION_x', 'COMMENT_x', 'MIN_x', 
                         'TEAM_ID_x', 'TEAM_ABBREVIATION_x', 'TEAM_CITY_x', 'PLAYER_NAME_x', 
                          'START_POSITION_x', 'COMMENT_x', 'MIN_x', 'AST_x', 'FG_PCT_x',
                         'FG_PCT', 'FG3_PCT', 'FT_PCT', 'E_NET_RATING', 'NET_RATING', 
                         'AST_PCT', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'EFG_PCT', 'TS_PCT', 
                         'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_FT', 'CFG_PCT',
                        'UFG_PCT', 'DFG_PCT'], 
                       errors= 'ignore')

        # Convert date to datetime
        player_full['GAME_DATE'] = pd.to_datetime(player_full['GAME_DATE'])
        
        # Convert W/L to a binary 1/0 for win/loss
        player_full['WL'] = (player_full['WL'].str[0] == 'W').astype(int)
        
        # Add a binary home_game feature
        player_full['HOME_GAME'] = 0 
        player_full['HOME_GAME'] = (player_full['MATCHUP'].str[:6] == player_full['TEAM_ABBREVIATION'].str[:] + ' vs').astype(int)
        
        # convert the percentages into raw numbers (because we will be averaging them, we don't want to average percentages)
        # for example, percentage of points from midrange will be points scored from midrange
        player_full['FG2M'] = player_full['FGM'] - player_full['FG3M']
        player_full['FG2A'] = player_full['FGA'] - player_full['FG3A']
        player_full['PTS_2PT'] = (player_full['PTS'] * player_full['PCT_PTS_2PT']).astype('int8')
        player_full['PTS_2PT_MR'] = (player_full['PTS'] * player_full['PCT_PTS_2PT_MR']).astype('int8')
        player_full['PTS_3PT'] = (player_full['PTS'] * player_full['PCT_PTS_3PT']).astype('int8')
        player_full['PTS_FB'] = (player_full['PTS'] * player_full['PCT_PTS_FB']).astype('int8')
        player_full['PTS_OFF_TOV'] = (player_full['PTS'] * player_full['PCT_PTS_OFF_TOV']).astype('int8')
        player_full['PTS_PAINT'] = (player_full['PTS'] * player_full['PCT_PTS_PAINT']).astype('int8')
        player_full['AST_2PM'] = (player_full['FG2M'] * player_full['PCT_AST_2PM']).astype('int8')
        player_full['UAST_2PM'] = (player_full['FG2M'] * player_full['PCT_UAST_2PM']).astype('int8')
        player_full['AST_3PM'] = (player_full['FG3M'] * player_full['PCT_AST_3PM']).astype('int8')
        player_full['UAST_3PM'] = (player_full['FG3M'] * player_full['PCT_UAST_3PM']).astype('int8')



        player_full = player_full.drop(columns = ['PCT_PTS_2PT', 'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 
                                                  'PCT_PTS_FB', 'PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT',
                                                  'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM',
                                                  'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM', 
                                                  'FGM', 'FGA', 'AST_TOV', 'USG_PCT', 'PACE',
                                                  'PACE_PER40', ], errors='ignore')
        
        
        full_player_data.append(player_full)
        
        
    
    player_df = pd.concat(full_player_data)
    
    
    player_df['home_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 1 else row['MATCHUP'][-3:], axis=1)
    player_df['away_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 0 else row['MATCHUP'][-3:], axis=1)
    
    player_df[['home_team_abbr', 'away_team_abbr']] = player_df[['home_team_abbr', 'away_team_abbr']].replace({'NOH':'NOP',
                                                                                                               'NJN':'BKN'})
    
    # Reorder columns
    player_df = player_df[['SEASON_YEAR', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'PLAYER_ID', 
                           'home_team_abbr', 'away_team_abbr', 'GAME_ID', 
                           'GAME_DATE', 'MATCHUP', 'WL', 'HOME_GAME', 'MIN',
                           'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB',
                           'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD',
                           'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'E_DEF_RATING',
                           'AST_RATIO', 'TM_TOV_PCT', 'E_USG_PCT',
                           'E_PACE', 'POSS', 'PIE',
                           'FG2M', 'FG2A', 'PTS_2PT_MR', 'PTS_FB', 
                           'PTS_OFF_TOV', 'PTS_PAINT', 'AST_2PM',
                           'UAST_2PM', 'AST_3PM', 'UAST_3PM']]
    
    return player_df

player_df = gather_data(seasons)

['2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21']
player_gls_shape: (25153, 64) player_gls_adv_shape: (31419, 31) player_gls_scoring_shape: (29475, 24)
after_merging shape (25153, 83)
player_gls_shape: (20758, 64) player_gls_adv_shape: (27638, 31) player_gls_scoring_shape: (25455, 24)
after_merging shape (20758, 83)
player_gls_shape: (25757, 64) player_gls_adv_shape: (33542, 31) player_gls_scoring_shape: (31338, 24)
after_merging shape (25757, 83)
player_gls_shape: (25618, 64) player_gls_adv_shape: (33681, 31) player_gls_scoring_shape: (31378, 24)
after_merging shape (25618, 83)
player_gls_shape: (25981, 64) player_gls_adv_shape: (33522, 31) player_gls_scoring_shape: (31412, 24)
after_merging shape (25981, 83)
player_gls_shape: (26078, 64) player_gls_adv_shape: (33659, 31) player_gls_scoring_shape: (31423, 24)
after_merging shape (26078, 83)
player_gls_shape: (26139, 64) player_gls_adv_shape: (33610, 31) player_gls

In [88]:
player_df

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
0,2010-11,MIL,Corey Maggette,1894,OKC,MIL,21001225,2011-04-13,MIL @ OKC,1,0,18.776667,0,0,6,6,0,1,2,2,1,0,0,2,3,12,2,111.0,105.4,16.7,16.7,0.239,96.89,39,0.139,3,5,3,2,0,2,3,0,0,0
1,2010-11,NOH,Jarrett Jack,101127,DAL,NOP,21001223,2011-04-13,NOH @ DAL,0,0,28.658333,1,1,5,6,0,0,7,1,0,1,0,2,4,22,-22,100.3,137.0,28.0,4.0,0.296,95.91,56,0.144,7,14,6,2,0,8,2,4,1,0
2,2010-11,NJN,Jordan Farmar,200770,CHI,BKN,21001222,2011-04-13,NJN @ CHI,0,0,38.166667,2,6,5,6,0,4,12,4,0,2,0,1,4,21,7,107.9,102.8,35.3,11.8,0.274,91.28,71,0.198,5,9,3,1,6,6,0,5,1,1
3,2010-11,NYK,Amar'e Stoudemire,2405,BOS,NYK,21001216,2011-04-13,NYK @ BOS,0,0,20.450000,0,0,2,3,2,2,1,2,0,1,0,1,4,14,-3,107.6,107.5,5.3,10.5,0.384,101.49,40,0.072,6,15,4,0,0,7,1,4,0,0
4,2010-11,OKC,Byron Mullens,201957,OKC,MIL,21001225,2011-04-13,OKC vs. MIL,0,1,29.178333,0,0,2,4,1,4,0,2,1,0,1,2,2,10,-22,90.1,130.8,0.0,15.4,0.191,97.12,60,0.041,4,9,0,1,4,8,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8689,2020-21,GSW,Jordan Poole,1629673,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,17.916667,0,1,0,0,0,2,3,2,0,0,0,2,1,2,-3,83.8,94.8,33.3,22.2,0.135,103.63,38,0.000,1,3,0,0,0,2,0,1,0,0
8690,2020-21,GSW,Andrew Wiggins,203952,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,31.233333,2,6,3,4,0,2,1,4,0,1,2,4,4,13,-28,74.4,115.8,4.3,17.4,0.262,111.88,72,-0.044,2,10,2,0,0,2,1,1,2,0
8691,2020-21,GSW,Juan Toscano-Anderson,1629308,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,13.250000,0,0,0,0,0,4,2,0,0,2,0,3,1,4,-9,70.5,95.5,50.0,0.0,0.058,119.84,33,0.208,2,2,0,2,0,4,2,0,0,0
8692,2020-21,LAC,Luke Kennard,1628379,LAL,LAC,22000002,2020-12-22,LAC @ LAL,1,0,21.055000,0,3,0,0,0,2,2,1,2,1,0,3,0,4,-3,116.9,112.2,22.2,11.1,0.144,104.59,44,0.032,2,3,0,0,0,4,1,1,0,0


In [292]:
def aggregate_player_stats(df = player_df):

    seasons = player_df['SEASON_YEAR'].unique()

    df = df.sort_values('GAME_DATE')

    for season in seasons:
        season_df = df.loc[df['SEASON_YEAR'] == season]
        player_ids = season_df['PLAYER_ID'].unique()
        for p_id in tqdm(player_ids, desc='Progress'):
            player_log = season_df.loc[season_df['PLAYER_ID'] == p_id]

            avg_player_data = player_log.iloc[:, 11:].shift(1).rolling(10, min_periods=5).mean()

            cols = avg_player_data.columns

            df.loc[(df['SEASON_YEAR'] == season) & (df['PLAYER_ID'] == p_id), cols] = avg_player_data

    df.to_csv('player_avg_last10.csv', index=False)
        
    return df
    
player_stats = aggregate_player_stats(df=player_df)

Progress: 100%|██████████████████████████████████████████████████████████████████████| 452/452 [00:47<00:00,  9.52it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 478/478 [00:54<00:00,  8.72it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 469/469 [01:01<00:00,  7.61it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 482/482 [00:53<00:00,  8.97it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 492/492 [00:52<00:00,  9.34it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 476/476 [00:52<00:00,  9.00it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 486/486 [00:53<00:00,  9.07it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 540/540 [00:55<00:00,  9.65it/s]
Progress: 100%|█████████████████████████

In [293]:
player_stats.loc[player_stats['GAME_ID']==21000075].sort_values('MIN', ascending=False)

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
23603,2010-11,MIA,LeBron James,2544,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,34.323333,1.0,3.4,6.2,8.0,0.0,4.6,7.2,5.2,1.4,0.8,0.0,1.8,4.4,20.4,14.0,109.96,83.66,24.92,16.64,0.3224,94.648,66.8,0.1922,5.6,10.6,3.8,4.6,4.8,7.2,1.8,3.4,1.0,0.0
23527,2010-11,MIA,Chris Bosh,2547,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.736667,0.0,0.2,3.8,4.6,1.0,5.4,1.6,0.8,1.0,1.0,0.8,2.2,2.8,13.0,15.0,117.26,87.02,10.52,4.7,0.2164,92.952,60.6,0.1264,4.6,10.6,4.0,0.0,2.0,4.6,2.6,1.6,0.0,0.0
23563,2010-11,MIA,Dwyane Wade,2548,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.570667,1.0,2.6,5.8,8.0,1.2,3.8,4.2,3.0,1.6,1.0,1.0,2.8,5.4,22.4,10.8,113.96,90.06,15.52,10.78,0.3498,93.192,60.0,0.1774,6.8,13.8,1.6,4.8,5.6,11.6,1.8,4.2,0.2,0.8
23730,2010-11,MIA,James Jones,2592,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,27.129,3.6,6.8,0.0,0.0,0.2,2.2,0.8,0.0,0.6,0.4,0.0,2.0,1.2,11.6,6.2,105.4,91.18,10.04,0.0,0.1364,94.588,53.6,0.1214,0.4,1.0,0.8,1.0,2.4,0.0,0.4,0.0,3.6,0.0
23685,2010-11,MIA,Carlos Arroyo,2306,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,23.359667,0.4,0.8,1.2,1.4,0.4,3.2,2.4,1.0,0.6,0.0,0.0,1.4,0.8,7.2,13.4,111.9,82.1,23.26,9.62,0.1394,91.746,43.8,0.1214,2.4,4.6,4.2,0.2,0.8,0.2,1.4,0.6,0.4,0.0
23585,2010-11,MIA,Udonis Haslem,2617,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.872,0.0,0.0,1.6,2.0,1.8,7.0,0.2,0.8,0.4,0.4,0.4,3.6,2.4,7.2,2.8,98.2,91.04,2.22,12.52,0.151,96.178,45.8,0.1274,2.8,5.8,3.6,0.8,1.0,1.6,2.4,0.4,0.0,0.0
23712,2010-11,MIA,Eddie House,2067,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.689667,2.0,3.2,1.0,1.2,0.0,1.8,1.8,0.8,1.4,0.0,0.2,1.4,0.6,9.0,-0.2,98.72,98.4,19.54,7.78,0.1596,93.754,44.4,0.1304,1.0,3.4,1.8,1.0,0.6,0.0,0.6,0.4,1.8,0.2
23518,2010-11,MIA,Joel Anthony,201202,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,20.329333,0.0,0.0,0.6,1.2,1.2,3.2,0.6,0.4,0.4,1.4,0.0,3.4,1.0,1.4,7.8,101.32,76.84,20.0,30.0,0.0406,91.318,37.6,0.054,0.4,0.6,0.0,0.4,0.2,0.8,0.2,0.2,0.0,0.0
23728,2010-11,MIA,Zydrunas Ilgauskas,980,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,14.649667,0.0,0.0,0.8,1.2,2.2,2.0,0.6,0.6,0.6,0.8,0.0,3.6,1.0,6.0,10.2,127.04,85.52,7.3,15.56,0.19,97.444,29.8,0.0874,2.6,4.8,3.6,0.0,0.6,1.4,2.0,0.4,0.0,0.0
23581,2010-11,NOH,Chris Paul,101108,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [294]:
# Drop Null Rows
player_stats = player_stats.dropna(axis=0)

In [276]:
player_stats['GAME_ID'].unique()[2]

21000075

In [291]:
player_df.loc[player_df['GAME_ID']==21000075].sort_values('MIN', ascending=False)

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
23603,2010-11,MIA,LeBron James,2544,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,41.2,1,2,7,8,0,7,10,3,3,1,0,0,4,20,-6,97.9,111.5,30.3,9.1,0.259,92.53,80,0.175,5,14,4,4,2,6,1,4,0,1
23563,2010-11,MIA,Dwyane Wade,2548,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,39.873333,1,5,13,13,2,8,2,7,2,2,1,1,10,28,-7,99.3,113.1,6.7,23.3,0.331,92.52,75,0.183,6,11,0,1,8,12,1,4,1,0
23745,2010-11,NOH,Trevor Ariza,2772,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,36.378333,3,7,0,0,0,6,1,2,1,0,1,4,1,13,9,108.3,93.8,6.3,12.5,0.194,92.13,71,0.055,2,6,0,4,5,4,2,0,3,0
23581,2010-11,NOH,Chris Paul,101108,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,36.283333,0,2,3,3,0,2,19,3,5,0,2,4,4,13,9,117.1,99.2,52.8,8.3,0.218,93.74,69,0.184,5,11,6,2,0,4,0,5,0,0
23577,2010-11,NOH,Emeka Okafor,2731,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,34.953333,0,0,2,3,4,9,1,1,0,1,0,2,2,26,0,108.1,101.6,6.3,6.3,0.209,90.47,63,0.279,12,13,0,0,4,23,8,3,0,0
23527,2010-11,MIA,Chris Bosh,2547,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,33.553333,1,1,0,0,0,1,1,1,0,0,0,1,5,15,-2,104.9,117.2,6.7,6.7,0.19,92.96,64,0.073,6,12,10,0,1,1,6,0,1,0
23718,2010-11,NOH,Marco Belinelli,201158,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,31.07,0,2,2,2,0,1,3,1,0,0,0,3,3,8,4,116.4,103.5,25.0,8.3,0.139,90.1,58,0.037,3,5,2,2,2,4,3,0,0,0
23544,2010-11,NOH,David West,2561,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,28.22,0,0,5,6,4,3,0,2,1,0,1,4,3,15,-8,106.1,112.7,0.0,11.8,0.284,91.65,53,0.071,5,12,1,1,1,7,3,2,0,0
23730,2010-11,MIA,James Jones,2592,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,27.993333,2,6,0,0,0,1,2,0,0,1,0,3,0,6,15,110.0,85.2,25.0,0.0,0.099,95.61,55,0.028,0,0,0,0,0,0,0,0,2,0
23585,2010-11,MIA,Udonis Haslem,2617,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,25.781667,0,0,2,2,1,5,0,0,0,0,0,4,3,8,5,102.8,91.9,0.0,0.0,0.149,94.73,50,0.071,3,7,2,0,0,4,2,0,0,0


In [307]:
def prep_data_for_model(df = player_stats):
    matchup_info = []
    matchups = []
    targets = []
    
    all_team_gamelogs = pd.read_csv('./data/team_gamelogs_2000_present.csv')

    game_results = all_team_gamelogs.loc[all_team_gamelogs['MATCHUP'].str.contains('vs'), ['GAME_ID', 'PLUS_MINUS']]
    game_results

    for game_id in tqdm(df['GAME_ID'].unique()[:20], desc="progress"):
        game_df = df.loc[df['GAME_ID'] == game_id]
        
        point_diff = game_results.loc[game_results['GAME_ID'] == game_id, 'PLUS_MINUS'].values[0]
        targets.append(point_diff)

        home = game_df.loc[game_df['HOME_GAME'] == 1].sort_values('MIN', ascending=False)
        home_stats = home.iloc[:10, 11:].values
        
        away = game_df.loc[game_df['HOME_GAME'] == 0].sort_values('MIN', ascending=False)
        away_stats = away.iloc[:10, 11:].values
                
        # If a team had one of their game's removed (because it was in the first 5 games of the season), remove the entire game
        if (home_stats.shape[0] == 0) or (away_stats.shape[0] == 0):
            continue
        
        if home_stats.shape[0] < 10:
            missing_rows = 10 - home_stats.shape[0]
            padding = np.zeros((missing_rows, home_stats.shape[1]))
            home_stats = np.concatenate([home_stats, padding], axis=0)
        
        if away_stats.shape[0] < 10:
            missing_rows = 10 - away_stats.shape[0]
            padding = np.zeros((missing_rows, away_stats.shape[1]))
            away_stats = np.concatenate([away_stats, padding], axis=0)

        print(home_stats.shape, away_stats.shape)
        matchup = np.concatenate([home_stats, away_stats], axis=1)
        matchups.append(matchup)

    matchup_features = np.stack(matchups, axis=0)
    return matchup_features, np.array(targets)

matchup_features, targets = prep_data_for_model(df = player_stats)

progress:  90%|███████████████████████████████████████████████████████████████▉       | 18/20 [00:00<00:00, 235.43it/s]

7
(10, 34) (10, 34)
8
(10, 34) (10, 34)
9
(10, 34) (10, 34)
7
(10, 34) (10, 34)
8
(10, 34) (10, 34)
9
(10, 34) (10, 34)
11
(10, 34) (11, 34)





ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [303]:
matchup_features.shape

(1, 10, 68)

In [299]:
home_cols = ["home_"+x for x in player_stats.columns[11:]]
away_cols = ["away_"+x for x in player_stats.columns[11:]]

cols = home_cols + away_cols
pd.DataFrame(matchup_features[1], columns=cols)


IndexError: index 1 is out of bounds for axis 0 with size 1

In [None]:
def view_data(matchup_data, game_id):
    

In [136]:
# Split up data - training set and testing set
X_train, X_test, y_train, y_test = train_test_split(matchup_data, targets, test_size=0.2)

X_train.shape, X_test.shape

((9212, 10, 68), (2303, 10, 68))

In [198]:
torch.tensor([0,1,2,4]).float()

tensor([0., 1., 2., 4.])

In [205]:
class PlayerBoxScoreMatchupsDataset(Dataset):
    """"""
    def __init__(self, matchup_data, targets):
        self.targets = targets
        self.matchup_data = matchup_data
        
    def __len__(self):
        return len(self.matchup_data)
    
    def __getitem__(self, index):
        X = matchup_data[index]
        y = targets[index]
        
        return X, y
    

In [206]:
training_set = PlayerBoxScoreMatchupsDataset(X_train, y_train)
validation_set = PlayerBoxScoreMatchupsDataset(X_test, y_test)

train_loader = DataLoader(training_set, batch_size=100, shuffle=False)
val_loader = DataLoader(training_set, batch_size=100, shuffle=False)
for i, batch in enumerate(loader): 
    print("idx:", i)
    print("features:", batch[0])
    print("targets:", batch[1])
    break

idx: 0
features: tensor([[[35.4694,  1.0000,  3.1667,  ...,  1.4000,  0.0000,  0.0000],
         [32.9544,  1.0000,  3.0000,  ...,  0.8000,  0.8000,  0.0000],
         [32.0394,  0.1667,  0.3333,  ...,  4.2000,  1.0000,  0.2000],
         ...,
         [19.2931,  0.0000,  0.0000,  ...,  1.0000,  0.0000,  0.0000],
         [15.9469,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[35.4694,  1.0000,  3.1667,  ...,  1.4000,  0.0000,  0.0000],
         [32.9544,  1.0000,  3.0000,  ...,  0.8000,  0.8000,  0.0000],
         [32.0394,  0.1667,  0.3333,  ...,  4.2000,  1.0000,  0.2000],
         ...,
         [19.2931,  0.0000,  0.0000,  ...,  1.0000,  0.0000,  0.0000],
         [15.9469,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[35.4694,  1.0000,  3.1667,  ...,  1.4000,  0.0000,  0.0000],
         [32.9544,  1.0000, 

In [207]:
type(matchup_data[0])

numpy.ndarray

In [240]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv1d(10, 5, kernel_size=1, stride=1)
        
        self.fc1 = nn.Linear(68*5, 1)
#         self.fc2 = nn.Linear(64, 16)
#         self.fc3 = nn.Linear(16, 1)
        
#         self.dropout = nn.Dropout(0.25)
        
        
    def forward(self, x):
        x = F.relu(self.conv1(x))
#         print(x.shape)
        x = x.view(-1, 68*5)
#         print(x.shape)
#         x = self.dropout(x)
        x = self.fc1(x)
#         x = self.dropout(x)
#         x = F.relu(self.fc2(x))
#         x = self.dropout(x)
#         x = self.fc3(x)
        
        return x
    
net = Net()
print(net)

Net(
  (conv1): Conv1d(10, 5, kernel_size=(1,), stride=(1,))
  (fc1): Linear(in_features=340, out_features=1, bias=True)
)


In [238]:
len(val_loader.sampler)

9212

In [243]:
import torch.optim as optim

loss_function = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)

epochs = 10

for e in range(epochs):
    train_loss = 0
    val_loss = 0
    net.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        
        output = net(data.float())
#         print(output.shape)
#         print(output)
#         print(target)
        loss = loss_function(output, target.float())
        
        loss.backward()
        
        optimizer.step()
        
        train_loss += loss.item() * data.size(0)
        
    net.eval()
    for data, target in val_loader:
        output = net(data.float())
        loss = loss_function(output, target.float())
        
        val_loss += loss.item()*data.size(0)
        
    train_loss = train_loss / len(train_loader.sampler)
    
    val_loss = val_loss /len(val_loader.sampler)


    print('Epoch: {} \tTraining Loss: {:.4f} \tValidation Loss: {:.4f}'.format(
    e, train_loss, val_loss))
    

Epoch: 0 	Training Loss: 179.1169 	Validation Loss: 191.6084
Epoch: 1 	Training Loss: 178.7357 	Validation Loss: 178.5898
Epoch: 2 	Training Loss: 178.4589 	Validation Loss: 178.4333
Epoch: 3 	Training Loss: 178.4411 	Validation Loss: 178.3990
Epoch: 4 	Training Loss: 178.4146 	Validation Loss: 178.5805
Epoch: 5 	Training Loss: 178.4451 	Validation Loss: 178.5486
Epoch: 6 	Training Loss: 178.4373 	Validation Loss: 178.5127
Epoch: 7 	Training Loss: 178.4458 	Validation Loss: 178.4289
Epoch: 8 	Training Loss: 178.4294 	Validation Loss: 178.5136
Epoch: 9 	Training Loss: 178.4349 	Validation Loss: 178.4540


In [188]:
net(test_input.type(torch.float))

torch.Size([100, 1, 68])
torch.Size([100, 68])


tensor([[-0.1018],
        [-0.0696],
        [-0.0693],
        [-0.1368],
        [-0.0764],
        [-0.0737],
        [-0.1385],
        [-0.0279],
        [-0.0863],
        [-0.0772],
        [-0.0931],
        [-0.0613],
        [-0.0631],
        [-0.0309],
        [-0.1466],
        [-0.0946],
        [-0.0900],
        [-0.0721],
        [-0.0578],
        [-0.1200],
        [-0.0390],
        [-0.0851],
        [ 0.0129],
        [-0.0701],
        [-0.1577],
        [-0.0531],
        [-0.0988],
        [-0.0422],
        [-0.1033],
        [-0.1155],
        [-0.0796],
        [-0.0670],
        [-0.1114],
        [-0.1087],
        [-0.0338],
        [-0.1232],
        [-0.0717],
        [-0.0271],
        [-0.1342],
        [-0.1936],
        [-0.1077],
        [-0.0828],
        [-0.1227],
        [-0.1213],
        [-0.0318],
        [-0.1139],
        [-0.0808],
        [-0.0140],
        [-0.0160],
        [-0.0659],
        [ 0.0056],
        [-0.1207],
        [-0.

## Gather Betting Data

In [None]:
def clean_bet_data(df = betting_data):
    
    # read betting data
    betting_data = pd.read_csv('./data/nba_betting_data_2010_present.csv')
    betting_data = betting_data.dropna()
    
    
    df['date'] = pd.to_datetime(df['date'])
    df['home_team_abbr'] = df['home_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'
                                                                                        }
                                                                             )
    df['away_team_abbr'] = df['away_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'}
                                                                           )

    teams = df['home_team_abbr'].unique()

    df = df.sort_values(['date'])
    df['rest'] = np.nan
    for team in teams:
        team_data = df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team)]
        team_data['rest'] = (team_data['date'] - team_data['date'].shift(1)) / np.timedelta64(1, 'D')
        df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team), 'rest'] = team_data['rest']


    df['point_diff'] = df['home_score'] - df['away_score']
    df['point_total'] = df['home_score'] + df['away_score']
    df['covered_spread'] = (df['home_score'] + df['spread'] > df['away_score']).astype(int)
    df['over'] = (df['point_total'] > df['total']).astype(int)
    
    df['prev_cover'] = df['covered_spread'].shift(1)
    df['prev2_cover'] = df['covered_spread'].shift(2)
    
    relevant_betting = df[['date', 'home_team_abbr',  'away_team_abbr',
                           'home_score', 'away_score', 'point_diff', 
                           'rest', 'point_total', 'spread', 'total',
                          'covered_spread', 'over', 'prev_cover', 'prev2_cover']]

    return relevant_betting

bet_data_clean = clean_bet_data()