# Convolutional Neural Network to Predict NBA Games

## Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

from tqdm import tqdm
from nba_api.stats.static import players, teams
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.endpoints import leaguegamelog

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error

import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

pd.options.display.max_columns=200

In C:\Users\Jordan Nishimura\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Jordan Nishimura\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Jordan Nishimura\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\Users\Jordan Nishimura\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Jordan Nishimura\Anaconda3\lib\site-

## Gathering Player Data

In [18]:
seasons = ["200{}-0{}".format(x, x+1) if x!=9 else "200{}-{}".format(x, x+1) for x in range(0, 10)]
seasons2 = ["20{}-{}".format(x, x+1) for x in range(10, 21)]
seasons.extend(seasons2)
print(seasons)

def gather_data(seasons = seasons):
    """This function pulls player boxscores from 2000-current season and merges them into one dataframe"""
    full_player_data = []
    for season in seasons:
        player_gls = pd.read_csv('./data/player_gamelogs_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_adv = pd.read_csv('./data/player_advanced_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_scoring = pd.read_csv('./data/player_scoring_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
#         player_gls_tracking = pd.read_csv('./data/player_tracking_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'object'})
    
        print("player_gls_shape:", player_gls.shape,
             "player_gls_adv_shape:", player_gls_adv.shape,
             "player_gls_scoring_shape:", player_gls_scoring.shape)
    
        player_gls = player_gls[['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID',
       'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK',
       'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']]

        player_full = pd.merge(player_gls, player_gls_adv, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
        player_full = pd.merge(player_full, player_gls_scoring, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
    
        print("after_merging shape", player_full.shape)
        
        # drop unnecessary columns
        player_full = player_full.drop(columns = ['TEAM_ID', 'TEAM_NAME', 'TEAM_ID_x',
                         'TEAM_ABBREVIATION_x','TEAM_CITY','PLAYER_NAME_x','START_POSITION',
                          'COMMENT','MIN_x','TEAM_ID_x', 'TEAM_ABBREVIATION_x','TEAM_CITY_x', 
                          'PLAYER_NAME_x', 'START_POSITION_x', 'COMMENT_x', 'MIN_x', 
                         'TEAM_ID_x', 'TEAM_ABBREVIATION_x', 'TEAM_CITY_x', 'PLAYER_NAME_x', 
                          'START_POSITION_x', 'COMMENT_x', 'MIN_x', 'AST_x', 'FG_PCT_x',
                         'FG_PCT', 'FG3_PCT', 'FT_PCT', 'E_NET_RATING', 'NET_RATING', 
                         'AST_PCT', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'EFG_PCT', 'TS_PCT', 
                         'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_FT', 'CFG_PCT',
                        'UFG_PCT', 'DFG_PCT'], 
                       errors= 'ignore')

        # Convert date to datetime
        player_full['GAME_DATE'] = pd.to_datetime(player_full['GAME_DATE'])
        
        # Convert W/L to a binary 1/0 for win/loss
        player_full['WL'] = (player_full['WL'].str[0] == 'W').astype(int)
        
        # Add a binary home_game feature
        player_full['HOME_GAME'] = 0 
        player_full['HOME_GAME'] = (player_full['MATCHUP'].str[:6] == player_full['TEAM_ABBREVIATION'].str[:] + ' vs').astype(int)
        
        # convert the percentages into raw numbers (because we will be averaging them, we don't want to average percentages)
        # for example, percentage of points from midrange will be points scored from midrange
        player_full['FG2M'] = player_full['FGM'] - player_full['FG3M']
        player_full['FG2A'] = player_full['FGA'] - player_full['FG3A']
        player_full['PTS_2PT'] = (player_full['PTS'] * player_full['PCT_PTS_2PT']).astype('int8')
        player_full['PTS_2PT_MR'] = (player_full['PTS'] * player_full['PCT_PTS_2PT_MR']).astype('int8')
        player_full['PTS_3PT'] = (player_full['PTS'] * player_full['PCT_PTS_3PT']).astype('int8')
        player_full['PTS_FB'] = (player_full['PTS'] * player_full['PCT_PTS_FB']).astype('int8')
        player_full['PTS_OFF_TOV'] = (player_full['PTS'] * player_full['PCT_PTS_OFF_TOV']).astype('int8')
        player_full['PTS_PAINT'] = (player_full['PTS'] * player_full['PCT_PTS_PAINT']).astype('int8')
        player_full['AST_2PM'] = (player_full['FG2M'] * player_full['PCT_AST_2PM']).astype('int8')
        player_full['UAST_2PM'] = (player_full['FG2M'] * player_full['PCT_UAST_2PM']).astype('int8')
        player_full['AST_3PM'] = (player_full['FG3M'] * player_full['PCT_AST_3PM']).astype('int8')
        player_full['UAST_3PM'] = (player_full['FG3M'] * player_full['PCT_UAST_3PM']).astype('int8')



        player_full = player_full.drop(columns = ['PCT_PTS_2PT', 'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 
                                                  'PCT_PTS_FB', 'PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT',
                                                  'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM',
                                                  'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM', 
                                                  'FGM', 'FGA', 'AST_TOV', 'USG_PCT', 'PACE',
                                                  'PACE_PER40', ], errors='ignore')
        
        
        full_player_data.append(player_full)
        
        
    
    player_df = pd.concat(full_player_data)
    
    
    player_df['home_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 1 else row['MATCHUP'][-3:], axis=1)
    player_df['away_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 0 else row['MATCHUP'][-3:], axis=1)
    
    player_df[['home_team_abbr', 'away_team_abbr']] = player_df[['home_team_abbr', 'away_team_abbr']].replace({'NOH':'NOP',
                                                                                                               'NJN':'BKN'})
    
    # Reorder columns
    player_df = player_df[['SEASON_YEAR', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'PLAYER_ID', 
                           'home_team_abbr', 'away_team_abbr', 'GAME_ID', 
                           'GAME_DATE', 'MATCHUP', 'WL', 'HOME_GAME', 'MIN',
                           'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB',
                           'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD',
                           'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'E_DEF_RATING',
                           'AST_RATIO', 'TM_TOV_PCT', 'E_USG_PCT',
                           'E_PACE', 'POSS', 'PIE',
                           'FG2M', 'FG2A', 'PTS_2PT_MR', 'PTS_FB', 
                           'PTS_OFF_TOV', 'PTS_PAINT', 'AST_2PM',
                           'UAST_2PM', 'AST_3PM', 'UAST_3PM']]
    
    return player_df

player_df = gather_data(seasons)

['2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21']
player_gls_shape: (25153, 64) player_gls_adv_shape: (31419, 31) player_gls_scoring_shape: (29475, 24)
after_merging shape (25153, 83)
player_gls_shape: (20758, 64) player_gls_adv_shape: (27638, 31) player_gls_scoring_shape: (25455, 24)
after_merging shape (20758, 83)
player_gls_shape: (25757, 64) player_gls_adv_shape: (33542, 31) player_gls_scoring_shape: (31338, 24)
after_merging shape (25757, 83)
player_gls_shape: (25618, 64) player_gls_adv_shape: (33681, 31) player_gls_scoring_shape: (31378, 24)
after_merging shape (25618, 83)
player_gls_shape: (25981, 64) player_gls_adv_shape: (33522, 31) player_gls_scoring_shape: (31412, 24)
after_merging shape (25981, 83)
player_gls_shape: (26078, 64) player_gls_adv_shape: (33659, 31) player_gls_scoring_shape: (31423, 24)
after_merging shape (26078, 83)
player_gls_shape: (26139, 64) player_gls_adv_shape: (33610, 31) player_gls

In [88]:
player_df

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
0,2010-11,MIL,Corey Maggette,1894,OKC,MIL,21001225,2011-04-13,MIL @ OKC,1,0,18.776667,0,0,6,6,0,1,2,2,1,0,0,2,3,12,2,111.0,105.4,16.7,16.7,0.239,96.89,39,0.139,3,5,3,2,0,2,3,0,0,0
1,2010-11,NOH,Jarrett Jack,101127,DAL,NOP,21001223,2011-04-13,NOH @ DAL,0,0,28.658333,1,1,5,6,0,0,7,1,0,1,0,2,4,22,-22,100.3,137.0,28.0,4.0,0.296,95.91,56,0.144,7,14,6,2,0,8,2,4,1,0
2,2010-11,NJN,Jordan Farmar,200770,CHI,BKN,21001222,2011-04-13,NJN @ CHI,0,0,38.166667,2,6,5,6,0,4,12,4,0,2,0,1,4,21,7,107.9,102.8,35.3,11.8,0.274,91.28,71,0.198,5,9,3,1,6,6,0,5,1,1
3,2010-11,NYK,Amar'e Stoudemire,2405,BOS,NYK,21001216,2011-04-13,NYK @ BOS,0,0,20.450000,0,0,2,3,2,2,1,2,0,1,0,1,4,14,-3,107.6,107.5,5.3,10.5,0.384,101.49,40,0.072,6,15,4,0,0,7,1,4,0,0
4,2010-11,OKC,Byron Mullens,201957,OKC,MIL,21001225,2011-04-13,OKC vs. MIL,0,1,29.178333,0,0,2,4,1,4,0,2,1,0,1,2,2,10,-22,90.1,130.8,0.0,15.4,0.191,97.12,60,0.041,4,9,0,1,4,8,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8689,2020-21,GSW,Jordan Poole,1629673,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,17.916667,0,1,0,0,0,2,3,2,0,0,0,2,1,2,-3,83.8,94.8,33.3,22.2,0.135,103.63,38,0.000,1,3,0,0,0,2,0,1,0,0
8690,2020-21,GSW,Andrew Wiggins,203952,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,31.233333,2,6,3,4,0,2,1,4,0,1,2,4,4,13,-28,74.4,115.8,4.3,17.4,0.262,111.88,72,-0.044,2,10,2,0,0,2,1,1,2,0
8691,2020-21,GSW,Juan Toscano-Anderson,1629308,BKN,GSW,22000001,2020-12-22,GSW @ BKN,0,0,13.250000,0,0,0,0,0,4,2,0,0,2,0,3,1,4,-9,70.5,95.5,50.0,0.0,0.058,119.84,33,0.208,2,2,0,2,0,4,2,0,0,0
8692,2020-21,LAC,Luke Kennard,1628379,LAL,LAC,22000002,2020-12-22,LAC @ LAL,1,0,21.055000,0,3,0,0,0,2,2,1,2,1,0,3,0,4,-3,116.9,112.2,22.2,11.1,0.144,104.59,44,0.032,2,3,0,0,0,4,1,1,0,0


In [292]:
def aggregate_player_stats(df = player_df):

    seasons = player_df['SEASON_YEAR'].unique()

    df = df.sort_values('GAME_DATE')

    for season in seasons:
        season_df = df.loc[df['SEASON_YEAR'] == season]
        player_ids = season_df['PLAYER_ID'].unique()
        for p_id in tqdm(player_ids, desc='Progress'):
            player_log = season_df.loc[season_df['PLAYER_ID'] == p_id]

            avg_player_data = player_log.iloc[:, 11:].shift(1).rolling(10, min_periods=5).mean()

            cols = avg_player_data.columns

            df.loc[(df['SEASON_YEAR'] == season) & (df['PLAYER_ID'] == p_id), cols] = avg_player_data

    df.to_csv('player_avg_last10.csv', index=False)
        
    return df
    
player_stats = aggregate_player_stats(df=player_df)

Progress: 100%|██████████████████████████████████████████████████████████████████████| 452/452 [00:47<00:00,  9.52it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 478/478 [00:54<00:00,  8.72it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 469/469 [01:01<00:00,  7.61it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 482/482 [00:53<00:00,  8.97it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 492/492 [00:52<00:00,  9.34it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 476/476 [00:52<00:00,  9.00it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 486/486 [00:53<00:00,  9.07it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 540/540 [00:55<00:00,  9.65it/s]
Progress: 100%|█████████████████████████

In [293]:
player_stats.loc[player_stats['GAME_ID']==21000075].sort_values('MIN', ascending=False)

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
23603,2010-11,MIA,LeBron James,2544,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,34.323333,1.0,3.4,6.2,8.0,0.0,4.6,7.2,5.2,1.4,0.8,0.0,1.8,4.4,20.4,14.0,109.96,83.66,24.92,16.64,0.3224,94.648,66.8,0.1922,5.6,10.6,3.8,4.6,4.8,7.2,1.8,3.4,1.0,0.0
23527,2010-11,MIA,Chris Bosh,2547,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.736667,0.0,0.2,3.8,4.6,1.0,5.4,1.6,0.8,1.0,1.0,0.8,2.2,2.8,13.0,15.0,117.26,87.02,10.52,4.7,0.2164,92.952,60.6,0.1264,4.6,10.6,4.0,0.0,2.0,4.6,2.6,1.6,0.0,0.0
23563,2010-11,MIA,Dwyane Wade,2548,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.570667,1.0,2.6,5.8,8.0,1.2,3.8,4.2,3.0,1.6,1.0,1.0,2.8,5.4,22.4,10.8,113.96,90.06,15.52,10.78,0.3498,93.192,60.0,0.1774,6.8,13.8,1.6,4.8,5.6,11.6,1.8,4.2,0.2,0.8
23730,2010-11,MIA,James Jones,2592,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,27.129,3.6,6.8,0.0,0.0,0.2,2.2,0.8,0.0,0.6,0.4,0.0,2.0,1.2,11.6,6.2,105.4,91.18,10.04,0.0,0.1364,94.588,53.6,0.1214,0.4,1.0,0.8,1.0,2.4,0.0,0.4,0.0,3.6,0.0
23685,2010-11,MIA,Carlos Arroyo,2306,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,23.359667,0.4,0.8,1.2,1.4,0.4,3.2,2.4,1.0,0.6,0.0,0.0,1.4,0.8,7.2,13.4,111.9,82.1,23.26,9.62,0.1394,91.746,43.8,0.1214,2.4,4.6,4.2,0.2,0.8,0.2,1.4,0.6,0.4,0.0
23585,2010-11,MIA,Udonis Haslem,2617,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.872,0.0,0.0,1.6,2.0,1.8,7.0,0.2,0.8,0.4,0.4,0.4,3.6,2.4,7.2,2.8,98.2,91.04,2.22,12.52,0.151,96.178,45.8,0.1274,2.8,5.8,3.6,0.8,1.0,1.6,2.4,0.4,0.0,0.0
23712,2010-11,MIA,Eddie House,2067,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.689667,2.0,3.2,1.0,1.2,0.0,1.8,1.8,0.8,1.4,0.0,0.2,1.4,0.6,9.0,-0.2,98.72,98.4,19.54,7.78,0.1596,93.754,44.4,0.1304,1.0,3.4,1.8,1.0,0.6,0.0,0.6,0.4,1.8,0.2
23518,2010-11,MIA,Joel Anthony,201202,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,20.329333,0.0,0.0,0.6,1.2,1.2,3.2,0.6,0.4,0.4,1.4,0.0,3.4,1.0,1.4,7.8,101.32,76.84,20.0,30.0,0.0406,91.318,37.6,0.054,0.4,0.6,0.0,0.4,0.2,0.8,0.2,0.2,0.0,0.0
23728,2010-11,MIA,Zydrunas Ilgauskas,980,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,14.649667,0.0,0.0,0.8,1.2,2.2,2.0,0.6,0.6,0.6,0.8,0.0,3.6,1.0,6.0,10.2,127.04,85.52,7.3,15.56,0.19,97.444,29.8,0.0874,2.6,4.8,3.6,0.0,0.6,1.4,2.0,0.4,0.0,0.0
23581,2010-11,NOH,Chris Paul,101108,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [2]:
player_stats1 = pd.read_csv('player_l10_avg_part1.csv', parse_dates=['GAME_DATE'])
player_stats2 = pd.read_csv('player_l10_avg_part2.csv', parse_dates=['GAME_DATE'])

player_stats = pd.concat([player_stats1, player_stats2])

In [3]:
player_stats = player_stats.sort_values('GAME_DATE')
player_stats

Unnamed: 0,SEASON_YEAR,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,HOME_GAME,WL,MIN,FG2M,FG2A,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,PACE_PER40,POSS,PIE,PTS_2PT,PTS_2PT_MR,PTS_3PT,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
0,2000-01,393,Rod Strickland,WAS,WAS,BKN,20000063,2000-11-08,WAS vs. NJN,1,0,33.932000,5.2,11.2,0.0,0.2,5.0,6.8,0.4,3.4,7.2,2.0,1.8,0.4,1.0,1.4,0.0,15.4,3.2,94.30,91.38,29.20,8.78,0.2196,79.114,67.0,0.1900,10.4014,4.3968,0.000000e+00,3.4004,1.9990,5.9968,2.4006,2.7994,0.0000,0.0000
19,2000-01,1517,Bobby Jackson,SAC,SAC,GSW,20000071,2000-11-08,SAC vs. GSW,1,1,37.470000,3.2,7.6,0.4,2.2,3.8,4.4,0.8,4.4,4.4,2.2,2.0,0.2,0.4,2.6,0.0,11.4,0.0,93.02,92.42,26.64,10.76,0.1636,77.442,72.8,0.1114,6.3984,2.8004,1.199800e+00,4.2000,4.2000,3.6008,0.9996,2.2004,0.2000,0.2000
20,2000-01,98,Nick Anderson,SAC,SAC,GSW,20000071,2000-11-08,SAC vs. GSW,1,1,7.110000,0.0,0.6,0.6,1.0,0.0,0.0,0.4,0.6,0.0,0.2,0.6,0.2,0.4,0.2,0.0,1.8,2.0,101.58,92.32,0.00,20.00,0.1124,77.998,14.2,0.1032,0.0000,0.0000,1.800000e+00,0.0000,0.0000,0.0000,0.0000,0.0000,0.6000,0.0000
21,2000-01,1727,Pat Garrity,ORL,ORL,CHH,20000064,2000-11-08,ORL vs. CHH,1,0,23.225333,2.2,4.8,1.0,3.8,1.4,1.4,0.2,1.6,1.0,1.0,0.6,0.2,0.2,3.0,0.0,8.8,0.6,91.72,94.00,9.04,7.66,0.1870,82.008,47.2,0.0342,4.4022,3.1980,2.999800e+00,0.6000,2.2018,1.1986,0.9998,1.2002,1.0000,0.0000
22,2000-01,270,Horace Grant,LAL,SAS,LAL,20000068,2000-11-08,LAL @ SAS,0,0,36.589333,2.8,6.4,0.0,0.2,1.4,2.0,3.6,4.8,0.8,0.6,0.2,0.4,0.2,3.4,0.0,7.0,5.2,100.00,94.52,6.82,4.60,0.1020,75.982,69.4,0.0592,5.6006,1.6018,0.000000e+00,0.4000,0.4000,3.9988,2.3998,0.4002,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244706,2020-21,1628389,Bam Adebayo,MIA,SAC,MIA,22000447,2021-02-18,MIA @ SAC,0,1,35.135167,7.5,13.3,0.0,0.0,4.7,5.8,2.2,7.2,5.0,2.8,1.3,1.4,0.9,2.9,5.8,19.7,0.0,108.30,107.61,20.64,11.56,0.2393,81.570,71.7,0.1614,14.9998,1.9981,4.440892e-17,1.6002,2.9999,13.0017,4.7002,2.7998,0.0000,0.0000
244707,2020-21,1629130,Duncan Robinson,MIA,SAC,MIA,22000447,2021-02-18,MIA @ SAC,0,1,31.558833,0.7,1.1,3.0,8.3,0.6,0.8,0.0,3.3,1.2,1.3,0.4,0.1,0.0,2.9,0.9,11.0,1.2,103.91,102.29,9.17,10.24,0.1610,80.680,63.5,0.0523,1.4010,0.0000,8.999000e+00,1.1995,1.6001,1.4010,0.6000,0.1000,3.0000,0.0000
244708,2020-21,1630169,Tyrese Haliburton,SAC,SAC,MIA,22000447,2021-02-18,SAC vs. MIA,1,0,31.138667,3.0,4.8,2.6,6.1,0.5,0.6,1.3,3.2,5.6,1.8,1.3,0.4,0.1,1.2,0.7,14.3,0.7,114.08,114.59,30.09,9.38,0.1791,85.265,66.8,0.1232,5.9998,1.4000,7.800000e+00,1.7998,1.4007,4.6020,0.9998,2.0002,2.3001,0.2999
244694,2020-21,203507,Giannis Antetokounmpo,MIL,MIL,TOR,22000445,2021-02-18,MIL vs. TOR,1,0,34.527833,9.9,15.3,0.8,2.9,7.2,10.0,1.7,10.4,6.3,3.1,1.5,1.7,0.9,2.5,8.3,29.4,6.7,119.39,108.35,20.76,9.79,0.3133,86.329,74.1,0.2069,19.7979,0.8029,2.397400e+00,5.9008,4.4984,19.0030,4.2016,5.6984,0.1000,0.7000


In [768]:
set(player_stats['GAME_ID'].unique()) - set(all_team_gamelogs['GAME_ID'].unique()) 

{22000427,
 22000429,
 22000430,
 22000431,
 22000432,
 22000433,
 22000436,
 22000437,
 22000438,
 22000439,
 22000440,
 22000441,
 22000443,
 22000444,
 22000445,
 22000446,
 22000447,
 22000562,
 22000564}

In [5]:
seasons = ["200{}-0{}".format(x, x+1) if x!=9 else "200{}-{}".format(x, x+1) for x in range(0, 10)]
seasons2 = ["20{}-{}".format(x, x+1) for x in range(10, 21)]
seasons.extend(seasons2)
print(seasons)

def prep_data_for_model(df = player_stats):
    matchup_info = []
    matchups = []
    targets = []
    
    all_gamelogs = []
    for season in seasons:
        gl = pd.read_csv("./data/basic_team_boxscores/team_gamelogs_{}.csv".format(season))
        all_gamelogs.append(gl)

    all_team_gamelogs = pd.concat(all_gamelogs)
    game_results = all_team_gamelogs.loc[all_team_gamelogs['MATCHUP'].str.contains('vs'), ['GAME_ID', 'PLUS_MINUS']]

    game_ids = set(player_stats['GAME_ID'].unique()) & set(all_team_gamelogs['GAME_ID'].unique()) 
    
    for game_id in tqdm(game_ids, desc="progress"):
        game_df = df.loc[df['GAME_ID'] == game_id]
   
        home = game_df.loc[game_df['HOME_GAME'] == 1].sort_values('MIN', ascending=False)
        home_stats = home.iloc[:10, 11:].values
        
        away = game_df.loc[game_df['HOME_GAME'] == 0].sort_values('MIN', ascending=False)
        away_stats = away.iloc[:10, 11:].values
                
        # If a team had one of their game's removed (because it was in the first 5 games of the season), remove the entire game
        if (home_stats.shape[0] == 0) or (away_stats.shape[0] == 0):
            continue
        
        if home_stats.shape[0] < 10:
            missing_rows = 10 - home_stats.shape[0]
            padding = np.zeros((missing_rows, home_stats.shape[1]))
            home_stats = np.concatenate([home_stats, padding], axis=0)
        
        if away_stats.shape[0] < 10:
            missing_rows = 10 - away_stats.shape[0]
            padding = np.zeros((missing_rows, away_stats.shape[1]))
            away_stats = np.concatenate([away_stats, padding], axis=0)

        matchup = np.stack([home_stats, away_stats], axis=0)
        matchups.append(matchup)
        
        point_diff = game_results.loc[game_results['GAME_ID'] == game_id, 'PLUS_MINUS'].values[0]
        targets.append(point_diff)
        
        matchup_info.append(home[['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE', 'WL']].iloc[0].values)
        
    matchup_features = np.stack(matchups, axis=0)
    matchup_info = pd.DataFrame(matchup_info, columns=['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE', 'WL'])
    
    return matchup_info, matchup_features, np.array(targets).reshape(-1, 1)

matchup_info, matchup_features, targets = prep_data_for_model(df = player_stats)

['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21']


progress: 100%|█████████████████████████████████████████████████████████████████| 24611/24611 [02:00<00:00, 203.50it/s]


In [6]:
matchup_info.shape, matchup_features.shape, targets.shape

((24415, 6), (24415, 2, 10, 36), (24415, 1))

In [7]:
pd.DataFrame(matchup_info, columns=['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE', 'WL'])

Unnamed: 0,SEASON_YEAR,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,WL
0,2015-16,CHI,OKC,21500070,2015-11-05,1
1,2015-16,POR,MEM,21500074,2015-11-05,1
2,2015-16,ORL,TOR,21500075,2015-11-06,1
3,2015-16,NYK,MIL,21500079,2015-11-06,0
4,2015-16,IND,MIA,21500080,2015-11-06,1
...,...,...,...,...,...,...
24410,2005-06,DAL,MIA,40500402,2006-06-11,1
24411,2005-06,MIA,DAL,40500403,2006-06-13,1
24412,2005-06,MIA,DAL,40500404,2006-06-15,1
24413,2005-06,MIA,DAL,40500405,2006-06-18,1


In [8]:
home_cols = ["home_"+x for x in player_stats.columns[11:]]
away_cols = ["away_"+x for x in player_stats.columns[11:]]

cols = home_cols
pd.DataFrame(matchup_features[1011][0], columns=cols)


Unnamed: 0,home_MIN,home_FG2M,home_FG2A,home_FG3M,home_FG3A,home_FTM,home_FTA,home_OREB,home_DREB,home_AST,home_TOV,home_STL,home_BLK,home_BLKA,home_PF,home_PFD,home_PTS,home_PLUS_MINUS,home_E_OFF_RATING,home_E_DEF_RATING,home_AST_RATIO,home_TM_TOV_PCT,home_E_USG_PCT,home_PACE_PER40,home_POSS,home_PIE,home_PTS_2PT,home_PTS_2PT_MR,home_PTS_3PT,home_PTS_FB,home_PTS_OFF_TOV,home_PTS_PAINT,home_AST_2PM,home_UAST_2PM,home_AST_3PM,home_UAST_3PM
0,38.120667,6.7,12.7,1.6,6.3,4.6,5.4,0.4,2.7,5.5,3.5,0.8,0.5,1.1,3.5,5.4,22.8,-7.2,98.07,108.68,17.96,11.58,0.277,83.894,79.8,0.0994,13.4019,5.6007,4.8,5.2996,4.0025,7.8027,2.2,4.5,1.1001,0.4999
1,35.949833,3.8,9.6,2.8,8.0,2.2,2.5,1.1,3.9,4.7,3.9,0.9,0.7,1.0,2.7,3.6,18.2,-6.6,95.89,107.13,17.35,14.18,0.2633,84.516,76.0,0.0835,7.6012,1.7999,8.3993,3.7,2.7016,5.8013,1.1003,2.6997,1.6997,1.1003
2,32.035167,2.3,5.0,0.8,2.8,1.5,2.2,2.6,4.0,2.0,1.3,1.3,0.3,0.8,2.3,1.7,8.5,-2.8,98.37,104.93,15.93,11.49,0.1277,83.78,67.5,0.063,4.6009,0.5995,2.4002,1.3002,1.8015,4.0015,0.8998,1.4002,0.8,0.0
3,29.309167,4.1,11.9,0.0,0.2,3.5,4.6,2.8,7.5,2.5,2.8,0.3,0.7,1.6,3.3,3.8,11.7,-7.4,95.72,108.83,12.25,14.9,0.243,82.971,60.7,0.0733,8.1982,3.0019,0.0,4.4408920000000007e-17,1.2008,5.2018,3.2,0.9,0.0,0.0
4,24.606,1.7,4.0,2.2,6.0,1.5,1.9,0.5,3.6,1.2,0.8,0.4,0.2,0.5,2.6,2.2,11.5,-2.8,108.15,111.6,10.78,5.63,0.207,84.547,52.2,0.0801,3.4002,1.6013,6.5995,2.2009,0.9998,1.7989,1.2001,0.4999,2.0,0.2
5,22.055,0.7,1.4,1.3,3.7,0.3,0.8,0.5,1.5,2.7,0.7,0.5,0.0,0.1,2.5,1.2,5.6,-3.7,95.78,108.61,24.97,6.84,0.1171,83.561,44.7,0.0309,1.4007,0.2,3.8999,0.4004,0.7003,1.2007,0.0,0.7,1.2001,0.0999
6,16.334167,2.3,4.6,0.1,1.4,1.3,2.5,0.5,2.5,1.5,1.3,0.1,0.1,0.7,1.6,2.0,6.2,-5.8,94.64,110.04,14.02,13.99,0.2163,87.489,36.0,0.0617,4.6,0.0,0.3,1.8998,0.6998,4.6,0.5999,1.7001,0.1,0.0
7,13.576667,1.3,2.9,0.7,1.4,0.6,0.8,0.4,1.7,1.1,0.9,0.0,0.0,0.3,0.4,0.5,5.3,-4.5,100.01,113.21,16.58,15.0,0.1976,85.339,29.4,0.0814,2.5997,1.5997,2.0998,0.8001,0.4003,1.0,0.9,0.4,0.7,0.0
8,11.629,1.0,1.7,0.2,1.2,0.3,0.4,0.7,1.2,1.0,0.3,0.3,0.0,0.1,1.3,0.4,2.9,-3.3,102.53,117.99,17.72,16.11,0.1319,80.978,23.9,0.0638,2.0003,0.2,0.6,0.9,0.4998,1.8003,0.6,0.4,0.2,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Split up data - training set and testing set
X_train, X_test, y_train, y_test = train_test_split(matchup_features, targets, test_size=0.2)


print("X_train shape:", X_train.shape,
      "\nX_test shape:", X_test.shape,
      "\ny_test shape:", y_train.shape,
      "\ny_train shape:", y_test.shape)

X_train shape: (19532, 2, 10, 36) 
X_test shape: (4883, 2, 10, 36) 
y_test shape: (19532, 1) 
y_train shape: (4883, 1)


X_train_tensor shape: torch.Size([64, 2, 10, 36]) 
X_test_tensor shape: torch.Size([4883, 2, 10, 36]) 
y_test_tensor shape: torch.Size([64, 1]) 
y_train_tensor shape: torch.Size([4883, 1])


In [92]:
class PlayerBoxScoreMatchupsDataset(Dataset):
    """"""
    def __init__(self, matchup_features, targets):
        self.targets = targets
        self.matchup_data = matchup_features
        
    def __len__(self):
        return len(self.matchup_data)
    
    def __getitem__(self, index):
        X = self.matchup_data[index]
        y = self.targets[index]
        
        return X, y
    

In [153]:
# Hyperparameters
batch_size = 16

# Create DataSet and DataLoaders

training_set = PlayerBoxScoreMatchupsDataset(X_train, y_train)
validation_set = PlayerBoxScoreMatchupsDataset(X_test, y_test)

train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# print(data, target)
# data = torch.rand(1, 2, 10, 34)
# data

In [154]:
data, target = next(iter(train_loader))
print(data.shape, target.shape)

torch.Size([16, 2, 10, 36]) torch.Size([16, 1])


In [195]:
batch_size = 40

X_train_tensor = torch.from_numpy(X_train).float()[:batch_size]
y_train_tensor = torch.from_numpy(y_train).float()[:batch_size]
X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).float()

print("X_train_tensor shape:", X_train_tensor.shape,
      "\nX_test_tensor shape:", X_test_tensor.shape,
      "\ny_test_tensor shape:", y_train_tensor.shape,
      "\ny_train_tensor shape:", y_test_tensor.shape)

X_train_tensor shape: torch.Size([40, 2, 10, 36]) 
X_test_tensor shape: torch.Size([4883, 2, 10, 36]) 
y_test_tensor shape: torch.Size([40, 1]) 
y_train_tensor shape: torch.Size([4883, 1])


In [196]:
torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F

batch_size = 40

X_train_tensor = torch.from_numpy(X_train).float()[:batch_size]
y_train_tensor = torch.from_numpy(y_train).float()[:batch_size]

print("X_train_tensor shape:", X_train_tensor.shape,
      "\nX_test_tensor shape:", X_test_tensor.shape)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv1d(2, 2, kernel_size=(10,1), stride=1)
#         self.conv1_bn = nn.BatchNorm2d(2)
        
        self.fc1 = nn.Linear(72, 32)
#         self.fc1_bn = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(32, 1)
# # #         self.fc2_bn = nn.BatchNorm1d(16)
#         self.fc3 = nn.Linear(32, 1)
        
#         self.dropout = nn.Dropout(0.1)
        
        
    def forward(self, x):
        
        x = F.tanh(self.conv1(x))
#         print("shape after conv1:", x.shape)
        x = x.reshape(-1, 72)
#         x = self.dropout(x)
        x = F.tanh(self.fc1(x))
#         x = self.dropout(x)
        x = self.fc2(x)
# # #         x = self.dropout(x)
#         x = self.fc3(x)
        
        return x
    

import torch.optim as optim

net = Net()
loss_function = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.1)

epochs = 100
batch_size = 64

train_losses, test_losses = [], []

for e in range(epochs):
    train_loss = 0
    test_loss = 0
    net.train()
    
    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]
        
        net.zero_grad()
        
        
# #     for i, (data, target) in enumerate(train_loader):
#     optimizer.zero_grad()

        output = net(batch_X)
# #     print("output:", output)
# #     print("target:", target)

        loss= loss_function(output, batch_y)

#     if e == 49:
# #         print("input 1", data[0][0][0], "\ninput 2:", data[1][0][0])
#         print("outputs:", output.flatten(), "output shape:", output.shape)
#         print("targets:", target.flatten(), "target shape:", target.shape)
#         print("loss:", loss)

        loss.backward()
        optimizer.step()
    print(loss)
print(output, batch_y)
#     train_loss += loss.item()
#     print(train_loss)

#     else:
#         with torch.no_grad():
#             net.eval()
#             for data, target in val_loader:
#                 output = net(data.float())
#                 loss = loss_function(output, target.float())

#                 test_loss += loss.item()

#     train_losses.append(train_loss / len(train_loader))
#     test_losses.append(test_loss / len(val_loader))

#     print('Epoch: {} \tTraining Loss: {:.4f} \tValidation Loss: {:.4f}'.format(e, train_loss, test_loss))
    


tensor(213.1673, grad_fn=<MseLossBackward>)
tensor(229.0601, grad_fn=<MseLossBackward>)
tensor(210.5667, grad_fn=<MseLossBackward>)
tensor(213.9592, grad_fn=<MseLossBackward>)
tensor(211.8374, grad_fn=<MseLossBackward>)
tensor(208.3643, grad_fn=<MseLossBackward>)
tensor(207.9284, grad_fn=<MseLossBackward>)
tensor(209.0499, grad_fn=<MseLossBackward>)
tensor(210.0125, grad_fn=<MseLossBackward>)
tensor(209.7631, grad_fn=<MseLossBackward>)
tensor(208.7255, grad_fn=<MseLossBackward>)
tensor(207.9297, grad_fn=<MseLossBackward>)
tensor(207.9661, grad_fn=<MseLossBackward>)
tensor(208.5643, grad_fn=<MseLossBackward>)
tensor(208.9921, grad_fn=<MseLossBackward>)
tensor(208.8299, grad_fn=<MseLossBackward>)
tensor(208.2889, grad_fn=<MseLossBackward>)
tensor(207.8820, grad_fn=<MseLossBackward>)
tensor(207.9131, grad_fn=<MseLossBackward>)
tensor(208.2384, grad_fn=<MseLossBackward>)
tensor(208.4673, grad_fn=<MseLossBackward>)
tensor(208.3708, grad_fn=<MseLossBackward>)
tensor(208.0688, grad_fn=<MseLos

In [215]:
torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

training_size = 39

X_train_tensor = torch.from_numpy(X_train).float()[:training_size]
y_train_tensor = torch.from_numpy(y_train).float()[:training_size]

print("X_train_tensor shape:", X_train_tensor.shape,
      "\nX_test_tensor shape:", y_test_tensor.shape,
     "\n")

# Model Architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv1d(2, 2, kernel_size=(10,1), stride=1)       
        self.fc1 = nn.Linear(72, 32)
        self.fc2 = nn.Linear(32, 1)        
#         self.fc3 = nn.Linear(16, 1)        
        
    def forward(self, x):
        
        x = F.tanh(self.conv1(x))
        x = x.reshape(-1, 72)
        x = F.tanh(self.fc1(x))
        x = self.fc2(x)     
#         x = self.fc3(x)
        return x
    

# Instantiating Model
net = Net()
loss_function = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.1)
batch_size=39
epochs = 300

# Training Loop
for e in range(epochs):
    net.train()
    
    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]
        
        net.zero_grad()

        output = net(batch_X)


        loss= loss_function(output, batch_y)

        loss.backward()
        optimizer.step()
        
        if e == 299:
            print("model predictions:\n", output.detach().numpy().flatten(), "\nGround Truth:\n", batch_y.detach().numpy().flatten())   

    if e %10 == 0:
        print("loss:", loss)
        
        

X_train_tensor shape: torch.Size([39, 2, 10, 36]) 
X_test_tensor shape: torch.Size([4883, 1]) 

loss: tensor(217.2352, grad_fn=<MseLossBackward>)
loss: tensor(175.0472, grad_fn=<MseLossBackward>)
loss: tensor(140.7588, grad_fn=<MseLossBackward>)
loss: tensor(89.4915, grad_fn=<MseLossBackward>)
loss: tensor(53.4941, grad_fn=<MseLossBackward>)
loss: tensor(31.1177, grad_fn=<MseLossBackward>)
loss: tensor(24.9193, grad_fn=<MseLossBackward>)
loss: tensor(20.5059, grad_fn=<MseLossBackward>)
loss: tensor(17.9696, grad_fn=<MseLossBackward>)
loss: tensor(16.7175, grad_fn=<MseLossBackward>)
loss: tensor(15.7527, grad_fn=<MseLossBackward>)
loss: tensor(14.9803, grad_fn=<MseLossBackward>)
loss: tensor(14.3307, grad_fn=<MseLossBackward>)
loss: tensor(13.7107, grad_fn=<MseLossBackward>)
loss: tensor(12.4485, grad_fn=<MseLossBackward>)
loss: tensor(11.5541, grad_fn=<MseLossBackward>)
loss: tensor(10.7076, grad_fn=<MseLossBackward>)
loss: tensor(10.0028, grad_fn=<MseLossBackward>)
loss: tensor(37.453

In [210]:
batch_y.mean()

tensor(2.1000)

In [117]:
for data, label, in train_loader:
    print(net(data.float()))
    print(label)

tensor([[-0.3911],
        [-0.2481],
        [-0.0075],
        [-0.2919],
        [-0.4059],
        [-0.3946],
        [-0.2501],
        [-0.4465],
        [-0.3268],
        [-0.4537],
        [-0.1429],
        [-0.0252],
        [-0.3113],
        [-0.4883],
        [-0.0760],
        [-0.0150],
        [-0.3851],
        [-0.3224],
        [-0.1760],
        [-0.3401],
        [-0.3434],
        [-0.3318],
        [-0.2776],
        [-0.3213],
        [-0.3243],
        [-0.1160],
        [-0.2266],
        [-0.0481],
        [-0.2054],
        [-0.3500],
        [-0.4690],
        [-0.2041]], grad_fn=<AddmmBackward>)
tensor([[ 15],
        [-10],
        [-21],
        [ 20],
        [ -8],
        [-13],
        [ 25],
        [ -2],
        [  9],
        [ 15],
        [  7],
        [ 11],
        [-11],
        [ -7],
        [ -8],
        [ -3],
        [ -3],
        [-18],
        [ -5],
        [  6],
        [ 12],
        [ 21],
        [  7],
        [-26],
      

In [None]:
print()

In [352]:
 np.empty((1,0))

array([], shape=(1, 0), dtype=float64)

In [648]:
preds = np.empty((1,0))
for data, target in val_loader:
    
    output = net(data.float())
    preds  = np.append(preds, output.detach().numpy().T)

    
    
preds
    

array([ 0.81842214,  0.4361662 ,  0.91527337, ..., -0.16868234,
        0.06047469,  0.06885672])

In [649]:
pd.DataFrame(preds)

Unnamed: 0,0
0,0.818422
1,0.436166
2,0.915273
3,-0.379051
4,-0.287050
...,...
2280,0.641530
2281,0.032958
2282,-0.168682
2283,0.060475


## Gather Betting Data

In [359]:
# read betting data
betting_data = pd.read_csv('./data/nba_betting_data_2010_present.csv')
betting_data = betting_data.dropna()
    
def clean_bet_data(df = betting_data):

    
    df['date'] = pd.to_datetime(df['date'])
    df['home_team_abbr'] = df['home_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'
                                                                                        }
                                                                             )
    df['away_team_abbr'] = df['away_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'}
                                                                           )

    teams = df['home_team_abbr'].unique()

    df = df.sort_values(['date'])
    df['rest'] = np.nan
    for team in teams:
        team_data = df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team)]
        team_data['rest'] = (team_data['date'] - team_data['date'].shift(1)) / np.timedelta64(1, 'D')
        df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team), 'rest'] = team_data['rest']


    df['point_diff'] = df['home_score'] - df['away_score']
    df['point_total'] = df['home_score'] + df['away_score']
    df['covered_spread'] = (df['home_score'] + df['spread'] > df['away_score']).astype(int)
    df['over'] = (df['point_total'] > df['total']).astype(int)
    
    df['prev_cover'] = df['covered_spread'].shift(1)
    df['prev2_cover'] = df['covered_spread'].shift(2)
    
    relevant_betting = df[['date', 'home_team_abbr',  'away_team_abbr',
                           'home_score', 'away_score', 'point_diff', 
                           'rest', 'point_total', 'spread', 'total',
                          'covered_spread', 'over', 'prev_cover', 'prev2_cover']]

    return relevant_betting

bet_data_clean = clean_bet_data()

array([ 12, -19,  12, ...,   1,  11,   8], dtype=int64)

In [360]:
bet_data_clean[['']]

Unnamed: 0,date,home_team_abbr,away_team_abbr,home_score,away_score,point_diff,rest,point_total,spread,total,covered_spread,over,prev_cover,prev2_cover
1012,2011-12-25,NYK,BOS,106,104,2,,210,-5.0,190.5,0,1,,
1013,2011-12-25,DAL,MIA,94,105,-11,,199,4.5,188.5,0,1,0.0,
1014,2011-12-25,LAL,CHI,87,88,-1,,175,4.5,183.5,1,0,0.0,0.0
1015,2011-12-25,OKC,ORL,97,89,8,,186,-7.5,194.5,1,0,1.0,0.0
1016,2011-12-25,GSW,LAC,86,105,-19,,191,6.0,207.5,0,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11935,2021-02-04,LAL,DEN,114,93,21,4.0,207,-5.0,217.0,1,0,1.0,1.0
11934,2021-02-04,MEM,HOU,103,115,-12,1.0,218,-3.5,226.0,0,0,1.0,1.0
11933,2021-02-04,PHI,POR,105,121,-16,1.0,226,-10.0,223.0,0,1,0.0,1.0
11932,2021-02-04,DAL,GSW,116,147,-31,2.0,263,-4.0,229.5,0,1,0.0,0.0
