# Convolutional Neural Network to Predict NBA Games

## Imports

In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

from tqdm import tqdm
from nba_api.stats.static import players, teams
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.endpoints import leaguegamelog

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error

import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

pd.options.display.max_columns=200

In [22]:
player_gls_scoring_03 = pd.read_csv('../data/scoring_player_boxscores/player_scoring_boxscores_2003-04.csv')
player_gls_scoring_03.loc[player_gls_scoring_03['PCT_PTS_2PT'].isnull()]

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,START_POSITION,COMMENT,MIN,PCT_FGA_2PT,PCT_FGA_3PT,PCT_PTS_2PT,PCT_PTS_2PT_MR,PCT_PTS_3PT,PCT_PTS_FB,PCT_PTS_FT,PCT_PTS_OFF_TOV,PCT_PTS_PAINT,PCT_AST_2PM,PCT_UAST_2PM,PCT_AST_3PM,PCT_UAST_3PM,PCT_AST_FGM,PCT_UAST_FGM
11,20300002,1610612756,PHX,Phoenix,2571,Leandro Barbosa,,DNP - Coach's Decision,,,,,,,,,,,,,,,,
21,20300002,1610612759,SAS,San Antonio,1049,Shane Heal,,DND - Sore Lower Back,,,,,,,,,,,,,,,,
22,20300002,1610612759,SAS,San Antonio,2225,Tony Parker,,DND - sprained left ankle,,,,,,,,,,,,,,,,
23,20300002,1610612759,SAS,San Antonio,788,Kevin Willis,,DNP - Coach's Decision,,,,,,,,,,,,,,,,
35,20300001,1610612748,MIA,Miami,2602,Jerome Beasley,,DNP - Coach's Decision,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30255,40300402,1610612747,LAL,Los Angeles,935,Bryon Russell,,DNP - Coach's Decision,,,,,,,,,,,,,,,,
30267,40300403,1610612747,LAL,Los Angeles,296,Rick Fox,,DNP - Coach's Decision,,,,,,,,,,,,,,,,
30291,40300404,1610612747,LAL,Los Angeles,2567,Brian Cook,,DNP - Coach's Decision,,,,,,,,,,,,,,,,
30303,40300404,1610612765,DET,Detroit,2246,Mehmet Okur,,DNP - Coach's Decision,,,,,,,,,,,,,,,,


## Gathering Player Data

In [24]:
seasons = ["200{}-0{}".format(x, x+1) if x!=9 else "200{}-{}".format(x, x+1) for x in range(0, 10)]
seasons2 = ["20{}-{}".format(x, x+1) for x in range(10, 20)]
seasons.extend(seasons2)
print(seasons)

def gather_data(seasons = seasons):
    """This function pulls player boxscores from 2000-current season and merges them into one dataframe"""
    full_player_data = []
    for season in seasons:
        player_gls = pd.read_csv('../data/basic_player_boxscores/player_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_adv = pd.read_csv('../data/advanced_player_boxscores/player_advanced_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
        player_gls_scoring = pd.read_csv('../data/scoring_player_boxscores/player_scoring_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'int'})
#         player_gls_tracking = pd.read_csv('./data/player_tracking_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'object'})
    
        print("player_gls_shape:", player_gls.shape,
             "player_gls_adv_shape:", player_gls_adv.shape,
             "player_gls_scoring_shape:", player_gls_scoring.shape)
    
        player_gls = player_gls[['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID',
       'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK',
       'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']]

        player_full = pd.merge(player_gls, player_gls_adv, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
        player_full = pd.merge(player_full, player_gls_scoring, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
    
        print("after_merging shape", player_full.shape)
        
        player_full = player_full.dropna(subset=['MIN'])

        # drop unnecessary columns
        player_full = player_full.drop(columns = ['TEAM_ID', 'TEAM_NAME', 'TEAM_ID_x',
                         'TEAM_ABBREVIATION_x','TEAM_CITY','PLAYER_NAME_x','START_POSITION',
                          'COMMENT','MIN_x','TEAM_ID_x', 'TEAM_ABBREVIATION_x','TEAM_CITY_x', 
                          'PLAYER_NAME_x', 'START_POSITION_x', 'COMMENT_x', 'MIN_x', 
                         'TEAM_ID_x', 'TEAM_ABBREVIATION_x', 'TEAM_CITY_x', 'PLAYER_NAME_x', 
                          'START_POSITION_x', 'COMMENT_x', 'MIN_x', 'AST_x', 'FG_PCT_x',
                         'FG_PCT', 'FG3_PCT', 'FT_PCT', 'E_NET_RATING', 'NET_RATING', 
                         'AST_PCT', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'EFG_PCT', 'TS_PCT', 
                         'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_FT', 'CFG_PCT',
                        'UFG_PCT', 'DFG_PCT'], 
                       errors= 'ignore')
        

        # Convert date to datetime
        player_full['GAME_DATE'] = pd.to_datetime(player_full['GAME_DATE'])
        
        # Convert W/L to a binary 1/0 for win/loss
        player_full['WL'] = (player_full['WL'].str[0] == 'W').astype(int)
        
        # Add a binary home_game feature
        player_full['HOME_GAME'] = 0 
        player_full['HOME_GAME'] = (player_full['MATCHUP'].str[:6] == player_full['TEAM_ABBREVIATION'].str[:] + ' vs').astype(int)
        
        # convert the percentages into raw numbers (because we will be averaging them, we don't want to average percentages)
        # for example, percentage of points from midrange will be points scored from midrange
        player_full['FG2M'] = player_full['FGM'] - player_full['FG3M']
        player_full['FG2A'] = player_full['FGA'] - player_full['FG3A']
        player_full['PTS_2PT'] = (player_full['PTS'] * player_full['PCT_PTS_2PT'])# .astype(float)
        player_full['PTS_2PT_MR'] = (player_full['PTS'] * player_full['PCT_PTS_2PT_MR']) #.astype(float)
        player_full['PTS_3PT'] = (player_full['PTS'] * player_full['PCT_PTS_3PT'])#.astype('int8')
        player_full['PTS_FB'] = (player_full['PTS'] * player_full['PCT_PTS_FB'])#.astype('int8')
        player_full['PTS_OFF_TOV'] = (player_full['PTS'] * player_full['PCT_PTS_OFF_TOV'])#.astype('int8')
        player_full['PTS_PAINT'] = (player_full['PTS'] * player_full['PCT_PTS_PAINT'])#.astype('int8')
        player_full['AST_2PM'] = (player_full['FG2M'] * player_full['PCT_AST_2PM'])#.astype('int8')
        player_full['UAST_2PM'] = (player_full['FG2M'] * player_full['PCT_UAST_2PM'])#.astype('int8')
        player_full['AST_3PM'] = (player_full['FG3M'] * player_full['PCT_AST_3PM'])#.astype('int8')
        player_full['UAST_3PM'] = (player_full['FG3M'] * player_full['PCT_UAST_3PM'])#.astype('int8')



        player_full = player_full.drop(columns = ['PCT_PTS_2PT', 'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 
                                                  'PCT_PTS_FB', 'PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT',
                                                  'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM',
                                                  'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM', 
                                                  'FGM', 'FGA', 'AST_TOV', 'USG_PCT', 'PACE',
                                                  'PACE_PER40', ], errors='ignore')
        
        
        full_player_data.append(player_full)
        
        
    
    player_df = pd.concat(full_player_data)
    
    
    player_df['home_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 1 else row['MATCHUP'][-3:], axis=1)
    player_df['away_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 0 else row['MATCHUP'][-3:], axis=1)
    
    player_df[['home_team_abbr', 'away_team_abbr']] = player_df[['home_team_abbr', 'away_team_abbr']].replace({'NOH':'NOP',
                                                                                                               'NJN':'BKN'})
    
    # Reorder columns
    player_df = player_df[['SEASON_YEAR', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'PLAYER_ID', 
                           'home_team_abbr', 'away_team_abbr', 'GAME_ID', 
                           'GAME_DATE', 'MATCHUP', 'WL', 'HOME_GAME', 'MIN',
                           'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB',
                           'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD',
                           'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'E_DEF_RATING',
                           'AST_RATIO', 'TM_TOV_PCT', 'E_USG_PCT',
                           'E_PACE', 'POSS', 'PIE',
                           'FG2M', 'FG2A', 'PTS_2PT_MR', 'PTS_FB', 
                           'PTS_OFF_TOV', 'PTS_PAINT', 'AST_2PM',
                           'UAST_2PM', 'AST_3PM', 'UAST_3PM']]
    
    return player_df

player_df = gather_data(seasons)

['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20']
player_gls_shape: (25367, 64) player_gls_adv_shape: (30140, 31) player_gls_scoring_shape: (30140, 24)
after_merging shape (25367, 83)
player_gls_shape: (25277, 64) player_gls_adv_shape: (30125, 31) player_gls_scoring_shape: (30125, 24)
after_merging shape (25277, 83)
player_gls_shape: (25706, 64) player_gls_adv_shape: (30722, 31) player_gls_scoring_shape: (30722, 24)
after_merging shape (25706, 83)
player_gls_shape: (25548, 64) player_gls_adv_shape: (30328, 31) player_gls_scoring_shape: (30328, 24)
after_merging shape (25548, 83)
player_gls_shape: (26602, 64) player_gls_adv_shape: (31472, 31) player_gls_scoring_shape: (31472, 24)
after_merging shape (26602, 83)
player_gls_shape: (26637, 64) player_gls_adv_shape: (31647, 31) player_gls_scoring_shape: (31647, 24)
after_m

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [39]:
player_df.loc[player_df['PLAYER_NAME'].isnull()]

player_df.loc[21523, 'PLAYER_NAME'] = "Charles Cooke"
player_df.loc[21527, 'PLAYER_NAME'] = "Jalen Jones"
player_df.loc[21542, 'PLAYER_NAME'] = "Lucas Nogueira"
player_df.loc[21547, 'PLAYER_NAME'] = "Alfonzo McKinnie"
player_df.loc[21561, 'PLAYER_NAME'] = "Fred VanVleet"
player_df.loc[21578, 'PLAYER_NAME'] ="Pascal Siakam"
player_df.loc[21592, 'PLAYER_NAME'] = "E'Twaun Moore"
player_df.loc[21594, 'PLAYER_NAME'] = 'Darius Miller'
player_df.loc[21625, 'PLAYER_NAME'] = 'Jameer Nelson'
player_df.loc[21642, 'PLAYER_NAME'] = 'Delon Wright'
player_df.loc[21654, 'PLAYER_NAME'] = 'Jakob Poeltl'
player_df.loc[21707, 'PLAYER_NAME'] = 'Cheick Diallo'

player_df.loc[player_df['PLAYER_NAME'].isnull()]


Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
21687,2017-18,TOR,,101139,NOP,TOR,21700211,2017-11-15,TOR @ NOP,1,0,18.583333,5,9,0,0,1,2,1,1,0,0,0,0,0,17,13,129.7,92.2,7.7,7.7,0.284,103.99,39.0,0.172,1,2,0.0,2.992,2.006,2.006,0.0,1.0,5.0,0.0


In [46]:
player_df = player_df.dropna()

In [47]:
def aggregate_player_stats(df = player_df):

    seasons = player_df['SEASON_YEAR'].unique()

    df = df.sort_values('GAME_DATE')

    for season in seasons:
        season_df = df.loc[df['SEASON_YEAR'] == season]
        player_ids = season_df['PLAYER_ID'].unique()
        for p_id in tqdm(player_ids, desc='Progress'):
            player_log = season_df.loc[season_df['PLAYER_ID'] == p_id]

            avg_player_data = player_log.iloc[:, 11:].shift(1).rolling(10, min_periods=5).mean()

            cols = avg_player_data.columns

            df.loc[(df['SEASON_YEAR'] == season) & (df['PLAYER_ID'] == p_id), cols] = avg_player_data

    df.to_csv('player_avg_last10.csv', index=False)
        
    return df
    
player_stats = aggregate_player_stats(df=player_df)

Progress: 100%|██████████████████████████████████████████████████████████████████████| 441/441 [01:09<00:00,  6.33it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 440/440 [01:11<00:00,  6.13it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 428/428 [01:05<00:00,  6.54it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 442/442 [01:08<00:00,  6.47it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 464/464 [01:12<00:00,  6.40it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 458/458 [01:13<00:00,  6.27it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 458/458 [01:13<00:00,  6.23it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████████| 451/451 [01:10<00:00,  6.40it/s]
Progress: 100%|█████████████████████████

In [70]:
player_stats.loc[player_stats['GAME_ID']==21000075].sort_values('MIN', ascending=False)

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,FG2M,FG2A,PTS_2PT_MR,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
23588,2010-11,MIA,LeBron James,2544,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,34.323333,1.0,3.4,6.2,8.0,0.0,4.6,7.2,5.2,1.4,0.8,0.0,1.8,4.4,20.4,14.0,109.96,83.66,24.92,16.64,0.3224,94.648,66.8,0.1922,5.6,10.6,4.0006,4.8004,5.1968,7.2038,1.801,3.799,1.0,0.0
23519,2010-11,MIA,Chris Bosh,2547,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.736667,0.0,0.2,3.8,4.6,1.0,5.4,1.6,0.8,1.0,1.0,0.8,2.2,2.8,13.0,15.0,117.26,87.02,10.52,4.7,0.2164,92.952,60.6,0.1264,4.6,10.6,4.3996,0.0,2.201,4.8006,2.7998,1.8002,0.0,0.0
23568,2010-11,MIA,Dwyane Wade,2548,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,31.570667,1.0,2.6,5.8,8.0,1.2,3.8,4.2,3.0,1.6,1.0,1.0,2.8,5.4,22.4,10.8,113.96,90.06,15.52,10.78,0.3498,93.192,60.0,0.1774,6.8,13.8,1.6024,5.0016,6.0002,12.0014,2.1996,4.6004,0.2,0.8
23721,2010-11,MIA,James Jones,2592,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,27.129,3.6,6.8,0.0,0.0,0.2,2.2,0.8,0.0,0.6,0.4,0.0,2.0,1.2,11.6,6.2,105.4,91.18,10.04,0.0,0.1364,94.588,53.6,0.1214,0.4,1.0,0.8012,1.1984,2.4006,0.0,0.4,0.0,3.6,0.0
23678,2010-11,MIA,Carlos Arroyo,2306,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,23.359667,0.4,0.8,1.2,1.4,0.4,3.2,2.4,1.0,0.6,0.0,0.0,1.4,0.8,7.2,13.4,111.9,82.1,23.26,9.62,0.1394,91.746,43.8,0.1214,2.4,4.6,4.4012,0.3996,0.9996,0.3996,1.6,0.8,0.4,0.0
23585,2010-11,MIA,Udonis Haslem,2617,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.872,0.0,0.0,1.6,2.0,1.8,7.0,0.2,0.8,0.4,0.4,0.4,3.6,2.4,7.2,2.8,98.2,91.04,2.22,12.52,0.151,96.178,45.8,0.1274,2.8,5.8,3.9988,0.8004,1.2,1.6008,2.4,0.4,0.0,0.0
23741,2010-11,MIA,Eddie House,2067,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,22.689667,2.0,3.2,1.0,1.2,0.0,1.8,1.8,0.8,1.4,0.0,0.2,1.4,0.6,9.0,-0.2,98.72,98.4,19.54,7.78,0.1596,93.754,44.4,0.1304,1.0,3.4,1.9994,1.0,0.6,0.0,0.6,0.4,1.8,0.2
23510,2010-11,MIA,Joel Anthony,201202,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,20.329333,0.0,0.0,0.6,1.2,1.2,3.2,0.6,0.4,0.4,1.4,0.0,3.4,1.0,1.4,7.8,101.32,76.84,20.0,30.0,0.0406,91.318,37.6,0.054,0.4,0.6,0.0,0.4,0.2,0.8,0.2,0.2,0.0,0.0
23739,2010-11,MIA,Zydrunas Ilgauskas,980,NOP,MIA,21000075,2010-11-05,MIA @ NOH,0,0,14.649667,0.0,0.0,0.8,1.2,2.2,2.0,0.6,0.6,0.6,0.8,0.0,3.6,1.0,6.0,10.2,127.04,85.52,7.3,15.56,0.19,97.444,29.8,0.0874,2.6,4.8,3.6004,0.0,0.7996,1.5996,2.0002,0.5998,0.0,0.0
23577,2010-11,NOH,Chris Paul,101108,NOP,MIA,21000075,2010-11-05,NOH vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
player_stats1 = pd.read_csv('player_l10_avg_part1.csv', parse_dates=['GAME_DATE'])
player_stats2 = pd.read_csv('player_l10_avg_part2.csv', parse_dates=['GAME_DATE'])

player_stats = pd.concat([player_stats1, player_stats2])

In [None]:
player_stats = player_stats.sort_values('GAME_DATE')
player_stats

In [None]:
set(player_stats['GAME_ID'].unique()) - set(all_team_gamelogs['GAME_ID'].unique()) 

In [72]:
player_stats = player_stats.dropna()

In [73]:
seasons = ["200{}-0{}".format(x, x+1) if x!=9 else "200{}-{}".format(x, x+1) for x in range(0, 10)]
seasons2 = ["20{}-{}".format(x, x+1) for x in range(10, 20)]
seasons.extend(seasons2)
print(seasons)

def prep_data_for_model(df = player_stats):
    matchup_info = []
    matchups = []
    targets = []
    
    all_gamelogs = []
    for season in seasons:
        gl = pd.read_csv('../data/basic_team_boxscores/team_gamelogs_{}.csv'.format(season))
        all_gamelogs.append(gl)

    all_team_gamelogs = pd.concat(all_gamelogs)
    game_results = all_team_gamelogs.loc[all_team_gamelogs['MATCHUP'].str.contains('vs'), ['GAME_ID', 'PLUS_MINUS']]

    game_ids = set(player_stats['GAME_ID'].unique()) & set(all_team_gamelogs['GAME_ID'].unique()) 
    
    for game_id in tqdm(game_ids, desc="progress"):
        game_df = df.loc[df['GAME_ID'] == game_id]
   
        home = game_df.loc[game_df['HOME_GAME'] == 1].sort_values('MIN', ascending=False)
        home_stats = home.iloc[:10, 11:].values
        
        away = game_df.loc[game_df['HOME_GAME'] == 0].sort_values('MIN', ascending=False)
        away_stats = away.iloc[:10, 11:].values
                
        # If a team had one of their game's removed (because it was in the first 5 games of the season), remove the entire game
        if (home_stats.shape[0] == 0) or (away_stats.shape[0] == 0):
            continue
        
        if home_stats.shape[0] < 10:
            missing_rows = 10 - home_stats.shape[0]
            padding = np.zeros((missing_rows, home_stats.shape[1]))
            home_stats = np.concatenate([home_stats, padding], axis=0)
        
        if away_stats.shape[0] < 10:
            missing_rows = 10 - away_stats.shape[0]
            padding = np.zeros((missing_rows, away_stats.shape[1]))
            away_stats = np.concatenate([away_stats, padding], axis=0)

        matchup = np.stack([home_stats, away_stats], axis=0)
        matchups.append(matchup)
        
        point_diff = game_results.loc[game_results['GAME_ID'] == game_id, 'PLUS_MINUS'].values[0]
        targets.append(point_diff)
        
        matchup_info.append(home[['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE', 'WL']].iloc[0].values)
        
    matchup_features = np.stack(matchups, axis=0)
    matchup_info = pd.DataFrame(matchup_info, columns=['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE', 'WL'])
    
    return matchup_info, matchup_features, np.array(targets).reshape(-1, 1)

matchup_info, matchup_features, targets = prep_data_for_model(df = player_stats)

['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20']


progress: 100%|█████████████████████████████████████████████████████████████████| 24275/24275 [02:05<00:00, 193.08it/s]


In [74]:
matchup_info.shape, matchup_features.shape, targets.shape

((24089, 6), (24089, 2, 10, 34), (24089, 1))

In [75]:
np.isnan(matchup_features).sum()

0

In [50]:
pd.DataFrame(matchup_info, columns=['SEASON_YEAR', 'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE', 'WL'])

Unnamed: 0,SEASON_YEAR,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,WL
0,2015-16,ATL,DET,21500001,2015-10-27,0
1,2015-16,CHI,CLE,21500002,2015-10-27,1
2,2015-16,GSW,NOP,21500003,2015-10-27,1
3,2015-16,ORL,WAS,21500004,2015-10-28,0
4,2015-16,BOS,PHI,21500005,2015-10-28,1
...,...,...,...,...,...,...
25667,2005-06,DAL,MIA,40500402,2006-06-11,1
25668,2005-06,MIA,DAL,40500403,2006-06-13,1
25669,2005-06,MIA,DAL,40500404,2006-06-15,1
25670,2005-06,MIA,DAL,40500405,2006-06-18,1


In [51]:
home_cols = ["home_"+x for x in player_stats.columns[11:]]
away_cols = ["away_"+x for x in player_stats.columns[11:]]

cols = home_cols
pd.DataFrame(matchup_features[1011][0], columns=cols)


Unnamed: 0,home_MIN,home_FG3M,home_FG3A,home_FTM,home_FTA,home_OREB,home_DREB,home_AST,home_TOV,home_STL,home_BLK,home_BLKA,home_PF,home_PFD,home_PTS,home_PLUS_MINUS,home_E_OFF_RATING,home_E_DEF_RATING,home_AST_RATIO,home_TM_TOV_PCT,home_E_USG_PCT,home_E_PACE,home_POSS,home_PIE,home_FG2M,home_FG2A,home_PTS_2PT_MR,home_PTS_FB,home_PTS_OFF_TOV,home_PTS_PAINT,home_AST_2PM,home_UAST_2PM,home_AST_3PM,home_UAST_3PM
0,35.477833,1.1,3.8,3.8,4.2,0.6,3.5,4.3,2.3,1.9,0.4,0.8,2.4,3.3,17.1,-4.3,101.13,105.85,19.62,10.25,0.2291,100.116,73.4,0.1073,5.0,10.5,3.3997,4.0016,3.7013,6.601,1.1003,3.8997,0.7,0.4
1,34.194333,2.6,6.5,4.7,5.3,1.4,5.1,4.4,2.5,2.2,0.5,1.2,2.9,4.2,22.9,-3.8,102.19,107.52,16.12,9.09,0.3011,99.896,70.4,0.1399,5.2,11.9,5.3997,2.1988,2.3003,5.0035,2.0007,3.1993,1.9,0.7
2,33.296667,1.4,4.6,2.1,2.5,1.5,3.4,4.0,1.3,0.5,0.2,0.5,2.3,2.1,10.5,-2.2,101.6,107.03,24.57,8.09,0.1683,96.293,66.3,0.0769,2.1,5.3,2.0014,1.9,1.9002,2.1995,0.7002,1.3998,1.2,0.2
3,26.556333,0.2,0.6,1.6,1.8,0.9,4.2,0.5,1.1,0.5,1.1,0.2,2.9,2.0,10.0,-3.4,96.37,103.89,4.61,8.1,0.205,97.783,54.0,0.0583,3.9,9.6,3.2009,0.1998,0.5995,4.6002,3.3002,0.5998,0.2,0.0
4,20.2635,0.0,0.0,1.6,1.8,1.7,3.6,0.9,1.3,0.1,0.6,0.4,2.3,1.3,10.2,-2.7,103.21,107.53,8.69,12.37,0.2358,93.755,39.7,0.1202,4.3,8.1,2.6007,0.0,1.3997,6.0003,2.6998,1.6002,0.0,0.0
5,18.656667,0.0,0.0,0.8,1.1,2.3,3.2,1.2,0.6,0.2,0.9,0.2,2.4,0.9,4.6,0.0,106.02,104.96,18.51,10.43,0.1084,95.309,37.0,0.0959,1.9,3.3,1.0002,0.2,0.8998,2.7998,1.3003,0.5997,0.0,0.0
6,17.593167,0.4,1.1,1.9,2.4,0.1,1.6,1.8,1.0,0.7,0.1,0.3,0.7,1.9,7.1,-1.3,102.19,106.34,16.93,10.98,0.1982,95.545,35.2,0.099,2.0,4.5,1.9995,0.6992,0.5997,2.0001,1.0,1.0,0.3,0.1
7,16.465333,1.3,4.4,0.7,1.1,0.2,1.6,0.7,1.5,0.3,0.3,0.1,1.6,1.3,9.0,-3.5,96.26,111.16,5.56,14.47,0.28,95.53,33.1,0.0313,2.2,4.2,1.6002,0.8995,1.2001,2.8,1.2002,0.9998,1.1,0.2
8,16.408833,0.4,1.2,0.5,0.6,0.8,2.3,1.1,0.3,0.6,0.2,0.2,1.0,0.5,3.7,2.4,105.73,92.74,17.61,7.78,0.0848,96.392,32.7,0.1267,1.0,1.5,0.2,0.4,1.1001,1.8001,0.6,0.4,0.4,0.0
9,11.575833,0.3,1.2,0.3,0.4,0.0,1.3,1.5,0.9,0.2,0.0,0.0,1.2,0.5,4.2,-1.5,96.11,102.01,17.48,16.21,0.2441,94.088,23.0,0.0429,1.5,3.7,1.4,0.7,0.8,1.6,0.3,1.2,0.2,0.1


In [82]:
from sklearn.preprocessing import StandardScaler
# Split up data - training set and testing set
X_train, X_test, y_train, y_test = train_test_split(matchup_features, targets, test_size=0.2, shuffle=False)

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)


print("X_train shape:", X_train.shape,
      "\nX_test shape:", X_test.shape,
      "\ny_test shape:", y_train.shape,
      "\ny_train shape:", y_test.shape)

X_train shape: (19271, 2, 10, 34) 
X_test shape: (4818, 2, 10, 34) 
y_test shape: (19271, 1) 
y_train shape: (4818, 1)


In [83]:
class PlayerBoxScoreMatchupsDataset(Dataset):
    """"""
    def __init__(self, matchup_features, targets):
        self.targets = targets
        self.matchup_data = matchup_features
        
    def __len__(self):
        return len(self.matchup_data)
    
    def __getitem__(self, index):
        X = self.matchup_data[index]
        y = self.targets[index]
        
        return X, y
    

In [84]:
# Hyperparameters
batch_size = 16

# Create DataSet and DataLoaders

training_set = PlayerBoxScoreMatchupsDataset(X_train, y_train)
validation_set = PlayerBoxScoreMatchupsDataset(X_test, y_test)

train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

# print(data, target)
# data = torch.rand(1, 2, 10, 34)
# data

In [85]:
data, target = next(iter(train_loader))
print(data.shape, target.shape)
data

torch.Size([16, 2, 10, 34]) torch.Size([16, 1])


tensor([[[[35.5953,  1.6000,  3.2000,  ...,  3.0008,  0.9998,  0.6002],
          [30.5140,  0.0000,  1.6000,  ...,  3.6000,  0.0000,  0.0000],
          [29.1373,  2.8000,  6.6000,  ...,  1.0002,  2.4002,  0.3998],
          ...,
          [18.7370,  0.0000,  0.0000,  ...,  0.1998,  0.0000,  0.0000],
          [16.8393,  0.4000,  1.6000,  ...,  1.6002,  0.2000,  0.2000],
          [16.5167,  1.4000,  4.2000,  ...,  2.0002,  0.6000,  0.8000]],

         [[37.6890,  2.8000,  6.8000,  ...,  2.7992,  2.4002,  0.3998],
          [35.4533,  2.2000,  6.0000,  ...,  7.8002,  0.9998,  1.2002],
          [32.8433,  0.0000,  0.6000,  ...,  1.4000,  0.0000,  0.0000],
          ...,
          [18.7067,  1.4000,  2.4000,  ...,  1.1998,  1.0000,  0.4000],
          [12.5740,  1.0000,  2.6000,  ...,  0.0000,  1.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],


        [[[37.4133,  3.2000,  8.6000,  ...,  6.4002,  1.8000,  1.4000],
          [35.2657,  2.8000,

In [55]:
X_train_tensor = torch.from_numpy(X_train).float()[:batch_size]
y_train_tensor = torch.from_numpy(y_train).float()[:batch_size]
X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test).float()

print("X_train_tensor shape:", X_train_tensor.shape,
      "\nX_test_tensor shape:", X_test_tensor.shape,
      "\ny_test_tensor shape:", y_train_tensor.shape,
      "\ny_train_tensor shape:", y_test_tensor.shape)

X_train_tensor shape: torch.Size([16, 2, 10, 34]) 
X_test_tensor shape: torch.Size([5135, 2, 10, 34]) 
y_test_tensor shape: torch.Size([16, 1]) 
y_train_tensor shape: torch.Size([5135, 1])


In [87]:
torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv1d(2, 2, kernel_size=(10,1), stride=1)
#         self.conv1_bn = nn.BatchNorm2d(2)
        
        self.fc1 = nn.Linear(68, 32)
#         self.fc1_bn = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(32, 1)
# # #         self.fc2_bn = nn.BatchNorm1d(16)
#         self.fc3 = nn.Linear(32, 1)
        
#         self.dropout = nn.Dropout(0.1)
        
        
    def forward(self, x):
#         print("original_shape:", x.shape)
        x = F.tanh(self.conv1(x))
#         print("shape after conv1:", x.shape)
        x = x.reshape(-1, 68)
#         x = self.dropout(x)
        x = F.tanh(self.fc1(x))
#         x = self.dropout(x)
        x = self.fc2(x)
# # #         x = self.dropout(x)
#         x = self.fc3(x)
        
        return x
    

import torch.optim as optim

net = Net()
loss_function = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.0001)

epochs = 100
batch_size = 16

train_losses, test_losses = [], []

for e in range(epochs):
    train_loss = 0
    test_loss = 0
    net.train()
    
    net.zero_grad()     
    for i, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()

        output = net(data.float())
# #     print("output:", output)
# #     print("target:", target)

        loss= loss_function(output, target.float())

#     if e == 99:
# #         print("input 1", data[0][0][0], "\ninput 2:", data[1][0][0])
#         print("outputs:", output.flatten(), "output shape:", output.shape)
#         print("targets:", target.flatten(), "target shape:", target.shape)
#         print("loss:", loss)

        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    else:
        with torch.no_grad():
            net.eval()
            for data, target in val_loader:
                output = net(data.float())
                loss = loss_function(output, target.float())

                test_loss += loss.item()

    train_losses.append(train_loss / len(train_loader))
    test_losses.append(test_loss / len(val_loader))

    print('Epoch: {} \tTraining Loss: {:.4f} \tValidation Loss: {:.4f}'.format(e, train_loss, test_loss))
    


Epoch: 0 	Training Loss: 211080.0222 	Validation Loss: 51375.3583
Epoch: 1 	Training Loss: 205486.7094 	Validation Loss: 50086.6072
Epoch: 2 	Training Loss: 198874.8333 	Validation Loss: 48187.3636
Epoch: 3 	Training Loss: 192789.7758 	Validation Loss: 47043.8888
Epoch: 4 	Training Loss: 189716.8616 	Validation Loss: 46527.0694
Epoch: 5 	Training Loss: 188135.7515 	Validation Loss: 46321.3201
Epoch: 6 	Training Loss: 187340.8763 	Validation Loss: 46184.9713
Epoch: 7 	Training Loss: 186798.4702 	Validation Loss: 46089.8523
Epoch: 8 	Training Loss: 186418.7857 	Validation Loss: 46029.5646
Epoch: 9 	Training Loss: 186121.7168 	Validation Loss: 45972.4253
Epoch: 10 	Training Loss: 185680.7474 	Validation Loss: 46081.2385
Epoch: 11 	Training Loss: 185370.8917 	Validation Loss: 46020.0926
Epoch: 12 	Training Loss: 184985.1575 	Validation Loss: 45926.3602
Epoch: 13 	Training Loss: 184504.4623 	Validation Loss: 45559.4410
Epoch: 14 	Training Loss: 183777.7466 	Validation Loss: 45356.0550
Epoch

In [93]:
train_preds = np.array([])
for data, label, in train_loader:
    train_preds = np.concatenate([train_preds, net(data.float()).detach().numpy().flatten()])

train_pred_df = pd.DataFrame({"target":y_train.flatten(), "train_preds":train_preds})
train_pred_df

Unnamed: 0,target,train_preds
0,6,-1.566895
1,19,2.367688
2,5,-0.298211
3,-7,3.699419
4,3,-2.678554
...,...,...
19266,-16,0.029836
19267,7,-2.321544
19268,10,4.397268
19269,-17,12.866269


In [None]:
print()

In [352]:
 np.empty((1,0))

array([], shape=(1, 0), dtype=float64)

In [648]:
preds = np.empty((1,0))
for data, target in val_loader:
    
    output = net(data.float())
    preds  = np.append(preds, output.detach().numpy().T)

    
    
preds
    

array([ 0.81842214,  0.4361662 ,  0.91527337, ..., -0.16868234,
        0.06047469,  0.06885672])

In [649]:
pd.DataFrame(preds)

Unnamed: 0,0
0,0.818422
1,0.436166
2,0.915273
3,-0.379051
4,-0.287050
...,...
2280,0.641530
2281,0.032958
2282,-0.168682
2283,0.060475


## Gather Betting Data

In [359]:
# read betting data
betting_data = pd.read_csv('./data/nba_betting_data_2010_present.csv')
betting_data = betting_data.dropna()
    
def clean_bet_data(df = betting_data):

    
    df['date'] = pd.to_datetime(df['date'])
    df['home_team_abbr'] = df['home_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'
                                                                                        }
                                                                             )
    df['away_team_abbr'] = df['away_team_abbr'].replace({'NY':'NYK',
                                                                            'GS':'GSW',
                                                                            'SA':'SAS',
                                                                            'BK':'BKN',
                                                                            'NO':'NOP',
                                                                            'PHO':'PHX'}
                                                                           )

    teams = df['home_team_abbr'].unique()

    df = df.sort_values(['date'])
    df['rest'] = np.nan
    for team in teams:
        team_data = df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team)]
        team_data['rest'] = (team_data['date'] - team_data['date'].shift(1)) / np.timedelta64(1, 'D')
        df.loc[(df['home_team_abbr'] == team) | (df['away_team_abbr'] == team), 'rest'] = team_data['rest']


    df['point_diff'] = df['home_score'] - df['away_score']
    df['point_total'] = df['home_score'] + df['away_score']
    df['covered_spread'] = (df['home_score'] + df['spread'] > df['away_score']).astype(int)
    df['over'] = (df['point_total'] > df['total']).astype(int)
    
    df['prev_cover'] = df['covered_spread'].shift(1)
    df['prev2_cover'] = df['covered_spread'].shift(2)
    
    relevant_betting = df[['date', 'home_team_abbr',  'away_team_abbr',
                           'home_score', 'away_score', 'point_diff', 
                           'rest', 'point_total', 'spread', 'total',
                          'covered_spread', 'over', 'prev_cover', 'prev2_cover']]

    return relevant_betting

bet_data_clean = clean_bet_data()

array([ 12, -19,  12, ...,   1,  11,   8], dtype=int64)

In [360]:
bet_data_clean[['']]

Unnamed: 0,date,home_team_abbr,away_team_abbr,home_score,away_score,point_diff,rest,point_total,spread,total,covered_spread,over,prev_cover,prev2_cover
1012,2011-12-25,NYK,BOS,106,104,2,,210,-5.0,190.5,0,1,,
1013,2011-12-25,DAL,MIA,94,105,-11,,199,4.5,188.5,0,1,0.0,
1014,2011-12-25,LAL,CHI,87,88,-1,,175,4.5,183.5,1,0,0.0,0.0
1015,2011-12-25,OKC,ORL,97,89,8,,186,-7.5,194.5,1,0,1.0,0.0
1016,2011-12-25,GSW,LAC,86,105,-19,,191,6.0,207.5,0,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11935,2021-02-04,LAL,DEN,114,93,21,4.0,207,-5.0,217.0,1,0,1.0,1.0
11934,2021-02-04,MEM,HOU,103,115,-12,1.0,218,-3.5,226.0,0,0,1.0,1.0
11933,2021-02-04,PHI,POR,105,121,-16,1.0,226,-10.0,223.0,0,1,0.0,1.0
11932,2021-02-04,DAL,GSW,116,147,-31,2.0,263,-4.0,229.5,0,1,0.0,0.0
