In [1]:
# Change directory so that code in "src" directory is easily importable.
import os
os.chdir('../')

In [61]:
import pandas as pd
import numpy as np
import re

from pyathena import connect
from pyathena.pandas_cursor import PandasCursor

from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

from mord import LogisticAT

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score, accuracy_score, balanced_accuracy_score
from sklearn.metrics import make_scorer, SCORERS, f1_score, precision_score, recall_score

import tensorflow as tf
from tensorflow import keras

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Define connection to DB
conn = connect(
    s3_staging_dir='s3://aws-athena-query-results-323906537337-us-east-1/',
    region_name='us-east-1',
    cursor_class=PandasCursor
    )
cursor = conn.cursor()

# 0 Data
## 0.1 Game data
### (1) game stats

In [None]:
simple_query = f'''
WITH team_home_away AS
    (SELECT api.eventmetadata.gamecode,
         teams.teamid,
         if(teams.teamlocationtype.teamlocationtypeid = 1,
         1,
         0) AS ishometeam
    FROM datalakebasketball.api_events api, UNNEST(api.event.teams) t(teams)
    WHERE leagueid = '1'
            AND season >= '2004'
            AND season <= '2018' )
            
SELECT 
        t1.season,
        t1.eventmetadata.gamecode,
         t1.eventmetadata.eventtypeid,
         t1.teamid,
         t1.points,
         t1.opponentteamid,
         t1.pointsconceded,
         t2.ishometeam
FROM datalakebasketball.team_stats_game t1
LEFT JOIN team_home_away t2
    ON t1.eventmetadata.gamecode=t2.gamecode
        AND t1.teamid=t2.teamid
WHERE t1.leagueid = '1'
        AND t1.eventstatus.name = 'Final'
        AND t1.season >= '2004'
        AND t1.season <= '2018'
        AND (t1.eventmetadata.eventtypeid = 1 OR t1.eventmetadata.eventtypeid = 2)
        AND t2.ishometeam = 1
ORDER BY t1.season, t1.eventmetadata.gamecode 
'''

if True:
    game_df = cursor.execute(simple_query).as_pandas()
    print(game_df.info())
else:
    print("Failed to query!")

In [3]:
simple_query = f'''
WITH game_opponents AS
(
    select 
      eventmetadata.gamecode game_code,
      event.teams[1].teamid team1_id,
      event.teams[2].teamid team2_id
    FROM datalakebasketball.api_events
    where leagueid = '1' 
        and season >= '2004'
)
            
select
  cast(t1.season as integer) season,
  t1.eventmetadata.gamecode game_code, 
  DATE_FORMAT(from_unixtime(t1.eventmetadata.gamedateutcepoch), '%Y-%m-%d') date,
  t1.teamid team_id,
  if(t1.teamid = t2.team1_id, t2.team2_id, t2.team1_id) opp_team_id,
  t1.points,
  t1.fieldgoals.attempted fg_attempted, 
  t1.fieldgoals.made fg_made,
  t1.freethrows.attempted ft_attempted, 
  t1.freethrows.made ft_made,
  t1.rebounds.offensive offensive_rebounds, 
  t1.rebounds.defensive defensive_rebounds,
  t1.turnovers.total + t1.turnovers.team turnovers  
from
  datalakebasketball.team_stats_game t1
  LEFT JOIN game_opponents t2
    ON t1.eventmetadata.gamecode=t2.game_code
where
  t1.season>='2004' and t1.season<='2018'
  and t1.leagueid='1'
  and t1.teamid not in (53, 54)
  and t1.eventmetadata.eventtypeid=1
  and t1.points is not null
order by t1.season, t1.teamid, t1.eventmetadata.gamedateutcepoch -- eventmetadata.gamecode
'''

if True:
    stats_df = cursor.execute(simple_query).as_pandas()
    print(stats_df.info())
else:
    print("Failed to query!")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36418 entries, 0 to 36417
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   season              36418 non-null  Int64 
 1   game_code           36418 non-null  Int64 
 2   date                36418 non-null  object
 3   team_id             36418 non-null  Int64 
 4   opp_team_id         36418 non-null  Int64 
 5   points              36418 non-null  Int64 
 6   fg_attempted        36418 non-null  Int64 
 7   fg_made             36418 non-null  Int64 
 8   ft_attempted        36418 non-null  Int64 
 9   ft_made             36418 non-null  Int64 
 10  offensive_rebounds  36418 non-null  Int64 
 11  defensive_rebounds  36418 non-null  Int64 
 12  turnovers           36418 non-null  Int64 
dtypes: Int64(12), object(1)
memory usage: 4.0+ MB
None


In [None]:
# why the stats for those games are missing?
# do I need to impute the missing values for those games?
# some games are simply postponed, such as 917610. Maybe the solution is filter-out those cases

In [4]:
# self join to create opponent parameters!
stats_df_opp = stats_df.drop(columns=['opp_team_id'])
id = stats_df_opp.columns.isin(['season','game_code','date','team_id'])
stats_df_opp.rename(columns={name:name+"_conceded" for name in stats_df_opp.columns[~id]}, inplace=True)
stats_df_opp.rename(columns={'team_id':'opp_team_id'}, inplace=True)

stats_df = pd.merge(stats_df, stats_df_opp, how='left', on=['season','game_code','date','opp_team_id'])

In [5]:
# groupby previous - seasonal average results
season_df = stats_df.drop(columns=['game_code','date','opp_team_id'])
season_df['season'] = season_df['season'] + 1
season_df = season_df.loc[season_df.season <= 2018]
gd = season_df.groupby(['season','team_id'])
season_df = gd.mean().reset_index()
season_df.head()

Unnamed: 0,season,team_id,points,fg_attempted,fg_made,ft_attempted,ft_made,offensive_rebounds,defensive_rebounds,turnovers,points_conceded,fg_attempted_conceded,fg_made_conceded,ft_attempted_conceded,ft_made_conceded,offensive_rebounds_conceded,defensive_rebounds_conceded,turnovers_conceded
0,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098
1,2005,2,101.268293,79.402439,37.146341,28.329268,21.646341,11.085366,29.731707,15.817073,100.402439,81.268293,36.097561,29.54878,22.256098,12.670732,29.597561,15.573171
2,2005,3,88.439024,79.95122,33.146341,22.317073,17.085366,12.426829,27.817073,14.841463,95.512195,76.817073,34.719512,25.878049,20.04878,11.54878,30.304878,14.243902
3,2005,4,94.45122,80.390244,34.743902,24.97561,18.731707,12.207317,31.597561,16.719512,93.390244,79.853659,33.695122,28.109756,20.926829,11.939024,30.780488,15.512195
4,2005,5,96.512195,81.54878,36.463415,26.512195,19.926829,13.621951,28.682927,13.914634,95.719512,78.585366,35.52439,26.036585,19.52439,11.439024,28.341463,14.792683


In [6]:
# Use previous seasonal average to create initial values for current season
tmp = pd.DataFrame({'date':['00','01','02','03','04'], 'game_code':[0,1,2,3,4], 'opp_team_id':[0,0,0,0,0]})
tmp['key'] = 0
season_df['key'] = 0
initial_df = season_df.merge(tmp, how='left', on = 'key').drop(columns=['key'])
initial_df.head(11)

Unnamed: 0,season,team_id,points,fg_attempted,fg_made,ft_attempted,ft_made,offensive_rebounds,defensive_rebounds,turnovers,points_conceded,fg_attempted_conceded,fg_made_conceded,ft_attempted_conceded,ft_made_conceded,offensive_rebounds_conceded,defensive_rebounds_conceded,turnovers_conceded,date,game_code,opp_team_id
0,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,0,0,0
1,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,1,1,0
2,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,2,2,0
3,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,3,3,0
4,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,4,4,0
5,2005,2,101.268293,79.402439,37.146341,28.329268,21.646341,11.085366,29.731707,15.817073,100.402439,81.268293,36.097561,29.54878,22.256098,12.670732,29.597561,15.573171,0,0,0
6,2005,2,101.268293,79.402439,37.146341,28.329268,21.646341,11.085366,29.731707,15.817073,100.402439,81.268293,36.097561,29.54878,22.256098,12.670732,29.597561,15.573171,1,1,0
7,2005,2,101.268293,79.402439,37.146341,28.329268,21.646341,11.085366,29.731707,15.817073,100.402439,81.268293,36.097561,29.54878,22.256098,12.670732,29.597561,15.573171,2,2,0
8,2005,2,101.268293,79.402439,37.146341,28.329268,21.646341,11.085366,29.731707,15.817073,100.402439,81.268293,36.097561,29.54878,22.256098,12.670732,29.597561,15.573171,3,3,0
9,2005,2,101.268293,79.402439,37.146341,28.329268,21.646341,11.085366,29.731707,15.817073,100.402439,81.268293,36.097561,29.54878,22.256098,12.670732,29.597561,15.573171,4,4,0


In [7]:
# combine initial values with current season game values
stats_df = stats_df.loc[stats_df.season > 2004]
stats_df = pd.concat([initial_df, stats_df], axis=0)
stats_df.sort_values(by=['season','team_id','date'], axis=0, inplace=True)
stats_df.head(7)

Unnamed: 0,season,team_id,points,fg_attempted,fg_made,ft_attempted,ft_made,offensive_rebounds,defensive_rebounds,turnovers,points_conceded,fg_attempted_conceded,fg_made_conceded,ft_attempted_conceded,ft_made_conceded,offensive_rebounds_conceded,defensive_rebounds_conceded,turnovers_conceded,date,game_code,opp_team_id
0,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,00,0,0
1,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,01,1,0
2,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,02,2,0
3,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,03,3,0
4,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,04,4,0
2460,2005,1,97.0,93.0,39.0,21.0,16.0,20.0,27.0,22.0,122.0,74.0,43.0,45.0,29.0,11.0,27.0,20.0,2005-11-03,659672,9
2461,2005,1,77.0,69.0,24.0,34.0,23.0,13.0,27.0,21.0,92.0,77.0,34.0,34.0,23.0,17.0,34.0,18.0,2005-11-05,659689,12


In [8]:
# calculate moving avg of last 5 games

id = stats_df.columns.isin(['season', 'team_id', 'opp_team_id', 'game_code', 'date'])
stats_df.rename(columns={name:name+'_l5' for name in stats_df.columns[~id]}, inplace=True)

gd = stats_df.groupby(by=['season','team_id'])
stats_df.loc[:,~id] = gd.apply(lambda df: df.loc[:,~id].rolling(5).mean().shift(1))

stats_df = stats_df.loc[stats_df.game_code>4,:]
stats_df.head(7)

Unnamed: 0,season,team_id,points_l5,fg_attempted_l5,fg_made_l5,ft_attempted_l5,ft_made_l5,offensive_rebounds_l5,defensive_rebounds_l5,turnovers_l5,points_conceded_l5,fg_attempted_conceded_l5,fg_made_conceded_l5,ft_attempted_conceded_l5,ft_made_conceded_l5,offensive_rebounds_conceded_l5,defensive_rebounds_conceded_l5,turnovers_conceded_l5,date,game_code,opp_team_id
2460,2005,1,92.743902,81.365854,35.878049,24.317073,17.280488,13.414634,28.47561,16.085366,102.45122,77.914634,37.121951,29.219512,22.512195,11.0,30.280488,14.756098,2005-11-03,659672,9
2461,2005,1,93.595122,83.692683,36.502439,23.653659,17.02439,14.731707,28.180488,17.268293,106.360976,77.131707,38.297561,32.37561,23.809756,11.0,29.62439,15.804878,2005-11-05,659689,12
2462,2005,1,90.446341,81.219512,34.126829,25.590244,18.168293,14.64878,27.885366,18.25122,104.270732,76.94878,37.673171,33.331707,23.907317,12.2,30.368293,16.453659,2005-11-06,659705,22
2463,2005,1,90.497561,79.746341,33.95122,25.326829,17.912195,13.565854,26.990244,16.834146,102.580488,76.765854,36.84878,34.687805,24.204878,12.8,29.912195,16.902439,2005-11-09,659729,13
2464,2005,1,91.34878,78.473171,34.17561,25.663415,18.256098,12.682927,27.495122,18.017073,102.690244,75.782927,37.02439,35.443902,23.702439,11.8,29.056098,16.75122,2005-11-11,659763,12
2465,2005,1,91.8,80.4,34.4,26.2,18.8,13.4,26.8,17.2,102.6,75.2,36.8,36.6,24.6,11.6,30.4,17.4,2005-11-13,659783,29
2466,2005,1,89.2,78.4,33.2,25.8,18.4,12.6,27.2,15.2,95.2,74.8,34.0,32.8,22.8,11.4,30.6,16.0,2005-11-16,659817,24


In [9]:
# stats_df add index for each team
gd = stats_df.groupby(['season','team_id'])
stats_df['game_num'] = gd.cumcount() + 1
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33958 entries, 2460 to 36417
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season                          33958 non-null  Int64  
 1   team_id                         33958 non-null  Int64  
 2   points_l5                       33958 non-null  float64
 3   fg_attempted_l5                 33958 non-null  float64
 4   fg_made_l5                      33958 non-null  float64
 5   ft_attempted_l5                 33958 non-null  float64
 6   ft_made_l5                      33958 non-null  float64
 7   offensive_rebounds_l5           33958 non-null  float64
 8   defensive_rebounds_l5           33958 non-null  float64
 9   turnovers_l5                    33958 non-null  float64
 10  points_conceded_l5              33958 non-null  float64
 11  fg_attempted_conceded_l5        33958 non-null  float64
 12  fg_made_conceded_l5          

### (2) Elo rating

In [19]:
# Note: in elo_data, team_1 is home team
os.getcwd()
elo_data = pd.read_csv('runtime/datasets/nba_elo.csv')
elo_data.season = elo_data.season - 1
elo_data = elo_data[(elo_data.season>=2005) & (elo_data.season<=2018)]

In [20]:
# drop not needed columns
# only keep regular season games
id = [bool(re.match("(carm)|(raptor)|(playoff)|(neutral).*", i)) for i in elo_data.columns]
elo_data=elo_data.loc[elo_data.playoff.isna(), ~np.array(id)]
elo_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16979 entries, 50257 to 68323
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       16979 non-null  object 
 1   season     16979 non-null  int64  
 2   team1      16979 non-null  object 
 3   team2      16979 non-null  object 
 4   elo1_pre   16979 non-null  float64
 5   elo2_pre   16979 non-null  float64
 6   elo_prob1  16979 non-null  float64
 7   elo_prob2  16979 non-null  float64
 8   elo1_post  16979 non-null  float64
 9   elo2_post  16979 non-null  float64
 10  score1     16979 non-null  float64
 11  score2     16979 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 1.7+ MB


In [21]:
# add team_id
team_dict = {'ATL':1, 'BOS':2, 'BRK':17, 'CHI':4, 'CHO':5312, 'CLE':5, 'DAL':6, 'DEN':7, 'DET':8,
       'GSW':9, 'HOU':10, 'IND':11, 'LAC':12, 'LAL':13, 'MEM':29, 'MIA':14, 'MIL':15, 'MIN':16,
       'NJN':17, 'NOK':3, 'NOP':3, 'NYK':18, 'OKC':25, 'ORL':19, 'PHI':20, 'PHO':21, 'POR':22,
       'SAC':23, 'SAS':24, 'SEA':25, 'TOR':28, 'UTA':26, 'WAS':27}
team_dict=pd.DataFrame(pd.Series(team_dict) ).reset_index()
team_dict.columns = ['team','team_id']

elo_data = pd.merge(elo_data, team_dict, left_on='team1', right_on='team', how='left')
elo_data.drop('team', axis=1, inplace=True)
elo_data.rename(columns={'team_id':'team1_id'}, inplace=True)

elo_data = pd.merge(elo_data, team_dict, left_on='team2', right_on='team', how='left')
elo_data.drop('team', axis=1, inplace=True)
elo_data.rename(columns={'team_id':'team2_id'}, inplace=True)

In [22]:
# add game_num for each team
# concat game team pair into one team dataframe
df_1 = elo_data.loc[:,['date','season','team1_id']]
df_2 = elo_data.loc[:,['date','season','team2_id']]

df_1.rename(columns={'team1_id':'team_id'}, inplace=True)
df_2.rename(columns={'team2_id':'team_id'}, inplace=True)

df = pd.concat([df_1,df_2], axis=0).sort_values(by=['season','team_id','date'])

# add game_num
gd = df.groupby(['season','team_id'])
df['game_num']=gd.cumcount() + 1

# merge game_num back into elo_data
elo_data=pd.merge(elo_data, df, how='left', left_on=['date','season','team1_id'], right_on=['date','season','team_id'])
elo_data.rename(columns={'game_num':'team1_game_num'}, inplace=True)
elo_data.drop(columns=['team_id'], inplace=True)

elo_data=pd.merge(elo_data, df, how='left', left_on=['date','season','team2_id'], right_on=['date','season','team_id'])
elo_data.rename(columns={'game_num':'team2_game_num'}, inplace=True)
elo_data.drop(columns=['team_id'], inplace=True)

In [24]:
# merge stats_df and elo_data
game_df = pd.merge(elo_data, stats_df.drop(columns=['date','opp_team_id']), how='left', 
                                           left_on=['season','team1_id','team1_game_num'], 
                                            right_on=['season','team_id','game_num'])
game_df.drop(columns=['game_num','team_id'], inplace=True)

col_names = ['points_l5', 'fg_attempted_l5', 'fg_made_l5',
                   'ft_attempted_l5', 'ft_made_l5', 'offensive_rebounds_l5',
                   'defensive_rebounds_l5', 'turnovers_l5',
                    'points_conceded_l5',
                   'fg_attempted_conceded_l5', 'fg_made_conceded_l5',
                   'ft_attempted_conceded_l5', 'ft_made_conceded_l5',
                   'offensive_rebounds_conceded_l5', 'defensive_rebounds_conceded_l5',
                   'turnovers_conceded_l5'
            ]
game_df.rename(columns={name:('t1_'+name) for name in col_names}, inplace=True)
game_df.columns

Index(['date', 'season', 'team1', 'team2', 'elo1_pre', 'elo2_pre', 'elo_prob1',
       'elo_prob2', 'elo1_post', 'elo2_post', 'score1', 'score2', 'team1_id',
       'team2_id', 'team1_game_num', 'team2_game_num', 't1_points_l5',
       't1_fg_attempted_l5', 't1_fg_made_l5', 't1_ft_attempted_l5',
       't1_ft_made_l5', 't1_offensive_rebounds_l5', 't1_defensive_rebounds_l5',
       't1_turnovers_l5', 't1_points_conceded_l5',
       't1_fg_attempted_conceded_l5', 't1_fg_made_conceded_l5',
       't1_ft_attempted_conceded_l5', 't1_ft_made_conceded_l5',
       't1_offensive_rebounds_conceded_l5',
       't1_defensive_rebounds_conceded_l5', 't1_turnovers_conceded_l5',
       'game_code'],
      dtype='object')

In [25]:
game_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16984 entries, 0 to 16983
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   date                               16984 non-null  object 
 1   season                             16984 non-null  int64  
 2   team1                              16984 non-null  object 
 3   team2                              16984 non-null  object 
 4   elo1_pre                           16984 non-null  float64
 5   elo2_pre                           16984 non-null  float64
 6   elo_prob1                          16984 non-null  float64
 7   elo_prob2                          16984 non-null  float64
 8   elo1_post                          16984 non-null  float64
 9   elo2_post                          16984 non-null  float64
 10  score1                             16984 non-null  float64
 11  score2                             16984 non-null  flo

In [26]:
game_df = pd.merge(game_df, stats_df.drop(columns=['date','game_code','opp_team_id']), how='left', 
                                        left_on=['season','team2_id','team2_game_num'], 
                                         right_on=['season','team_id','game_num'])

game_df.drop(columns=['game_num','team_id'], inplace=True)

game_df.rename(columns={name:('t2_'+name) for name in col_names}, inplace=True)
game_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16984 entries, 0 to 16983
Data columns (total 49 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   date                               16984 non-null  object 
 1   season                             16984 non-null  int64  
 2   team1                              16984 non-null  object 
 3   team2                              16984 non-null  object 
 4   elo1_pre                           16984 non-null  float64
 5   elo2_pre                           16984 non-null  float64
 6   elo_prob1                          16984 non-null  float64
 7   elo_prob2                          16984 non-null  float64
 8   elo1_post                          16984 non-null  float64
 9   elo2_post                          16984 non-null  float64
 10  score1                             16984 non-null  float64
 11  score2                             16984 non-null  flo

## 0.2 in-game team data

In [None]:
simple_query = f'''
with t1 as(
    select 
        CAST(season AS INTEGER) season,
        eventmetadata.gamecode game_code,
        eventmetadata.eventtypeid event_type_id,

        if(teammetadata[1].ishometeam, teammetadata[1].score, teammetadata[2].score ) home_final_score,
        if(teammetadata[1].ishometeam, teammetadata[2].score, teammetadata[1].score ) away_final_score,

        CAST(pbp.playid AS INTEGER) play_id, 
        pbp.period,
        GREATEST(4 - pbp.period, 0)*720 + pbp.time.minutes*60 + CAST(pbp.time.seconds AS double) time_left,

        pbp.teamid team_id,

        if((teammetadata[1].ishometeam and teammetadata[1].teamid=pbp.teamid) or
          (teammetadata[2].ishometeam and teammetadata[2].teamid=pbp.teamid), 1, 0) is_home_team,

        pbp.playevent.playeventid play_event_id, 
        pbp.playevent.name play_name, -- pbp.playtext play_text,

        if(pbp.playevent.playeventid in (1,2), 1, 0) ft_attempts,
        if(pbp.playevent.playeventid in (3,4), 1, 0) fg_attempts,
        if(pbp.playevent.playeventid in (5), 1, 0) offensive_rebounds,
        if(pbp.playevent.playeventid in (6), 1, 0) defensive_rebounds,
        if(pbp.playevent.playeventid in (7), 1, 0) turnover,

        pbp.homescore home_score, pbp.visitorscore away_score, 
        pbp.homefouls home_fouls, pbp.visitorfouls away_fouls
    from datalakebasketball.pbp
    where pbp.leagueid='1' 
        and pbp.teamid is not null
        and season='2004'
    order by season, game_code, play_id
)
  
select
    season, game_code, event_type_id, home_final_score, away_final_score,
    play_id, period, time_left, team_id, is_home_team, play_event_id,

    home_score, away_score,
    home_fouls, away_fouls,

    if(is_home_team=1, t1.ft_attempts, 0) home_ft_attempts,
    if(is_home_team=0, t1.ft_attempts, 0) away_ft_attempts,

    if(is_home_team=1, t1.fg_attempts, 0) home_fg_attempts,
    if(is_home_team=0, t1.fg_attempts, 0) away_fg_attempts,

    if(is_home_team=1, t1.offensive_rebounds, 0) home_offensive_rebounds,
    if(is_home_team=0, t1.offensive_rebounds, 0) away_offensive_rebounds,

    if(is_home_team=1, t1.defensive_rebounds, 0) home_defensive_rebounds,
    if(is_home_team=0, t1.defensive_rebounds, 0) away_defensive_rebounds,

    if(is_home_team=1, t1.turnover, 0) home_turnover,
    if(is_home_team=0, t1.turnover, 0) away_turnover
from t1
'''

if True:
    pbp_df = cursor.execute(simple_query).as_pandas()
    print(pbp_df.info())
else:
    print("Failed to query!")

# 1 Feature / label creation

In [161]:
num_fields = ['elo1_pre', 'elo2_pre', 
                    
                    't1_points_l5',
                    't1_fg_attempted_l5', 't1_fg_made_l5', 
                    't1_ft_attempted_l5', 't1_ft_made_l5', 
                    't1_offensive_rebounds_l5', 't1_defensive_rebounds_l5',
                    't1_turnovers_l5', 
                    
                    't1_points_conceded_l5',
                    't1_fg_attempted_conceded_l5', 't1_fg_made_conceded_l5',
                    't1_ft_attempted_conceded_l5', 't1_ft_made_conceded_l5',
                    't1_offensive_rebounds_conceded_l5', 't1_defensive_rebounds_conceded_l5', 
                    't1_turnovers_conceded_l5',
                    
                    't2_points_l5', 
                    't2_fg_attempted_l5', 't2_fg_made_l5',
                    't2_ft_attempted_l5', 't2_ft_made_l5', 
                    't2_offensive_rebounds_l5', 't2_defensive_rebounds_l5', 
                    't2_turnovers_l5', 
                    
                    't2_points_conceded_l5',
                    't2_fg_attempted_conceded_l5', 't2_fg_made_conceded_l5',
                    't2_ft_attempted_conceded_l5', 't2_ft_made_conceded_l5',
                    't2_offensive_rebounds_conceded_l5', 't2_defensive_rebounds_conceded_l5', 
                    't2_turnovers_conceded_l5'
             ]
cat_fields = []

In [162]:
id_train = (game_df.season < 2018)  #& (pbp_df.period <= 4) #| (win_df.week < 22)) 
id_test  = (game_df.season == 2018) #& (pbp_df.period <= 4) # & (~win_df['winProbability.before'].isnull())

# None-StandardScaler version
transform_pipeline = ColumnTransformer(transformers=[
                                            ('num', StandardScaler(), num_fields),
                                            #('cat', OneHotEncoder(categories='auto'), cat_fields)
                                        ])

features_train = transform_pipeline.fit_transform(game_df[id_train])
features_test  = transform_pipeline.transform(game_df[id_test])

feature_names = num_fields.copy()
# cat_one_hot_fields = list(transform_pipeline_drive.named_transformers_.cat.get_feature_names(input_features=cat_fields_drive))
# feature_names_drive.extend(cat_one_hot_fields)

print(features_train.shape, features_test.shape)

(15754, 34) (1230, 34)


In [151]:
# for non-parametric modeling -> label needs to be put into numerical categories
label = game_df.score1 - game_df.score2
label = label.astype('int32')
label.loc[label > 20] = 20
label.loc[label < -20] = -20
label[label>0] = label[label>0] - 1
label_train = label[id_train]
lable_test  = label[id_test]

In [152]:
label.describe()
np.sort(label.unique())

array([-20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10,  -9,  -8,
        -7,  -6,  -5,  -4,  -3,  -2,  -1,   0,   1,   2,   3,   4,   5,
         6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,
        19], dtype=int32)

# 2 Model Study

In [220]:
# instantiate models
folds = 5

model_logistic = LogisticRegression(solver='lbfgs', fit_intercept=True, C=0.1, max_iter=100000, tol=1e-5, random_state=42)

# depth of 10 seems better than 15
model_RFC = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=5, random_state=42)

model_ordinal = LogisticAT(alpha=100)  # alpha parameter set to zero to perform no regularisation

model_linear = SGDRegressor(max_iter=1000, tol=1e-3)

#model_SGD = SGDClassifier(loss='log', penalty=None, fit_intercept=True, max_iter=10000, tol=1e-4, random_state=42)

#model_SVC = LinearSVC(max_iter=10000, tol=1e-4, random_state=42)

model_RFR = RandomForestRegressor(n_estimators=100, max_depth=20, min_samples_leaf=10, random_state=42)

In [134]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_DL_model():
    model_DL = keras.models.Sequential([
        keras.layers.Flatten(input_shape=[34,]),
        keras.layers.Dropout(rate=0.2),
        keras.layers.BatchNormalization(),
        #keras.layers.Dense(250, activation='elu', kernel_initializer="he_normal"),
        #keras.layers.Dropout(rate=0.2),
        #keras.layers.BatchNormalization(),
        keras.layers.Dense(150, activation='elu', kernel_initializer="he_normal"),
        keras.layers.Dropout(rate=0.2),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(80, activation='elu', kernel_initializer="he_normal"),
        keras.layers.Dropout(rate=0.2),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(30, activation='elu', kernel_initializer="he_normal"),
        keras.layers.Dropout(rate=0.2),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(40, activation='softmax')
    ])

    # multi-nomial classification
    #model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['AUC'])

    # binary classification
    optimizer = keras.optimizers.SGD(learning_rate=0.01, decay=1e-4)
    model_DL.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=[['mae']])

    #model_DL.summary()
    
    return model_DL

model_DL = KerasClassifier(build_fn=create_DL_model, epochs=200, verbose=0)

In [163]:
(features_train).shape

(15754, 34)

In [217]:
MAE_linear = cross_validate(model_linear,
    features_train,
    label_train,
    cv=folds,
    scoring=['neg_mean_absolute_error'])
print('Linear regression: {:.3f}'.format(-MAE_linear['test_neg_mean_absolute_error'].mean()) )

Linear regression: 8.274


In [165]:
# Random forest classification

scores_RFC = cross_validate(model_RFC,
    features_train,
    label_train,
    cv=folds,
    scoring=(['neg_mean_absolute_error']) )
print('Random Forest Classifier (mae): {:.3}'.format(-scores_RFC['test_neg_mean_absolute_error'].mean()) )

Random Forest Classifier (mae): 14.4


In [160]:
# Random forest classification

scores_RFR = cross_validate(model_RFR,
    features_train,
    label_train,
    cv=folds,
    scoring=(['neg_mean_absolute_error']))
print('Random Forest Regressor (mae): {:.3}'.format(-scores_RFC['test_neg_mean_absolute_error'].mean() ) )

Random Forest Regressor (mae): 11.7


In [221]:
MAE = make_scorer(mean_absolute_error)

MAE_ordinal = cross_validate(model_ordinal,
    features_train,
    label_train,
    cv=folds,
    scoring=['neg_mean_absolute_error'])
print('Ordered logistic regression (mae): {:.3}'.format(-MAE_ordinal['test_neg_mean_absolute_error'].mean() ) )

Ordered logistic regression (mae): 8.24


In [97]:
print('Ordered logistic regression (mae/accuracy): {:.3f}, {:.3f}'.format(
                                                    MAE_ordinal['test_neg_mean_absolute_error'].mean(), 
                                                    MAE_ordinal['test_accuracy'].mean()) )

Ordered logistic regression (mae/accuracy): -7.014, 0.056


In [None]:
# does the prediction from ordinal regression fall into the range?

In [159]:
# Deep learning classification

scores_DL = cross_validate(model_DL,
    features_train,
    label_train,
    cv=folds,
    scoring=(['neg_mean_absolute_error']))

# print('Win Probability - Random Forest Classifier (P/R/F1/ROC): {:.2%}  {:.2%}  {:.2%} {:.2%}'.format(
#                                                                                 #scores_DL['test_recall'].mean(), 
#                                                                                 #scores_DL['test_precision'].mean(), 
#                                                                                 #scores_DL['test_f1'].mean(),
#                                                                                 0.0,0.0,0.0,
#                                                                                 scores_DL['test_roc_auc'].mean()))

Traceback (most recent call last):
  File "/Users/binhu/projects/nba-in-game-prediction-models/venv/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/binhu/projects/nba-in-game-prediction-models/venv/lib/python3.7/site-packages/tensorflow/python/keras/wrappers/scikit_learn.py", line 223, in fit
    return super(KerasClassifier, self).fit(x, y, **kwargs)
  File "/Users/binhu/projects/nba-in-game-prediction-models/venv/lib/python3.7/site-packages/tensorflow/python/keras/wrappers/scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "/Users/binhu/projects/nba-in-game-prediction-models/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 709, in fit
    shuffle=shuffle)
  File "/Users/binhu/projects/nba-in-game-prediction-models/venv/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 2651,

In [123]:
scores_DL['test_neg_mean_absolute_error'].mean()

-8.121318543369956

In [158]:
# feature importance study
model_RFC.fit(features_train, label_train)
feature_score = pd.DataFrame([feature_names, model_RFC.feature_importances_], index=['feature','importance']).transpose()
feature_score.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,elo1_pre,0.50208
1,elo2_pre,0.49792


# 2 Model Prediction

In [142]:
#model_ordinal.fit(features_train, label_train)
pred_p = model_ordinal.predict_proba(features_test)
pred = model_ordinal.predict(features_test)
#print(roc_auc_score(label_test, pred_p) 

In [147]:
pred_p[0]

array([0.04248696, 0.00612527, 0.00805544, 0.00970406, 0.01069702,
       0.01109331, 0.0140877 , 0.01428915, 0.01783006, 0.02150566,
       0.02407461, 0.02580026, 0.03236901, 0.03402594, 0.03461894,
       0.03426581, 0.03219989, 0.03026961, 0.03122289, 0.02391425,
       0.0240668 , 0.03185126, 0.03717868, 0.03500574, 0.04162618,
       0.03591087, 0.04101922, 0.03467343, 0.03173158, 0.02632733,
       0.02474576, 0.02106616, 0.02052151, 0.01539204, 0.01559677,
       0.01478939, 0.01086678, 0.0101273 , 0.00851231, 0.06035503])