In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [3]:
# Load the raw game log and preview the first rows.
games_csv = 'nba_games/games.csv'
df = pd.read_csv(games_csv)
df.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,...,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,...,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,...,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,1610612755,1610612765,2022,1610612755,113.0,0.441,0.909,...,27.0,49.0,1610612765,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,1610612737,1610612741,2022,1610612737,108.0,0.429,1.0,...,22.0,47.0,1610612741,110.0,0.5,0.773,0.292,20.0,47.0,0


Keeping only games played after 2021-01-01 (roughly the last two years of data), because rosters change and older games would not represent the current teams

In [4]:
# Parse the date column so rows can be compared chronologically
# (re-running this cell on an already-parsed column is a no-op).
df['GAME_DATE_EST'] = pd.to_datetime(df['GAME_DATE_EST'])

# Only games strictly after this date are kept.
CUTOFF_DATE = pd.Timestamp('2021-01-01')

# `.copy()` makes df_filtered an independent frame: without it the result is a
# slice of `df`, and adding columns to it later raises SettingWithCopyWarning.
df_filtered = df[df['GAME_DATE_EST'] > CUTOFF_DATE].copy()
df_filtered.head()


Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,...,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,...,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,...,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,1610612755,1610612765,2022,1610612755,113.0,0.441,0.909,...,27.0,49.0,1610612765,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,1610612737,1610612741,2022,1610612737,108.0,0.429,1.0,...,22.0,47.0,1610612741,110.0,0.5,0.773,0.292,20.0,47.0,0


Deriving a DAY_OF_YEAR feature from the game date and keeping only the columns we need (no renaming is required for this dataset)

In [62]:

# Derive the day of the year (1-366) from the game date in one vectorised step
# instead of looping over rows. `.assign` returns a new frame, so this never
# writes into a slice of `df` and the cell is safe to re-run.
df_filtered = df_filtered.assign(
    DAY_OF_YEAR=df_filtered['GAME_DATE_EST'].dt.dayofyear
)

# Keep only the model inputs and the target; copy so df_data is independent
# of df_filtered.
df_data = df_filtered[
    ['HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'HOME_TEAM_WINS', 'DAY_OF_YEAR']
].copy()
df_data.head()

Unnamed: 0,HOME_TEAM_ID,VISITOR_TEAM_ID,HOME_TEAM_WINS,DAY_OF_YEAR
0,1610612740,1610612759,1,356
1,1610612762,1610612764,1,356
2,1610612739,1610612749,1,355
3,1610612755,1610612765,1,355
4,1610612737,1610612741,0,355


Checking for missing values

In [63]:
# Show every row with at least one missing value; an empty result means the
# selected columns are complete.
has_missing = df_data.isna().any(axis=1)
df_data[has_missing]



Unnamed: 0,HOME_TEAM_ID,VISITOR_TEAM_ID,HOME_TEAM_WINS,DAY_OF_YEAR


Look for duplicate data. The check below returns `True`, so at least one duplicated row is present

In [64]:
# Boolean mask: True for each row that repeats an earlier row on all columns.
dups = df_data.duplicated(keep='first')
# True when at least one repeated row exists.
dups.any()

True

Baseline frequency

In [65]:
# Share of home wins (1) vs. home losses (0): the accuracy of always
# predicting the majority class is the baseline a model must beat.
df_data['HOME_TEAM_WINS'].value_counts(normalize=True)

1    0.558678
0    0.441322
Name: HOME_TEAM_WINS, dtype: float64

Pipeline

In [66]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Dense one-hot encoder used to preview the encoded matrix.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the current keyword first and fall back for older installs.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

In [75]:
# Model inputs: both team identifiers plus the day of the year.
features = ['HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'DAY_OF_YEAR']

# Target: 1 when the home team won, 0 otherwise.
y = df_data['HOME_TEAM_WINS']
X = df_data[features]

In [68]:
# One-hot encode every feature column; `remainder` forwards any column not
# listed in `features` unchanged.
ct = make_column_transformer((ohe, features), remainder='passthrough')

# Fit on X and preview the first five encoded rows.
a = ct.fit_transform(X)
a[:5]



array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [69]:
# Raw (un-encoded) feature rows, for comparison with the encoded matrix above.
X.head()

Unnamed: 0,HOME_TEAM_ID,VISITOR_TEAM_ID,DAY_OF_YEAR
0,1610612740,1610612759,356
1,1610612762,1610612764,356
2,1610612739,1610612749,355
3,1610612755,1610612765,355
4,1610612737,1610612741,355


Cross Validation

In [70]:
from sklearn.pipeline import make_pipeline

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Inputs and target for the model.
X = df_data[features]
y = df_data.HOME_TEAM_WINS

# Preprocessing: one-hot encode the feature columns. Categories unseen during
# fitting are ignored at prediction time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
ct = make_column_transformer((encoder, features), remainder='passthrough')

# Estimator and full pipeline (encode -> logistic regression).
lgr = LogisticRegression()
p1 = make_pipeline(ct, lgr)

# Mean accuracy over 5 cross-validation folds.
fold_scores = cross_val_score(p1, X, y, cv=5, scoring='accuracy')
fold_scores.mean()

0.5887603305785123