Business case:
Sports betting is a growing industry, and there is large amounts of money flowing around. This model attempts to predict which team would win in an NBA game and provide an upperhand to the "house", or the sports betting company. 

<h1>Data Processing</h1>

Imports

In [1]:
import pandas as pd

Read games data

In [2]:
games = pd.read_csv('nba_games/games.csv')

# filter to the last 2 seasons
games = games[games['SEASON'] > 2020]
games['GAME_DATE_EST'] = pd.to_datetime(games['GAME_DATE_EST'])

# build day of year, week of year, and month of year
for idx, row in games.iterrows():
    games.loc[idx, 'DAY'] = row['GAME_DATE_EST'].timetuple().tm_yday
    games.loc[idx, 'WEEK'] = row['GAME_DATE_EST'].isocalendar().week
    games.loc[idx, 'MONTH'] = row['GAME_DATE_EST'].month


Read game details set and merge with games set. Define separaet dfs for home team and visiting team

In [3]:
game_details = pd.read_csv('nba_games/games_details.csv')
df_home = pd.merge(game_details, games, left_on=["GAME_ID", "TEAM_ID"], right_on=['GAME_ID', "HOME_TEAM_ID"], how="inner")
df_visit = pd.merge(game_details, games, left_on=["GAME_ID", "TEAM_ID"], right_on=['GAME_ID', "VISITOR_TEAM_ID"], how="inner")

  game_details = pd.read_csv('nba_games/games_details.csv')


Check for nulls and find minimum number of players per team, minimum number of players to start a NBA game is 9

In [4]:
player_teams_df = df_home.groupby(['GAME_ID', 'TEAM_ID']).agg({'PLAYER_ID': ['count']})
print(player_teams_df[player_teams_df.isnull().any(axis=1)])
print(player_teams_df.min())

player_teams_df = df_visit.groupby(['GAME_ID', 'TEAM_ID']).agg({'PLAYER_ID': ['count']})
print(player_teams_df[player_teams_df.isnull().any(axis=1)])
print(player_teams_df.min())


Empty DataFrame
Columns: [(PLAYER_ID, count)]
Index: []
PLAYER_ID  count    10
dtype: int64
Empty DataFrame
Columns: [(PLAYER_ID, count)]
Index: []
PLAYER_ID  count    9
dtype: int64


Calculate seconds played per player per game per team.
Not using score because some players are important to the game through assists and such instead of scoring

In [5]:
def calculate_seconds_played(df):
    for idx, row in df.iterrows():
        minutes = row['MIN']
        seconds = 0
        if (isinstance(minutes, str)):
            m, s = minutes.split(":")
            seconds = int(float(m) * 60 + float(s))
        df.loc[idx, 'SEC'] = seconds
    return df

df_home = calculate_seconds_played(df_home)
df_visit = calculate_seconds_played(df_visit)


Filter dataset features

In [6]:
df_home = df_home[['GAME_ID', 'HOME_TEAM_ID', 'PLAYER_ID', 'SEC']]
df_visit = df_visit[['GAME_ID', 'VISITOR_TEAM_ID', 'PLAYER_ID', 'SEC']]


Pivot player IDs into rows

In [7]:
def pivot_players(df, home):
    gb = df.groupby(['GAME_ID', f"{home}_TEAM_ID"])
    game_teams = [gb.get_group(x) for x in gb.groups]
    
    df_pivoted = pd.DataFrame()
    
    for x in game_teams:
        tmp = x.nlargest(9, 'SEC')
        num = 1
        for idx, row in tmp.iterrows():
            tmp.loc[idx, 'SEC'] = f"{home}_P{num}"
            num += 1
            
        tmp.rename(columns={'SEC':'NUM'}, inplace=True)
        tmp = pd.pivot_table(tmp, index=['GAME_ID', f"{home}_TEAM_ID"], columns='NUM', values='PLAYER_ID', aggfunc= lambda r:r)
        df_pivoted = pd.concat([df_pivoted, tmp])
    
    return df_pivoted
    
df_home_pivoted = pivot_players(df_home, "HOME")
df_visit_pivoted = pivot_players(df_visit, "VISITOR")

Save pivoted data

In [8]:
df_visit_pivoted.to_csv("visit_pivoted.csv")
df_home_pivoted.to_csv("home_pivoted.csv")

Merge team sets and also merge game info like data and who won

In [9]:
home = pd.read_csv("home_pivoted.csv")
visit = pd.read_csv("visit_pivoted.csv")

games_filter = games[['GAME_ID', 'DAY', 'WEEK', 'MONTH', 'HOME_TEAM_WINS']]

df_data = pd.merge(home, visit, on=['GAME_ID'], how='outer')
df_data = pd.merge(df_data, games_filter, on='GAME_ID')

Save final processed data

In [10]:
df_data.to_csv("processed.csv")

<h1>Pipeline and Parameter Tuning</h1>

Imports

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression


from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import pandas as pd 
import os


Various ways to ignore warnings that clog up the output

In [12]:
import warnings
warnings.filterwarnings('ignore') 
os.environ['PYTHONWARNINGS']='ignore'

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

Read preprocessed data, setup numerical and categorical features and transformers, then build pipeline

In [13]:
dataframe = pd.read_csv("processed.csv")
features = dataframe.columns.to_list()
features.remove('Unnamed: 0')
features.remove('HOME_TEAM_WINS')

data = dataframe.values
X = dataframe[features]
y = dataframe.HOME_TEAM_WINS

numeric_features = ['WEEK'] #week of year
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_features = ['GAME_ID',
            'HOME_TEAM_ID',
            # 'HOME_P1',
            # 'HOME_P2',
            # 'HOME_P3',
            # 'HOME_P4',
            # 'HOME_P5',
            # 'HOME_P6',
            # 'HOME_P7',
            # 'HOME_P8',
            # 'HOME_P9',
            'VISITOR_TEAM_ID',
            # 'VISITOR_P1',
            # 'VISITOR_P2',
            # 'VISITOR_P3',
            # 'VISITOR_P4',
            # 'VISITOR_P5',
            # 'VISITOR_P6',
            # 'VISITOR_P7',
            # 'VISITOR_P8',
            # 'VISITOR_P9',
            ]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(transformers=[
    ("num_transform", numeric_transformer, numeric_features),
    ("cat_transform", categorical_transformer, categorical_features)
])
pipeline = Pipeline(steps=[("preprocesser", preprocessor), ("classifier", LogisticRegression())])
pipeline



Split test data and perform base run

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(score)

0.6124031007751938


Define parameter grid for grid serach

In [15]:
param_grid = [
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "classifier__C": [0.1, 1.0, 10.0, 100.0],
        "classifier": [LogisticRegression()]
    },
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "classifier__n_estimators": [10, 100, 1000],
        "classifier": [RandomForestClassifier()]
    },
]

Search grid and see best scores!

In [16]:
for classifier in param_grid:
    
    grid_search = GridSearchCV(pipeline, classifier, cv=10, verbose=1,n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best params:")
    print(grid_search.best_params_)
    print("Best score in grid search:")
    print(grid_search.best_score_)
    print("Best logistic regression from grid search:")
    print(grid_search.score(X_test, y_test))
    print()

#0.6459948320413437
#0.627906976744186

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best params:
{'classifier': LogisticRegression(C=100.0), 'classifier__C': 100.0, 'preprocesser__num_transform__imputer__strategy': 'mean'}
Best score in grid search:
0.6334059488898199
Best logistic regression from grid search:
0.6149870801033591

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best params:
{'classifier': RandomForestClassifier(), 'classifier__n_estimators': 100, 'preprocesser__num_transform__imputer__strategy': 'median'}
Best score in grid search:
0.6256933389191454
Best logistic regression from grid search:
0.6098191214470284



 Best score we saw was 0.6459948320413437, but usually around 0.627 depending on how the test data was split