In [40]:

import sqlite3
from pathlib import Path

import joblib
import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [41]:
def season_to_string(x):
    """Format a season's starting year as a season label, e.g. 2021 -> '2021-22'."""
    start_year = str(x)
    # Only the last two digits of the following year are kept.
    end_year_suffix = str(x + 1)[-2:]
    return '-'.join([start_year, end_year_suffix])

def get_data_from_db(target, db_filepath, test_season):
    """Load the matchup table from SQLite and split it into train/test sets by season.

    Parameters
    ----------
    target : str or list of str
        Column(s) of ``team_stats_ewa_matchup`` to use as the prediction target.
    db_filepath : path-like
        Location of the SQLite database file.
    test_season : int
        Starting year of the first test season. Rows whose ``SEASON`` label
        sorts before ``season_to_string(test_season)`` form the training set;
        all remaining rows form the test set.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, train_df, test_df)``. The ``X_*``
        frames have the identifier, outcome and rolling-window columns removed;
        ``train_df`` / ``test_df`` keep every column.
    """
    test_season_str = season_to_string(test_season)

    # Always release the connection, even when the query itself fails.
    connection = sqlite3.connect(db_filepath)
    try:
        df = pd.read_sql('SELECT * FROM team_stats_ewa_matchup', con=connection)
    finally:
        connection.close()

    df = df.drop(columns=['index'])
    df = df.sort_values('GAME_DATE')
    df = df.dropna()

    # Identifiers plus outcome / betting columns of the game being predicted.
    non_feature_columns = ['SEASON', 'HOME_TEAM_ABBREVIATION', 'GAME_DATE', 'GAME_ID', 'MATCHUP',
                           'HOME_HOME_GAME', 'HOME_TEAM_SCORE', 'HOME_ML', 'HOME_SPREAD',
                           'HOME_ATS_DIFF', 'HOME_TEAM_COVERED', 'HOME_POINT_DIFF',
                           'HOME_WL', 'AWAY_ML', 'AWAY_TEAM_SCORE']

    # Rolling-window (L5 / L10 / L20) statistics, own and opponent ("_opp"),
    # for both sides, e.g. 'HOME_PTS_L5' or 'AWAY_REB_opp_L20'.
    rolling_columns = [
        f'{side}_{stat}{suffix}_L{window}'
        for side in ('HOME', 'AWAY')
        for stat in ('PTS', 'PLUS_MINUS', 'NET_RATING', 'POSS', 'REB')
        for suffix in ('', '_opp')
        for window in (5, 10, 20)
    ]

    columns_to_drop = non_feature_columns + rolling_columns

    # Never let the target leak into the feature matrix, even when the caller
    # asks for a target that is not part of the fixed drop list above.
    target_columns = list(target) if pd.api.types.is_list_like(target) else [target]
    columns_to_drop += [col for col in target_columns if col not in columns_to_drop]

    # Season labels share the 'YYYY-YY' format, so string comparison orders
    # them chronologically.
    is_train = df['SEASON'] < test_season_str
    train_df = df.loc[is_train]
    test_df = df.loc[~is_train]

    X_train = train_df.drop(columns=columns_to_drop)
    y_train = train_df[target]

    X_test = test_df.drop(columns=columns_to_drop)
    y_test = test_df[target]

    return X_train, X_test, y_train, y_test, train_df, test_df
    
    
def tscv_by_season(train_df, test_season = 2021, earliest_year_with_data = 2013, min_training_years = 3):
    """Build expanding-window cross-validation splits with one fold per season.

    For every season starting at ``earliest_year_with_data + min_training_years``
    and ending before ``test_season``, the fold trains on all rows from earlier
    seasons and validates on the rows of that season.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain a ``SEASON`` column holding labels in the
        format produced by ``season_to_string``.
    test_season : int, default 2021
        Starting year of the held-out test season (excluded from every fold).
    earliest_year_with_data : int, default 2013
        Starting year of the first season present in the data.
    min_training_years : int, default 3
        Number of seasons reserved for training before the first validation fold.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        ``(train_positions, val_positions)`` pairs of integer row positions
        into ``train_df``, suitable for ``.iloc`` indexing.
    """
    seasons = train_df['SEASON']
    cv_splits = []

    first_val_year = earliest_year_with_data + min_training_years
    for year in range(first_val_year, test_season):
        season_label = season_to_string(year)
        # Row positions rather than index labels: the frame's index is not
        # guaranteed to be 0..n-1 (rows may have been sorted or dropped), and
        # positional consumers such as .iloc would otherwise select wrong rows.
        train_positions = np.flatnonzero((seasons < season_label).to_numpy())
        val_positions = np.flatnonzero((seasons == season_label).to_numpy())
        cv_splits.append((train_positions, val_positions))

    return cv_splits




In [55]:
db_filepath = Path.home().joinpath('NBA_model_v1', 'data', 'nba.db')
hyperparameter_filepath_out = Path.home().joinpath('NBA_model_v1', 'models', 'hyperparameter_tuning')

TEST_SEASON = 2021

# get_data_from_db opens and closes its own connection, so no connection is
# created here.
X_train, X_test, y_train, y_test, train_df, test_df = get_data_from_db(target='HOME_WL', db_filepath=db_filepath, test_season=TEST_SEASON)

cv_splits = tscv_by_season(train_df, test_season = TEST_SEASON)


def load_best_params(model_name):
    """Return the best hyperparameters stored in the Optuna study for ``model_name``.

    The study name is the full path of the study inside the hyperparameter
    tuning directory, and its storage is the SQLite file at that path with a
    ``.db`` suffix.
    """
    study_name = str(hyperparameter_filepath_out.joinpath(model_name))
    storage_name = "sqlite:///{}.db".format(study_name)
    study = optuna.load_study(study_name = study_name,
                              storage = storage_name)
    return study.best_params


# Base model 1: gradient-boosted trees.
lgbc = lgb.LGBMClassifier(**load_best_params('LGBMClassifier'))

# Base model 2: linear model with the default SGDClassifier loss; features are
# standardised first.
sgd_hinge = Pipeline([('scaler', StandardScaler()),
                      ('sgd', SGDClassifier(**load_best_params('SGDClassifierHinge_WinPredictor'),
                                            shuffle=False,
                                            random_state=23))])

# Base model 3: same pipeline with log loss, which makes predict_proba available.
sgd_logloss = Pipeline([('scaler', StandardScaler()),
                        ('sgd', SGDClassifier(**load_best_params('SGDClassifierLogLoss_WinPredictor'),
                                              loss='log_loss',
                                              shuffle=False,
                                              random_state=23))])


In [56]:

## Out-of-fold predictions of the base models (training data for the stacker)
# For every fold, each base model is fitted on the fold's training rows and
# scored on its validation rows. Per-fold columns: lgbc class probabilities,
# sgd_hinge decision values, sgd_logloss class probabilities, true label (last).
# NOTE: .iloc is positional, so cv_splits must hold row positions, not index labels.
fold_blocks = []
for id_train, id_test in cv_splits:
    X_fit, y_fit = X_train.iloc[id_train], y_train.iloc[id_train]
    X_val = X_train.iloc[id_test]

    fold_blocks.append(np.column_stack([
        lgbc.fit(X_fit, y_fit).predict_proba(X_val),
        sgd_hinge.fit(X_fit, y_fit).decision_function(X_val),
        sgd_logloss.fit(X_fit, y_fit).predict_proba(X_val),
        y_train.iloc[id_test].values,
    ]))

# np.vstack replaces np.row_stack, which is deprecated as of NumPy 2.0.
cross_val_predict = np.vstack(fold_blocks)



In [57]:
# Refit every base model on the full training set, in the same order as the
# stacked meta-feature columns.
for base_model in (lgbc, sgd_hinge, sgd_logloss):
    base_model.fit(X_train, y_train)

(6141, 6)

In [65]:
# Inspect the stacked out-of-fold matrix. Feature columns follow the
# column_stack order used to build it (lgbc predict_proba, sgd_hinge
# decision_function, sgd_logloss predict_proba); the last column is the label.
pd.DataFrame(cross_val_predict)

Unnamed: 0,0,1,2,3,4,5
0,0.781840,0.218160,-2.115541,0.917376,0.082624,0.0
1,0.594992,0.405008,0.832013,0.396868,0.603132,1.0
2,0.737293,0.262707,-1.070229,0.697020,0.302980,1.0
3,0.139505,0.860495,4.345256,0.012195,0.987805,1.0
4,0.439899,0.560101,0.868259,0.270190,0.729810,0.0
...,...,...,...,...,...,...
6136,0.623667,0.376333,-0.934718,0.746453,0.253547,1.0
6137,0.410235,0.589765,0.864563,0.323944,0.676056,1.0
6138,0.491780,0.508220,0.866656,0.297271,0.702729,1.0
6139,0.347287,0.652713,0.044264,0.515413,0.484587,0.0


In [66]:
# Train the meta-learner on the out-of-fold matrix: every column except the
# last is a base-model output, the last column is the true label.
meta_features = cross_val_predict[:, :-1]
meta_labels = cross_val_predict[:, -1]

stacking = LogisticRegression()
stacking.fit(meta_features, meta_labels)

In [71]:
# Build the test-set meta-features in the same column order that was used for
# the out-of-fold matrix, then let the meta-learner predict the class labels.
test_meta_features = np.column_stack([
    lgbc.predict_proba(X_test),
    sgd_hinge.decision_function(X_test),
    sgd_logloss.predict_proba(X_test),
])
stacking_preds = stacking.predict(test_meta_features)

In [72]:
# Show the stacking model's predicted class labels for the test set.
stacking_preds

array([0., 1., 1., ..., 0., 1., 1.])

In [74]:
# Test-set accuracy of the stacked model. scikit-learn's convention is
# (y_true, y_pred); accuracy is symmetric, so the score itself is unchanged.
# accuracy_score is imported with the other dependencies at the top of the notebook.
accuracy_score(y_test, stacking_preds)

0.679902755267423

In [39]:
# Meta-learner weights, one per meta-feature column.
# NOTE(review): the saved output lists 4 coefficients while the out-of-fold
# matrix shown earlier has 5 feature columns, and this cell's execution count
# (39) is lower than that of the cell fitting `stacking` (66) -- the output
# looks stale; re-run after Restart & Run All to confirm.
stacking.coef_[0]

array([ 0.23982098,  0.12697018,  0.82236713, -0.07045641])