# Baseline Logistic Regression 

In [13]:
import pandas as pd 
import numpy as np 

## Setting Up the Data 

In [14]:
df = pd.read_csv('modeling_data_sample.csv')
print(df.shape)
df.head()

(150, 18)


Unnamed: 0,TEAM_SEASON,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,HEIGHT_INCHES,WEIGHT,EXP,LS_WIN_PCT,PO_WINS,TEAM_ID,TEAM_NAME,PO_WINS_CAT
0,1610612737_2019-20,19.856225,0.517043,0.25218,0.149825,0.226813,0.52662,0.313881,0.133524,0.235405,78.8125,217.25,3.6875,0.354,0.0,1610612737,Hawks,R1/Lottery
1,1610612737_2020-21,24.459721,0.519798,0.257725,0.148807,0.214129,0.536863,0.277694,0.14251,0.234664,78.235294,216.058824,3.823529,0.299,10.0,1610612737,Hawks,Conf. Finals
2,1610612737_2021-22,19.359828,0.527291,0.280572,0.134667,0.223747,0.53592,0.260534,0.126992,0.220961,78.352941,212.882353,4.411765,0.569,1.0,1610612737,Hawks,R1/Lottery
3,1610612737_2022-23,19.634129,0.515657,0.252946,0.140765,0.214893,0.53273,0.251615,0.131051,0.222704,78.352941,212.705882,2.823529,0.524,2.0,1610612737,Hawks,R1/Lottery
4,1610612737_2023-24,17.248412,0.555474,0.240233,0.135582,0.279755,0.537575,0.270382,0.132214,0.209344,78.352941,212.705882,3.823529,0.5,,1610612737,Hawks,


In [15]:
def modeling_data_setup(data,test_seasons,this_season,model_type):
    # returns x, xtest, y, ytest, this season's data 
    
    import pandas as pd
    import numpy as np 
    
    if type(test_seasons) == int:
        test_seasons = [test_seasons]
    elif type(test_seasons) == list:
        test_seasons = test_seasons
    else:
        raise TypeError("ERROR! Please enter the test_seasons as type int or list (of int)")
        
    for i in test_seasons:
        if type(i)!=int:
            raise TypeError("ERROR! Please enter the test_seasons as type int or list (of int)")
    
    import pandas as pd
    import numpy as np 
    from sklearn.model_selection import train_test_split
    
    print('columns to drop:')
    drop_cols = ['TEAM_SEASON','TEAM_ID','TEAM_NAME','PO_WINS','PO_WINS_CAT','SEASON']
    print(drop_cols)
    
    print('Model Type: {}'.format(model_type))
    if model_type == 'cat':
        target = 'PO_WINS_CAT'
    elif model_type == 'reg':
        target = 'PO_WINS'
    else:
        raise TypeError("ERROR! Enter model_type! Options: 'cat' or 'reg'")
    
    print('Creating SEASON for splitting')
    ids = data['TEAM_SEASON'].str.split('_')
    season = [int(x[1][0:4]) for x in ids]
    data['SEASON'] = season
    
    print("Dropping this season: {}".format(this_season))
    current = data[data['SEASON']==this_season].copy()
    
    test = data[data['SEASON'].isin(test_seasons)].copy()
    train = data[(~data['SEASON'].isin(test_seasons))&(data['SEASON']!=this_season)].copy()
    ytest = test[target].copy()
    ytrain = train[target].copy()
    xtest = test.drop(drop_cols,axis=1)
    xtrain = train.drop(drop_cols,axis=1)
    
    print("Returning: Train, Test, X Train, X Test, Y Train, Y Test, and the Upcoming Season")
    
    return train,test,xtrain,xtest,ytrain,ytest,current

In [16]:
train, test, x_train, x_test, y_train, y_test, upcoming = modeling_data_setup(df,[2022,2021],2023,'cat')

columns to drop:
['TEAM_SEASON', 'TEAM_ID', 'TEAM_NAME', 'PO_WINS', 'PO_WINS_CAT', 'SEASON']
Model Type: cat
Creating SEASON for splitting
Dropping this season: 2023
Returning: Train, Test, X Train, X Test, Y Train, Y Test, and the Upcoming Season


In [17]:
id_cols = ['TEAM_SEASON','TEAM_ID','TEAM_NAME','SEASON']
train_ids = train[id_cols].copy()
test_ids = test[id_cols].copy()

In [18]:
test

Unnamed: 0,TEAM_SEASON,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,HEIGHT_INCHES,WEIGHT,EXP,LS_WIN_PCT,PO_WINS,TEAM_ID,TEAM_NAME,PO_WINS_CAT,SEASON
2,1610612737_2021-22,19.359828,0.527291,0.280572,0.134667,0.223747,0.53592,0.260534,0.126992,0.220961,78.352941,212.882353,4.411765,0.569,1.0,1610612737,Hawks,R1/Lottery,2021
3,1610612737_2022-23,19.634129,0.515657,0.252946,0.140765,0.214893,0.53273,0.251615,0.131051,0.222704,78.352941,212.705882,2.823529,0.524,2.0,1610612737,Hawks,R1/Lottery,2022
7,1610612738_2021-22,16.735291,0.518181,0.267986,0.141405,0.220962,0.543561,0.296705,0.137863,0.224751,78.588235,220.411765,3.411765,0.5,14.0,1610612738,Celtics,Finals,2021
8,1610612738_2022-23,18.103004,0.528067,0.247259,0.156329,0.202154,0.56431,0.304635,0.121419,0.206043,79.0,224.941176,5.705882,0.622,11.0,1610612738,Celtics,Conf. Finals,2022
12,1610612739_2021-22,16.866016,0.503766,0.286899,0.1563,0.212856,0.55173,0.238183,0.14274,0.214377,78.411765,215.764706,4.176471,0.306,0.0,1610612739,Cavaliers,R1/Lottery,2021
13,1610612739_2022-23,21.265765,0.52013,0.251899,0.144185,0.228387,0.526753,0.241594,0.134181,0.226984,78.176471,218.588235,4.882353,0.537,1.0,1610612739,Cavaliers,R1/Lottery,2022
17,1610612740_2021-22,16.055873,0.515607,0.291061,0.142382,0.216543,0.533346,0.28884,0.137723,0.210508,77.647059,213.411765,3.882353,0.431,2.0,1610612740,Pelicans,R1/Lottery,2021
18,1610612740_2022-23,20.464109,0.513665,0.289696,0.141893,0.26913,0.54967,0.260502,0.144036,0.213243,78.588235,214.058824,4.058824,0.439,0.0,1610612740,Pelicans,R1/Lottery,2022
22,1610612741_2021-22,15.791119,0.533892,0.248171,0.145705,0.232035,0.526181,0.278566,0.140384,0.204177,78.529412,216.647059,4.0,0.431,1.0,1610612741,Bulls,R1/Lottery,2021
23,1610612741_2022-23,19.057008,0.532817,0.329814,0.136651,0.210463,0.551151,0.282253,0.136631,0.224108,78.058824,212.0,4.705882,0.561,0.0,1610612741,Bulls,R1/Lottery,2022


## Modeling

In [19]:
# basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# for feature reduction 
from sklearn import feature_selection
from sklearn import pipeline

# for the modeling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# for the evaluation 
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy.stats import ks_2samp

# getting rid of the warnings 
import warnings
warnings.filterwarnings('ignore')

In [20]:
logreg = LogisticRegression()

In [21]:
# parameter grid
parameters = {
    'max_iter': [100,1000,2000],
    'class_weight': [None],
    'penalty': ['l1','l2'], 
    'C': np.linspace(start=.01,stop=1,num=50,endpoint=True),
    'solver': ['liblinear','saga']   
}

clf = GridSearchCV(logreg, # model
                   param_grid = parameters, # hyperparameters
                   scoring='f1_macro', # metric for scoring
                   cv=5, # number of folds
                   n_jobs=-1,) # parallel

In [22]:
# Fitting the model 
clf.fit(x_train,y_train)

print("Tuned Hyperparameters :", clf.best_params_)
print("Best Score :",clf.best_score_)

Tuned Hyperparameters : {'C': 0.333265306122449, 'class_weight': None, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score : 0.3598690353334316


In [23]:
# storing the CV results 
cv_results = pd.DataFrame(clf.cv_results_)

## Evaluation

### Train 

In [24]:
predictions = clf.predict(x_train)

In [25]:
matrix = confusion_matrix(y_train,predictions)

In [26]:
pd.crosstab(y_train, predictions, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,R1/Lottery,Second Round,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Champions,1,1,2
Conf. Finals,3,1,4
Finals,2,0,2
R1/Lottery,43,1,44
Second Round,6,2,8
All,55,5,60


In [27]:
report = classification_report(y_train, predictions)
print(report)

              precision    recall  f1-score   support

   Champions       0.00      0.00      0.00         2
Conf. Finals       0.00      0.00      0.00         4
      Finals       0.00      0.00      0.00         2
  R1/Lottery       0.78      0.98      0.87        44
Second Round       0.40      0.25      0.31         8

    accuracy                           0.75        60
   macro avg       0.24      0.25      0.24        60
weighted avg       0.63      0.75      0.68        60



In [28]:
f1_micro = metrics.f1_score(y_train,predictions,average='micro')
f1_macro = metrics.f1_score(y_train,predictions,average='macro')
f1_weighted = metrics.f1_score(y_train,predictions,average='weighted')

In [33]:
y_pred_proba = clf.predict_proba(x_train)[::,1]

In [34]:
train_output = pd.DataFrame({'Target':y_train,'prob':y_pred_proba})
metrics_list = ['F1 Micro','F1 Macro','F1 Weighted']
values = [f1_micro,f1_macro,f1_weighted]
train_results = pd.DataFrame({'METRIC':metrics_list,'VALUE':values})
train_results = train_results.transpose().reset_index(drop=True)
train_results.columns = train_results.iloc[0,:]
train_results = train_results.tail(1)

In [35]:
train_output.head()

Unnamed: 0,Target,prob
0,R1/Lottery,0.066443
1,Conf. Finals,0.154801
5,Conf. Finals,0.077517
6,R1/Lottery,0.061295
10,R1/Lottery,0.077675


In [36]:
train_results

Unnamed: 0,F1 Micro,F1 Macro,F1 Weighted
1,0.75,0.235276,0.678063


### Test 

In [37]:
predictions = clf.predict(x_test)

In [38]:
predictions

array(['R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'Second Round',
       'Second Round', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'Second Round', 'R1/Lottery', 'R1/Lottery',
       'Second Round', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery',
       'R1/Lottery', 'R1/Lottery', 'R1/Lottery', 'R1/Lottery'],
      dtype=object)

In [40]:
matrix = confusion_matrix(y_test,predictions)

In [41]:
pd.crosstab(y_test, predictions, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,R1/Lottery,Second Round,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Champions,2,0,2
Conf. Finals,4,0,4
Finals,2,0,2
R1/Lottery,40,4,44
Second Round,8,0,8
All,56,4,60


In [42]:
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

   Champions       0.00      0.00      0.00         2
Conf. Finals       0.00      0.00      0.00         4
      Finals       0.00      0.00      0.00         2
  R1/Lottery       0.71      0.91      0.80        44
Second Round       0.00      0.00      0.00         8

    accuracy                           0.67        60
   macro avg       0.14      0.18      0.16        60
weighted avg       0.52      0.67      0.59        60



In [43]:
f1_micro = metrics.f1_score(y_test,predictions,average='micro')
f1_macro = metrics.f1_score(y_test,predictions,average='macro')
f1_weighted = metrics.f1_score(y_test,predictions,average='weighted')

In [44]:
y_pred_proba = clf.predict_proba(x_test)[::,1]

In [45]:
test_output = pd.DataFrame({'Target':y_test,'prob':y_pred_proba})
metrics_list = ['F1 Micro','F1 Macro','F1 Weighted']
values = [f1_micro,f1_macro,f1_weighted]
test_results = pd.DataFrame({'METRIC':metrics_list,'VALUE':values})
test_results = test_results.transpose().reset_index(drop=True)
test_results.columns = test_results.iloc[0,:]
test_results = test_results.tail(1)

In [46]:
test_output.head()

Unnamed: 0,Target,prob
2,R1/Lottery,0.0605
3,R1/Lottery,0.058512
7,Finals,0.043786
8,Conf. Finals,0.065922
12,R1/Lottery,0.041323


In [47]:
test_results

Unnamed: 0,F1 Micro,F1 Macro,F1 Weighted
1,0.666667,0.16,0.586667


### Upcoming Season 

In [52]:
x_up = upcoming[clf.best_estimator_.feature_names_in_]
x_up.head()

Unnamed: 0,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,HEIGHT_INCHES,WEIGHT,EXP,LS_WIN_PCT
4,17.248412,0.555474,0.240233,0.135582,0.279755,0.537575,0.270382,0.132214,0.209344,78.352941,212.705882,3.823529,0.5
9,18.70072,0.567396,0.221128,0.139701,0.204178,0.548612,0.247778,0.127512,0.215784,79.0,224.941176,6.647059,0.695
14,16.507773,0.530249,0.235302,0.161822,0.22779,0.524138,0.278149,0.153087,0.218486,78.176471,218.588235,5.882353,0.622
19,17.913739,0.524626,0.296855,0.137225,0.23927,0.539656,0.264343,0.142655,0.217808,78.588235,214.058824,5.0,0.512
24,19.072808,0.560886,0.257995,0.122774,0.212824,0.535775,0.251967,0.159039,0.203952,78.058824,212.0,5.588235,0.488


In [53]:
df['PO_WINS_CAT'].value_counts()

R1/Lottery      88
Second Round    16
Conf. Finals     8
Finals           4
Champions        4
Name: PO_WINS_CAT, dtype: int64

In [56]:
predictions = clf.predict(x_up)
y_pred_proba = clf.predict_proba(x_up)[::,1]
final_output = x_up.copy() 
final_output['Predicted Round'] = predictions
final_output['Probability'] = y_pred_proba
final_output['Team'] = upcoming['TEAM_NAME']
def sorter(x):
    if x == 'R1/Lottery':
        return 1
    elif x == 'Second Round':
        return 2
    elif x == 'Conf. Finals': 
        return 3
    elif x == 'Finals':
        return 4
    elif x == 'Champions':
        return 6
final_output['SORT_COL'] = [sorter(x) for x in final_output['Predicted Round']]
final_output.sort_values('SORT_COL',inplace=True)
final_output[['Team','Predicted Round','Probability']]

Unnamed: 0,Team,Predicted Round,Probability
4,Hawks,R1/Lottery,0.040581
139,Wizards,R1/Lottery,0.035561
134,Grizzlies,R1/Lottery,0.085795
129,Jazz,R1/Lottery,0.045453
124,Raptors,R1/Lottery,0.045344
119,Thunder,R1/Lottery,0.033889
114,Spurs,R1/Lottery,0.046099
109,Kings,R1/Lottery,0.025984
104,Trail Blazers,R1/Lottery,0.048029
94,76ers,R1/Lottery,0.067015
