# Part III: Model Development

Key Points:
1. Training dataset will be historical data from 2016-2021
2. Test dataset (for now) will be data from 2022 season
3. Models to Try:
   - Logistic Regression
   - K-Nearest Neighbors
   - Naive Bayes Classifier
   - LightGBM and XGBoost

---

### 1) Imports

In [1]:
pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.1.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Using cached lightgbm-4.1.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.1.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl (297.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
#pip install import-ipynb

In [10]:
import III_feature_engineering as III

In [11]:
from importlib import reload
reload(III);

In [87]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import lightgbm as lgb
#from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier as RF

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler as SS
from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import roc_auc_score,accuracy_score,roc_curve
from sklearn.metrics import confusion_matrix, classification_report

In [6]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

In [7]:
import warnings
# NOTE: with no category/module filter this silences every warning for the rest
# of the session, so any convergence or deprecation warnings emitted by the
# libraries used below will not be shown.
warnings.filterwarnings('ignore')

In [8]:
# Read the cleaned data set produced by Notebook II
nba_clean_data = pd.read_csv(
    '../data/cleaned_nba_data.csv'
)

---
### 2) Create Training and Testing Sets

In [76]:

def train_test_split(data,
                     model_vars,
                     year,
                     num_games_list,
                     remove_placeholder_target = False,
                     compute_avg = False,
                     compute_rolling_avg = False,
                     drop_na_rows=False
                    ):
    '''
    Split the NBA dataset by season and return training/testing features and targets.

    Rows whose 'season' equals `year` form the test set; every other season
    forms the training set. The input frame is not modified.

    Parameters
    ----------
    data : DataFrame containing at least the columns 'team', 'season', 'date',
        'target' and every column listed in `model_vars`.
    model_vars : list of column names used as model features.
    year : season held out as the test set.
    num_games_list : window sizes forwarded to III.create_rolling_averages;
        only used when `compute_rolling_avg` is True (may be None otherwise).
    remove_placeholder_target : if True, drop rows where target == -1
        (i.e. last game of the season, so no outcome) before splitting.
    compute_avg : if True, build the features with III.create_averages.
        Takes precedence over `compute_rolling_avg` when both are True.
    compute_rolling_avg : if True (and `compute_avg` is False), build the
        features with III.create_rolling_averages.
    drop_na_rows : if True, drop every feature row containing a NaN together
        with the matching target row. Only applied on the rolling-average path.

    Returns
    -------
    (train_data, train_target, test_data, test_target)
    '''
    if remove_placeholder_target:
        target_indices_drop = data[data['target'] == -1].index # Remove rows where target = -1 (i.e. last game of the season), so no outcome
        new_data = data.drop(target_indices_drop)
    else:
        # Previously `new_data` was left undefined on this path, so calling the
        # function with the default flag raised an error on the sort below.
        new_data = data

    # Subset training and testing data by year
    new_data = new_data.sort_values(['team','season','date']).reset_index(drop=True)
    new_data = new_data.sort_values(['season','date']).reset_index(drop=True) # Weird sorting to make sure train and test targets match up

    nba_train = new_data[new_data['season'] != year].reset_index(drop=True)
    nba_test = new_data[new_data['season'] == year].reset_index(drop=True)

    train_target = nba_train['target'] # Subset the targets before potentially computing rolling_averages
    test_target = nba_test['target']

    if compute_avg:
        train_data = III.create_averages(nba_train,
                                         model_vars,
                                         train_only=True,
                                         include_target=False
                                        )
        test_data = III.create_averages(nba_test,
                                        model_vars,
                                        train_only=True,
                                        include_target=False)

    elif compute_rolling_avg:
        train_data = III.create_rolling_averages(nba_train,
                                             model_vars,
                                             num_games_list,
                                             train_only=True
                                            )
        test_data = III.create_rolling_averages(nba_test,
                                           model_vars,
                                           num_games_list,
                                           train_only=True
                                           )

        if drop_na_rows: # For certain algorithms that don't recognize NaN values (i.e. first rows of each team)
            nan_train_idx = train_data.index[train_data.isna().any(axis=1)].to_list()
            nan_test_idx = test_data.index[test_data.isna().any(axis=1)].to_list()

            train_data = train_data.drop(nan_train_idx).reset_index(drop=True)
            test_data = test_data.drop(nan_test_idx).reset_index(drop=True)

            # Need to drop the same row indices for target
            train_target = train_target.drop(nan_train_idx).reset_index(drop=True)
            test_target = test_target.drop(nan_test_idx).reset_index(drop=True)

    else:
        # No feature engineering requested: use the raw model columns as-is
        train_data = nba_train[model_vars]
        test_data = nba_test[model_vars]

    return train_data, train_target, test_data, test_target

### 2b) Establish Training and Testing Data

In [18]:
# Check that the helpers imported from the other .py file work
model_variables = III.create_model_variables(nba_clean_data)  # Establish model variables

# Create a new dataframe with two different rolling averages
nba_transformed = III.create_rolling_averages(
    nba_clean_data,
    model_variables,
    num_games_list=[3, 5],
    train_only=False,
)

In [37]:
# Establish training and testing sets (3-, 5- and 10-game rolling averages)
model_variables = III.create_model_variables(nba_clean_data) # Establish model variables

# The flags are passed by keyword on purpose: passed positionally, the third
# boolean bound to `compute_avg`, which takes precedence over the rolling
# averages, so `num_games_list` was ignored and NaN rows were never dropped.
# Logistic regression needs the NaN warm-up rows removed, hence drop_na_rows.
X_train, y_train, X_test, y_test = train_test_split(nba_clean_data,
                                                    model_variables,
                                                    2022,
                                                    [3,5,10],
                                                    remove_placeholder_target=True,
                                                    compute_rolling_avg=True,
                                                    drop_na_rows=True)


### 3) Baseline Logistic Regression Model

1) Trial 1: Baseline (max_iter = 5000)
    - Parameters: 2 rolling averages (3 and 5 games)
    - Train Accuracy = 0.53
    - Test Accuracy = 0.52
    - Precision (Class 1) = 0.51
    - Recall (Class 1) = 0.61
    - F1-Score = 0.56

2) Trial 2: Only 5 game average:
    - Train = 0.52
    - Test = 0.49
    - Precision = 0.49
    - Recall = 0.64
    - F1-Score = 0.55

3) Trial 3: 3,5,10 game averages:
    - Train = 0.54
    - Test = 0.52
    - Precision = 0.52
    - Recall = 0.61
    - F1-Score = 0.56


In [38]:
# Baseline logistic regression trained on standardized features
ss = SS()
ss.fit(X_train)                   # scaling statistics are learned from the training features only
train_s = ss.transform(X_train)
test_s = ss.transform(X_test)

log_reg = LR(max_iter=5000, random_state=0)  # baseline model
log_reg.fit(train_s, y_train)

# [train score, test score]
[log_reg.score(features, labels) for features, labels in ((train_s, y_train), (test_s, y_test))]

[0.534969233078193, 0.5191815856777494]

In [39]:
# Class predictions for both splits; the report is printed for the test split
train_pred, test_pred = (log_reg.predict(features) for features in (train_s, test_s))

report = classification_report(y_test, test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.52      0.43      0.47      1175
           1       0.52      0.61      0.56      1171

    accuracy                           0.52      2346
   macro avg       0.52      0.52      0.52      2346
weighted avg       0.52      0.52      0.52      2346



---
### 4) Logistic Regression with Consecutive Averages

In [81]:
model_variables = III.create_model_variables(nba_clean_data)  # Establish model variables
nba_transformed = III.create_averages(
    nba_clean_data,
    model_variables,
    train_only=True,
    include_target=False,
)

# Split by season with features built by III.create_averages
X_train, y_train, X_test, y_test = train_test_split(
    nba_clean_data,
    model_variables,
    2022,
    None,  # num_games_list is not used on this path
    remove_placeholder_target=True,
    compute_avg=True,
    compute_rolling_avg=False,
    drop_na_rows=False,
)


In [83]:
# Logistic regression on standardized features
ss = SS()
ss.fit(X_train)                   # scaling statistics are learned from the training features only
train_s = ss.transform(X_train)
test_s = ss.transform(X_test)

log_reg = LR(max_iter=5000, random_state=0)
log_reg.fit(train_s, y_train)

# [train score, test score]
[log_reg.score(features, labels) for features, labels in ((train_s, y_train), (test_s, y_test))]

[0.5102368526696106, 0.5107033639143731]

In [84]:
# Class predictions for both splits; the report is printed for the test split
train_pred, test_pred = (log_reg.predict(features) for features in (train_s, test_s))

report = classification_report(y_test, test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.51      0.55      0.53      1308
           1       0.51      0.47      0.49      1308

    accuracy                           0.51      2616
   macro avg       0.51      0.51      0.51      2616
weighted avg       0.51      0.51      0.51      2616



---
### 5) K-Nearest Neighbors

In [85]:
# Re-fit the scaler on the current training features, then scale both splits
ss.fit(X_train)
train_s = ss.transform(X_train)
test_s = ss.transform(X_test)

k = 13
knn = KNN(n_neighbors=k)
knn.fit(train_s, y_train)  # fit the k-nearest-neighbors classifier
#print(knn.score(train_s,y_train),knn.score(test_s, y_test))

test_pred = knn.predict(test_s)
report = classification_report(y_test, test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.50      0.52      0.51      1308
           1       0.50      0.48      0.49      1308

    accuracy                           0.50      2616
   macro avg       0.50      0.50      0.50      2616
weighted avg       0.50      0.50      0.50      2616



---
### 6) LightGBM Classifier

In [88]:
# LightGBM classifier with depth-2 trees, fitted on the unscaled features
lgb_classifier = lgb.LGBMClassifier(
    num_leaves=10,
    max_depth=2,
    random_state=0,
    verbose=-1,
)
lgb_classifier.fit(X_train, y_train)

# [train score, test score]
print([lgb_classifier.score(features, labels)
       for features, labels in ((X_train, y_train), (X_test, y_test))])

test_pred = lgb_classifier.predict(X_test)
print(classification_report(y_test, test_pred))

[0.5501137428074401, 0.518348623853211]
              precision    recall  f1-score   support

           0       0.52      0.55      0.53      1308
           1       0.52      0.49      0.50      1308

    accuracy                           0.52      2616
   macro avg       0.52      0.52      0.52      2616
weighted avg       0.52      0.52      0.52      2616



---
## Part II: Combine Features

Note: Need to make more functions for train test split

In [101]:
model_variables = III.create_model_variables(nba_clean_data)  # Establish model variables
nba_transformed = III.create_averages(
    nba_clean_data,
    model_variables,
    train_only=True,
    include_target=False,
)

# Split by season with 3- and 5-game rolling-average features; NaN rows are kept
X_train, y_train, X_test, y_test = train_test_split(
    nba_clean_data,
    model_variables,
    2022,
    [3, 5],
    remove_placeholder_target=True,
    compute_avg=False,
    compute_rolling_avg=True,
    drop_na_rows=False,
)


In [102]:
# Inspect the rolling-average training features returned by the split above
X_train

Unnamed: 0,fga_avg_L3,fg%_avg_L3,3pa_avg_L3,3p%_avg_L3,fta_avg_L3,ft%_avg_L3,orb_avg_L3,trb_avg_L3,ast_avg_L3,stl_avg_L3,...,tov_opp_avg_L5,pf_opp_avg_L5,pts_opp_avg_L5,ts%_opp_avg_L5,efg%_opp_avg_L5,3par_opp_avg_L5,drb%_opp_avg_L5,trb%_opp_avg_L5,ortg_opp_avg_L5,drtg_opp_avg_L5
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,82.666667,0.463667,24.666667,0.353667,21.000000,0.755000,7.333333,41.333333,23.666667,9.666667,...,,,,,,,,,,
3,84.666667,0.453333,25.333333,0.335333,20.666667,0.798000,8.000000,44.000000,23.666667,9.666667,...,,,,,,,,,,
4,87.000000,0.421667,24.333333,0.307333,19.333333,0.825000,11.000000,48.000000,23.333333,9.333333,...,16.6,18.8,97.0,0.4958,0.4580,0.3422,79.74,53.34,100.26,102.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14941,92.000000,0.500000,26.666667,0.322000,22.333333,0.809667,10.000000,51.333333,24.333333,5.333333,...,13.2,18.6,113.6,0.5522,0.5042,0.4342,77.84,48.18,109.36,113.80
14942,91.000000,0.524333,23.000000,0.347667,20.333333,0.806333,9.000000,47.333333,25.666667,5.333333,...,12.6,18.0,114.6,0.5464,0.4998,0.4262,77.12,47.10,110.90,114.76
14943,91.000000,0.513333,23.333333,0.330333,23.666667,0.734000,9.000000,44.333333,27.333333,4.666667,...,11.4,18.0,117.6,0.5584,0.5166,0.3882,76.82,48.00,114.30,110.18
14944,92.000000,0.451667,25.666667,0.240000,23.333333,0.731000,11.000000,40.666667,22.000000,4.000000,...,12.2,20.0,122.0,0.5910,0.5598,0.3682,77.02,50.24,118.90,108.06


In [103]:
# Second split over the same data, this time with features from III.create_averages
X_train_2, y_train_2, X_test_2, y_test_2 = train_test_split(
    nba_clean_data,
    model_variables,
    2022,
    None,  # num_games_list is not used on this path
    remove_placeholder_target=True,
    compute_avg=True,
    compute_rolling_avg=False,
    drop_na_rows=False,
)

In [107]:
# Put the two feature sets side by side (column-wise, aligned on the row index)
train = pd.concat([X_train_2, X_train], axis='columns')
test = pd.concat([X_test_2, X_test], axis='columns')

In [108]:
# LightGBM classifier with depth-2 trees, fitted on the combined feature set
lgb_classifier = lgb.LGBMClassifier(
    num_leaves=10,
    max_depth=2,
    random_state=0,
    verbose=-1,
)
lgb_classifier.fit(train, y_train)

# [train score, test score]
print([lgb_classifier.score(features, labels)
       for features, labels in ((train, y_train), (test, y_test))])

test_pred = lgb_classifier.predict(test)
print(classification_report(y_test, test_pred))

[0.5885855747357153, 0.5053516819571865]
              precision    recall  f1-score   support

           0       0.51      0.39      0.44      1308
           1       0.50      0.62      0.56      1308

    accuracy                           0.51      2616
   macro avg       0.51      0.51      0.50      2616
weighted avg       0.51      0.51      0.50      2616

