In [1]:

import pandas as pd
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.library.parameters import Season
from nba_api.stats.library.parameters import SeasonType
from nba_api.stats.static import teams
from nba_api.stats.endpoints import playbyplay
import numpy as np
import time
import random
from statistics import multimode
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [2]:
def create_player_ids(csv_location):
    '''
    Input: Location of Aggregated Sixth Man Data
    Output: DataFrame with randomized Player ID's, and the name, team, and season removed.
    '''
    df = pd.read_csv(csv_location)
    df = df.sample(frac=1, random_state=10).reset_index(drop=True)
    df.index.name = 'Player ID'
    return df.drop(['PLAYER_NAME', 'Team', 'Season'],axis=1)
    

df = create_player_ids('data/sixth_men_season_stats.csv')

In [3]:
def train_test_csvs(df):
    '''
    Input: Sixth Man df with player ID's
    Output in order: train_df, test_df, solution_df
    '''
    y = df['WL']
    X = df.drop(['WL'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    train_df = X_train.merge(y_train, right_index=True, left_index=True)
    return train_df, X_test, y_test
    
train_df, test_df, solution_df = train_test_csvs(df)


In [80]:
def save_csvs(train_df, test_df, solution_df, example_df):
    train_df.to_csv('Kaggle_Competition_Materials/Available_CSVs/train.csv')
    test_df.to_csv('Kaggle_Competition_Materials/Available_CSVs/test.csv')
    example_df.to_csv('Kaggle_Competition_Materials/Available_CSVs/example_submission.csv')
    solution_df.to_csv('Kaggle_Competition_Materials/Hidden_CSV/solutions.csv')
save_csvs(train_df, test_df, solution_df, example_df)

In [96]:
solution_df

Player ID
429    0.269231
23     0.585714
189    0.636364
47     0.492063
361    0.604938
         ...   
88     0.682927
363    0.608108
360    0.719512
394    0.512500
262    0.222222
Name: WL, Length: 137, dtype: float64

In [78]:
def generate_example_submission(solution_df):
    random.seed(10)
    example_df = solution_df.to_frame()
    random_values = [random.uniform(0,1) for x in range(len(solution_df))]
    example_df['WL'] = random_values
    return example_df
example_df = generate_example_submission(solution_df)



In [33]:
test_list = []
season_list = ['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', \
              '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', \
              '2018-19', '2019-20', '2020-21', '2021-22']


In [12]:
gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=1610612754,
                            season_nullable=Season.default,
                            season_type_nullable=SeasonType.regular)  

games_dict = gamefinder.get_normalized_dict()
games = games_dict['LeagueGameFinderResults']
game = games[0]
game_id = game['GAME_ID']
df = playbyplay.PlayByPlay(game_id).get_data_frames()[0]
df.head()
df = df[df['EVENTMSGTYPE'] == 8]
# df = df[df['HOMEDESCRIPTION'].notnull()].reset_index(drop=True)
df

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,SCORE,SCOREMARGIN
41,22200351,63,8,0,1,9:21 PM,7:18,SUB: Winslow FOR Nurkic,,,,
42,22200351,64,8,0,1,9:21 PM,7:18,SUB: Eubanks FOR Simons,,,,
46,22200351,72,8,0,1,9:22 PM,7:04,,,SUB: Mathurin FOR Hield,,
47,22200351,73,8,0,1,9:22 PM,7:04,,,SUB: Jackson FOR Turner,,
63,22200351,97,8,0,1,9:27 PM,5:43,,,SUB: Johnson FOR Smith,,
75,22200351,114,8,0,1,9:29 PM,4:36,SUB: Sharpe FOR Hart,,,,
76,22200351,115,8,0,1,9:29 PM,4:36,,,SUB: Hield FOR Nembhard,,
77,22200351,116,8,0,1,9:29 PM,4:36,,,SUB: Brissett FOR Nesmith,,
85,22200351,130,8,0,1,9:31 PM,4:19,SUB: Watford FOR Grant,,,,
97,22200351,150,8,0,1,9:34 PM,2:50,SUB: Simons FOR Lillard,,,,


In [20]:
pd.DataFrame({'PlayerID': [100,101,102], 'Win Percentage': ['.450', '.428', '.557']})


Unnamed: 0,PlayerID,Win Percentage
0,100,0.45
1,101,0.428
2,102,0.557


In [2]:
def get_team_and_year(nba_teams, season_list, sixth_man_df_list):
    # IF the Df exists
    if len(sixth_man_df_list) == 1:
        df = sixth_man_df_list[0]
        last_team = df.loc[len(df) - 1, 'Team']
        last_season = df.loc[len(df) - 1, 'Season']
        # Get team_list
        for i in range(len(nba_teams)):
            if last_team == nba_teams[i]['full_name']:
                used_team_list = nba_teams[i:]
        # Get correct season_list and update team_list if need be
        if last_season == '2021-22':
            used_season_list = season_list
            used_team_list = used_team_list[1:]
        else: 
            correct_index = season_list.index(last_season)
            used_season_list = season_list[(correct_index+1):]
    # If df does not exist
    else: 
        used_season_list = season_list
        used_team_list = nba_teams

    return used_season_list, used_team_list
 



In [4]:
def linear_prediction_benchmark(train_df, test_df, solution_df):
    y = train_df['WL']
    X = train_df.drop(['WL'], axis=1)
    reg = LinearRegression().fit(X, y)
    test_predictions = reg.predict(test_df)
    return mean_absolute_error(solution_df, test_predictions)
linear_prediction_benchmark(train_df, test_df, solution_df)

0.06624659699198103

In [28]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
X_test = test_df
mlp = MLPRegressor(random_state=1, max_iter=10000)
y_train = train_df['WL']
X_train = train_df.drop(['WL'], axis=1)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
mlp.fit(X_train_scaled, y_train)
test_predictions = mlp.predict(X_test_scaled)
mean_absolute_error(solution_df, test_predictions)




0.07244689323334369

In [32]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

param_grid = {
    'hidden_layer_sizes': [(150,100,50), (120,80,40), (100,50,30)],
    'max_iter': [50, 100],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

grid = GridSearchCV(MLPRegressor(), param_grid, n_jobs= -1, cv=5)
grid.fit(X_train_scaled, y_train)
print(grid.best_params_) 

{'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (150, 100, 50), 'learning_rate': 'adaptive', 'max_iter': 100, 'solver': 'adam'}


In [175]:

numbers = [0, .005, .01, .015, .02, .025, .03, .035, .04, .045, .05]
for i in numbers: 
    xgb_r = xg.XGBRegressor(objective ='reg:squarederror',
                      n_estimators = 20, seed = 109)
    xgb_r.fit(X_train, y_train)
    xgb_predictions = xgb_r.predict(X_test)
    importances = xgb_r.feature_importances_
    importances_mask = [True if importance > i else False for importance in importances]
    X_train_masked = X_train.loc[:, importances_mask]
    X_test_masked = X_test.loc[:, importances_mask]
    print(len(X_train_masked.columns))

    # Scale Data
    scaler = MinMaxScaler()
    X_train_masked_scaled = scaler.fit_transform(X_train_masked)
    X_test_masked_scaled = scaler.transform(X_test_masked)

    # Linear Regression
    model = LinearRegression()
    model.fit(X_train_masked_scaled, y_train)
    linear_predictions = model.predict(X_test_masked_scaled)
    print('Linear Regression: ' + str(mean_absolute_error(solution_df, linear_predictions)))

    # After choosing important features
    model = SVR()
    model.fit(X_train_masked_scaled, y_train)
    rf_predictions = model.predict(X_test_masked_scaled)
    print('SVM: ' + str(mean_absolute_error(solution_df, rf_predictions)))

    param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
    grid.fit(X_train_masked_scaled,y_train)
    grid_predictions = grid.predict(X_test_masked_scaled)
    print('SVM Tuned: ' + str(mean_absolute_error(solution_df, grid_predictions)))

20
Linear Regression: 0.06624659699198086
SVM: 0.07256182902792031
SVM Tuned: 0.06384150846074034
20
Linear Regression: 0.06624659699198086
SVM: 0.07256182902792031
SVM Tuned: 0.06835770363938705
19
Linear Regression: 0.0655765632940521
SVM: 0.07288187991579594
SVM Tuned: 0.06952955815655668
19
Linear Regression: 0.0655765632940521
SVM: 0.07288187991579594
SVM Tuned: 0.06596145583351896
12
Linear Regression: 0.06511390762677932
SVM: 0.07487391834610378




SVM Tuned: 0.06832618596995493
8
Linear Regression: 0.06548978905646279
SVM: 0.07412277668208755
SVM Tuned: 0.06501730353178406
6
Linear Regression: 0.0654836501350644
SVM: 0.07101805095571592
SVM Tuned: 0.06672591501640254
3
Linear Regression: 0.06607923761896843
SVM: 0.06946870918194482
SVM Tuned: 0.06482995954426546
3
Linear Regression: 0.06607923761896843
SVM: 0.06946870918194482
SVM Tuned: 0.06671138611462071
3
Linear Regression: 0.06607923761896843
SVM: 0.06946870918194482
SVM Tuned: 0.06593931093332225
1
Linear Regression: 0.06711656917277282
SVM: 0.06434227707275138
SVM Tuned: 0.06520749433256534


In [173]:
import xgboost as xg
import statistics
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

xgb_r = xg.XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 20, seed = 109)
xgb_r.fit(X_train, y_train)
xgb_predictions = xgb_r.predict(X_test)
importances = xgb_r.feature_importances_
importances_mask = [True if importance > statistics.mean(importances) else False for importance in importances]
X_train_masked = X_train.loc[:, importances_mask]
X_test_masked = X_test.loc[:, importances_mask]
print(len(X_train_masked.columns))

# Scale Data
scaler = MinMaxScaler()
X_train_masked_scaled = scaler.fit_transform(X_train_masked)
X_test_masked_scaled = scaler.transform(X_test_masked)

# Linear Regression
model = LinearRegression()
model.fit(X_train_masked_scaled, y_train)
linear_predictions = model.predict(X_test_masked_scaled)
print('Linear: ' + str(mean_absolute_error(solution_df, linear_predictions)))

# After choosing important features
model = SVR()
model.fit(X_train_masked_scaled, y_train)
rf_predictions = model.predict(X_test_masked_scaled)
print('SVM: ' + str(mean_absolute_error(solution_df, rf_predictions)))

# Hyperparameter Tuning
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid.fit(X_train_masked_scaled,y_train)
grid_predictions = grid.predict(X_test_masked_scaled)
print('SVM Tuned: ' + str(mean_absolute_error(solution_df, grid_predictions)))

1
SVR: 0.06711656917277282
SVR: 0.06434227707275138
SVR Tuned: 0.06738080353679402


In [None]:
# Best Results: 
# Linear Regression with feature selection using XGboost: 0.0645401350863056
# MLPRegressor Neural Network with feature selection, scaling and hyperparameter tuning: 0.06493841239385763
# SVM: 0.06434227707275138


In [133]:
rf_best = RandomForestRegressor(n_estimators = 100, min_samples_split = 10, 
 min_samples_leaf= 4, max_features = 'auto', max_depth= 50, bootstrap=True)
rf.fit(X_train_masked_scaled, y_train)
predictions = rf.predict(X_test_masked_scaled)
mean_absolute_error(solution_df, predictions)


0.07460179904630126

In [33]:
grid_predictions = grid.predict(X_test_scaled) 
mean_absolute_error(solution_df, grid_predictions)
mean_absolute_error(solution_df, rf_predictions)

0.06799657753641032

In [103]:
n_estimators = [5,20,50,100] # number of trees in the random forest
max_features = ['auto', 'sqrt'] # number of features in consideration at every split
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}

In [111]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
               n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      120],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4],
                                        'min_samples_split': [2, 6, 10],
                                        'n_estimators': [5, 20, 50, 100]},
                   random_state=35, verbose=2)

In [115]:
print ('Random grid: ', random_grid, '\n')
# print the best parameters
print ('Best Parameters: ', rf_random.best_params_, ' \n')

randmf = RandomForestRegressor(n_estimators = 100, min_samples_split = 10, 
 min_samples_leaf= 4, max_features = 'auto', max_depth= 100, bootstrap=True) 

Random grid:  {'n_estimators': [5, 20, 50, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4], 'bootstrap': [True, False]} 

Best Parameters:  {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 50, 'bootstrap': True}  



In [21]:
test_list = [['Maloney'], ['Henderson'], ['Bowdler'], ['Mottola'], ['Mottola'], ['Mottola'], ['Wright'], ['Wright', 'Mottola', 'Maloney'], ['Wright', 'Mottola'], ['Wright'], ['Maloney'], ['Wright'], ['Glover'], ['Mottola', 'Maloney'], ['Wright'], ['Mottola', 'Maloney'], ['Crawford', 'Maloney'], ['Henderson'], ['Glover', 'Henderson'], ['Sanchez'], ['Mohammed', 'Johnson'], ['Johnson'], ['Mohammed', 'Kukoc'], ['Sanchez'], ['Mohammed', 'Kukoc'], ['Johnson'], ['Mottola', 'Glover', 'Mohammed'], ['Bowdler', 'Mottola'], ['Robinson', 'Henderson'], ['Henderson'], ['Glover'], ['Crawford'], ['Crawford'], ['McLeod', 'Henderson'], ['Crawford'], ['Crawford'], ['Smith'], ['Knight'], ['Henderson'], ['McLeod'], ['McLeod'], ['Crawford', 'Smith', 'Henderson'], ['Crawford'], ['Glover'], ['Colson', 'Glover', 'Henderson'], ['Henderson'], ['Glover'], ['Glover', 'Maloney'], ['Henderson'], ['Knight', 'Glover', 'Henderson'], ['Knight', 'Mottola'], ['Johnson'], ['Jackson'], ['Jackson', 'A. Johnson'], ['Mottola', 'Jackson'], ['Robinson'], ['Mottola', 'A. Johnson'], ['D. Johnson'], ['Mottola', 'D. Johnson'], ['McLeod'], ['Robinson'], ['Wright'], ['Robinson'], ['Mottola', 'Wright'], ['Robinson', 'Wright'], ['Mottola', 'Wright'], ['Wright'], ['D. Johnson'], ['D. Johnson'], ['A. Johnson', 'Wright'], ['Wright'], ['Terry', 'Wright', 'Mottola'], ['Wright'], ['Robinson'], ['Henderson'], ['A. Johnson', 'D. Johnson'], ['Wright'], ['Maloney'], ['Wright'], ['D. Johnson', 'A. Johnson'], ['Crawford'], ['Crawford']]

In [82]:
from Kaggle_Competition_Materials.Kaggle_Comp import generate_all_csvs
generate_all_csvs()

In [3]:
from sixthman import sixth_man_main
for i in range(100):
  for attempt in range(10):
    try:
      sixth_man_main()
    except:
      continue
    else:
      break
  else:
    print('Failed too many Times')
