In [246]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import joblib

#models
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier as ABC

#feature selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

#scalers
from sklearn.preprocessing import StandardScaler

#score function
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [247]:
stats_df = pd.read_csv('../data/NBA_Stats_MVP.csv')
stats_df

Unnamed: 0,Year,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,...,TRB,AST,STL,BLK,TOV,PF,PTS,is_allstar,Name,was_mvp
0,1974.0,C,27.0,HOU,79.0,,2459.0,15.9,0.516,,...,923.0,166.0,80.0,104.0,,227.0,865.0,0,Zaid Abdul-Aziz,0
1,1974.0,C,26.0,MIL,81.0,,3548.0,24.4,0.564,,...,1178.0,386.0,112.0,283.0,,238.0,2191.0,1,Kareem Abdul-Jabbar,1
2,1974.0,SF,26.0,DET,74.0,,2298.0,10.9,0.457,,...,448.0,141.0,110.0,12.0,,242.0,759.0,0,Don Adams,0
3,1974.0,PG,27.0,CHI,55.0,,618.0,10.0,0.447,,...,69.0,56.0,36.0,1.0,,63.0,182.0,0,Rick Adelman,0
4,1974.0,PG,26.0,MIL,72.0,,2388.0,18.8,0.536,,...,291.0,374.0,137.0,22.0,,215.0,1268.0,0,Lucius Allen,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20792,2017.0,PF,24.0,CHO,62.0,58.0,1725.0,16.7,0.604,0.002,...,405.0,99.0,62.0,58.0,65.0,189.0,639.0,0,Cody Zeller,0
20793,2017.0,C,27.0,BOS,51.0,5.0,525.0,13.0,0.508,0.006,...,124.0,42.0,7.0,21.0,20.0,61.0,178.0,0,Tyler Zeller,0
20794,2017.0,C,20.0,ORL,19.0,0.0,108.0,7.3,0.346,0.000,...,35.0,4.0,2.0,5.0,3.0,17.0,23.0,0,Stephen Zimmerman,0
20795,2017.0,SF,22.0,CHI,44.0,18.0,843.0,6.9,0.503,0.448,...,125.0,36.0,15.0,16.0,40.0,78.0,240.0,0,Paul Zipser,0


In [248]:
stats_df.fillna(0, inplace = True)

In [249]:
def per_game(stats_df, cols):
    """
    returns a new data frame version of stats_df, where a per game average is added for each column name 
    contained within cols
    
    :param stats_df: A dataframe containing basketball stats.
    :param cols: A list of column names where per game averages are desired.
    :return: A new dataframe containing per game averages for stats contained within cols. 
    :rtype: Dataframe
    """
    new_df = stats_df.copy()
    for col in cols:
        new_df[col + 'PG'] = stats_df[col]/stats_df['G']
    return new_df

In [250]:
per_game_cols = ['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FGA', 'FG', 'FT', 'FTA', 'MP']
stats_df = per_game(stats_df, per_game_cols)

In [251]:
stats_df

Unnamed: 0,Year,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,...,TRBPG,STLPG,BLKPG,TOVPG,PFPG,FGAPG,FGPG,FTPG,FTAPG,MPPG
0,1974.0,C,27.0,HOU,79.0,0.0,2459.0,15.9,0.516,0.000,...,11.683544,1.012658,1.316456,0.000000,2.873418,9.265823,4.253165,2.443038,3.037975,31.126582
1,1974.0,C,26.0,MIL,81.0,0.0,3548.0,24.4,0.564,0.000,...,14.543210,1.382716,3.493827,0.000000,2.938272,21.716049,11.703704,3.641975,5.185185,43.802469
2,1974.0,SF,26.0,DET,74.0,0.0,2298.0,10.9,0.457,0.000,...,6.054054,1.486486,0.162162,0.000000,3.270270,10.027027,4.094595,2.067568,2.716216,31.054054
3,1974.0,PG,27.0,CHI,55.0,0.0,618.0,10.0,0.447,0.000,...,1.254545,0.654545,0.018182,0.000000,1.145455,3.090909,1.163636,0.981818,1.381818,11.236364
4,1974.0,PG,26.0,MIL,72.0,0.0,2388.0,18.8,0.536,0.000,...,4.041667,1.902778,0.305556,0.000000,2.986111,14.750000,7.305556,3.000000,3.805556,33.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20792,2017.0,PF,24.0,CHO,62.0,58.0,1725.0,16.7,0.604,0.002,...,6.532258,1.000000,0.935484,1.048387,3.048387,7.145161,4.080645,2.145161,3.161290,27.822581
20793,2017.0,C,27.0,BOS,51.0,5.0,525.0,13.0,0.508,0.006,...,2.431373,0.137255,0.411765,0.392157,1.196078,3.098039,1.529412,0.431373,0.764706,10.294118
20794,2017.0,C,20.0,ORL,19.0,0.0,108.0,7.3,0.346,0.000,...,1.842105,0.105263,0.263158,0.157895,0.894737,1.631579,0.526316,0.157895,0.263158,5.684211
20795,2017.0,SF,22.0,CHI,44.0,18.0,843.0,6.9,0.503,0.448,...,2.840909,0.340909,0.363636,0.909091,1.772727,5.022727,2.000000,0.704545,0.909091,19.159091


# Modeling

In [252]:
stats_df = stats_df.drop(columns = ['Pos', 'Name', 'Tm'])
#stats_df = stats_df.drop(columns = ['Pos', 'Name', 'Tm', 'Year'])
X = stats_df.drop(columns = ['was_mvp'])
y = stats_df['was_mvp']

In [253]:
stats_df

Unnamed: 0,Year,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,...,TRBPG,STLPG,BLKPG,TOVPG,PFPG,FGAPG,FGPG,FTPG,FTAPG,MPPG
0,1974.0,27.0,79.0,0.0,2459.0,15.9,0.516,0.000,0.328,11.4,...,11.683544,1.012658,1.316456,0.000000,2.873418,9.265823,4.253165,2.443038,3.037975,31.126582
1,1974.0,26.0,81.0,0.0,3548.0,24.4,0.564,0.000,0.239,8.8,...,14.543210,1.382716,3.493827,0.000000,2.938272,21.716049,11.703704,3.641975,5.185185,43.802469
2,1974.0,26.0,74.0,0.0,2298.0,10.9,0.457,0.000,0.271,6.0,...,6.054054,1.486486,0.162162,0.000000,3.270270,10.027027,4.094595,2.067568,2.716216,31.054054
3,1974.0,27.0,55.0,0.0,618.0,10.0,0.447,0.000,0.447,2.6,...,1.254545,0.654545,0.018182,0.000000,1.145455,3.090909,1.163636,0.981818,1.381818,11.236364
4,1974.0,26.0,72.0,0.0,2388.0,18.8,0.536,0.000,0.258,4.1,...,4.041667,1.902778,0.305556,0.000000,2.986111,14.750000,7.305556,3.000000,3.805556,33.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20792,2017.0,24.0,62.0,58.0,1725.0,16.7,0.604,0.002,0.442,8.6,...,6.532258,1.000000,0.935484,1.048387,3.048387,7.145161,4.080645,2.145161,3.161290,27.822581
20793,2017.0,27.0,51.0,5.0,525.0,13.0,0.508,0.006,0.247,9.2,...,2.431373,0.137255,0.411765,0.392157,1.196078,3.098039,1.529412,0.431373,0.764706,10.294118
20794,2017.0,20.0,19.0,0.0,108.0,7.3,0.346,0.000,0.161,10.8,...,1.842105,0.105263,0.263158,0.157895,0.894737,1.631579,0.526316,0.157895,0.263158,5.684211
20795,2017.0,22.0,44.0,18.0,843.0,6.9,0.503,0.448,0.181,1.9,...,2.840909,0.340909,0.363636,0.909091,1.772727,5.022727,2.000000,0.704545,0.909091,19.159091


In [254]:
stats_df['Year'].unique()
train_years = np.random.choice(a=stats_df['Year'].unique(), size=round(len(stats_df['Year'].unique()) * .7))
train_years

array([1978., 1991., 2013., 1983., 2010., 1996., 1996., 2008., 1998.,
       2007., 1981., 1983., 1981., 1979., 2012., 1993., 1974., 1996.,
       2011., 2013., 1999., 1983., 2007., 1981., 1986., 2016., 2011.,
       1988., 1985., 1974., 2009.])

In [255]:
train = stats_df[stats_df['Year'].isin(train_years)]
test = stats_df[~stats_df['Year'].isin(train_years)]

In [256]:
train = train.drop(columns = ['Year'])
test = test.drop(columns = ['Year'])

In [257]:
X_train = train.drop(columns = ['was_mvp'])
y_train = train['was_mvp']

In [258]:
X_test = test.drop(columns = ['was_mvp'])
y_test = test['was_mvp']

In [259]:
models = {'RandomForestClassifier':{'model':RFC()}, 
        'KNeighborsClassifier':{'model':KNeighborsClassifier()},
         'GradientBoostingClassifier': {'model':GBC()},
         'logistic': {'model':LogisticRegression(max_iter=10000)},
         'adaboost': {'model':ABC()},
         'ScaledRandomForestClassifier':{'model':make_pipeline(StandardScaler(), RFC())}, 
        'ScaledKNeighborsClassifier':{'model':make_pipeline(StandardScaler(), KNeighborsClassifier())},
         'ScaledGradientBoostingClassifier': {'model':make_pipeline(StandardScaler(), GBC())},
         'Scaledlogistic': {'model':make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))},
         'Scaledadaboost': {'model':make_pipeline(StandardScaler(), ABC())}}

In [267]:
def score_models(X_train, y_train, X_test, y_test):
    models = {'RandomForestClassifier':{'model':RFC()}, 
        'KNeighborsClassifier':{'model':KNeighborsClassifier()},
         'GradientBoostingClassifier': {'model':GBC()},
         'logistic': {'model':LogisticRegression(max_iter=10000)},
         'adaboost': {'model':ABC()},
         'ScaledRandomForestClassifier':{'model':make_pipeline(StandardScaler(), RFC())}, 
        'ScaledKNeighborsClassifier':{'model':make_pipeline(StandardScaler(), KNeighborsClassifier())},
         'ScaledGradientBoostingClassifier': {'model':make_pipeline(StandardScaler(), GBC())},
         'Scaledlogistic': {'model':make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))},
         'Scaledadaboost': {'model':make_pipeline(StandardScaler(), ABC())}}
    for model_name in models:
        print(model_name)
        model = models[model_name]['model']
        fitted_model = model.fit(X_train, y_train)
        models[model_name]['train_f1_score'] = f1_score(y_train, fitted_model.predict(X_train), average='macro')
        models[model_name]['train_accuracy_score'] = accuracy_score(y_train, fitted_model.predict(X_train))
        models[model_name]['test_f1_score'] = f1_score(y_test, fitted_model.predict(X_test), average='macro')
        models[model_name]['test_accuracy_score'] = accuracy_score(y_test, fitted_model.predict(X_test))
        models[model_name]['fitted_model'] = fitted_model
        models[model_name]['y_train_pred'] = fitted_model.predict(X_train)
        models[model_name]['y_test_pred'] = fitted_model.predict(X_test) 
        tn, fp, fn, tp = confusion_matrix(y_test, models[model_name]['y_test_pred']).ravel()
        models[model_name]['tp'] = tp
        models[model_name]['fp'] = fp
        models[model_name]['fn'] = fn
        models[model_name]['tn'] = tn
        print(confusion_matrix(y_test, models[model_name]['y_test_pred']))
    return models


In [None]:
models = score_models(X_train, y_train, X_test, y_test)

RandomForestClassifier
[[10800     3]
 [   20     3]]
KNeighborsClassifier
[[10799     4]
 [   21     2]]
GradientBoostingClassifier


In [None]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
#model = models['ScaledGradientBoostingClassifier']['fitted_model']
selector = RFECV(DecisionTreeClassifier(), step=1, cv=5)
selector = selector.fit(X_train, y_train)
selector.support_
selector.ranking_

In [None]:
to_drop = []
for i in range(X_train.shape[1]):
    if selector.support_[i] == True:
        to_drop.append(X_train.iloc[:, i].name)
        print('Column: %s, Selected %s, Rank: %.3f' % (X_train.iloc[:, i].name, selector.support_[i], selector.ranking_[i]))

In [None]:
X_selected_train = train.drop(columns = to_drop + ['was_mvp'])
y_selected_train = train['was_mvp']

In [None]:
X_selected_test = test.drop(columns = to_drop + ['was_mvp'])
y_selected_test = test['was_mvp']

In [None]:
feat_select_models = score_models(X_selected_train, y_selected_train, X_selected_test, y_selected_test)

In [None]:
metrics = pd.DataFrame.from_dict(feat_select_models, 'index')[['train_f1_score', 'train_accuracy_score', 'test_f1_score', 'test_accuracy_score', 'tp', 'fp', 'fn', 'tn']]

In [None]:
metrics.sort_values(by='tp', ascending=False)

In [None]:
metrics = pd.DataFrame.from_dict(feat_select_models, 'index')[['train_f1_score', 'train_accuracy_score', 'test_f1_score', 'test_accuracy_score', 'tp', 'fp', 'fn', 'tn']]

In [None]:
metrics.sort_values(by='tp', ascending=False)