In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import joblib

#models
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier as ABC

#feature selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

#scalers
from sklearn.preprocessing import StandardScaler

#score function
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
stats_df = pd.read_csv('../data/NBA_Stats_MVP.csv')
stats_df

Unnamed: 0,Year,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,...,TRB,AST,STL,BLK,TOV,PF,PTS,is_allstar,Name,was_mvp
0,1974.0,C,27.0,HOU,79.0,,2459.0,15.9,0.516,,...,923.0,166.0,80.0,104.0,,227.0,865.0,0,Zaid Abdul-Aziz,0
1,1974.0,C,26.0,MIL,81.0,,3548.0,24.4,0.564,,...,1178.0,386.0,112.0,283.0,,238.0,2191.0,1,Kareem Abdul-Jabbar,1
2,1974.0,SF,26.0,DET,74.0,,2298.0,10.9,0.457,,...,448.0,141.0,110.0,12.0,,242.0,759.0,0,Don Adams,0
3,1974.0,PG,27.0,CHI,55.0,,618.0,10.0,0.447,,...,69.0,56.0,36.0,1.0,,63.0,182.0,0,Rick Adelman,0
4,1974.0,PG,26.0,MIL,72.0,,2388.0,18.8,0.536,,...,291.0,374.0,137.0,22.0,,215.0,1268.0,0,Lucius Allen,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20792,2017.0,PF,24.0,CHO,62.0,58.0,1725.0,16.7,0.604,0.002,...,405.0,99.0,62.0,58.0,65.0,189.0,639.0,0,Cody Zeller,0
20793,2017.0,C,27.0,BOS,51.0,5.0,525.0,13.0,0.508,0.006,...,124.0,42.0,7.0,21.0,20.0,61.0,178.0,0,Tyler Zeller,0
20794,2017.0,C,20.0,ORL,19.0,0.0,108.0,7.3,0.346,0.000,...,35.0,4.0,2.0,5.0,3.0,17.0,23.0,0,Stephen Zimmerman,0
20795,2017.0,SF,22.0,CHI,44.0,18.0,843.0,6.9,0.503,0.448,...,125.0,36.0,15.0,16.0,40.0,78.0,240.0,0,Paul Zipser,0


In [3]:
stats_df.fillna(0, inplace = True)

In [4]:
def per_game(stats_df, cols):
    """
    returns a new data frame version of stats_df, where a per game average is added for each column name 
    contained within cols
    
    :param stats_df: A dataframe containing basketball stats.
    :param cols: A list of column names where per game averages are desired.
    :return: A new dataframe containing per game averages for stats contained within cols. 
    :rtype: Dataframe
    """
    new_df = stats_df.copy()
    for col in cols:
        new_df[col + 'PG'] = stats_df[col]/stats_df['G']
    return new_df

In [5]:
per_game_cols = ['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FGA', 'FG', 'FT', 'FTA', 'MP']
stats_df = per_game(stats_df, per_game_cols)

In [6]:
stats_df 

Unnamed: 0,Year,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,...,TRBPG,STLPG,BLKPG,TOVPG,PFPG,FGAPG,FGPG,FTPG,FTAPG,MPPG
0,1974.0,C,27.0,HOU,79.0,0.0,2459.0,15.9,0.516,0.000,...,11.683544,1.012658,1.316456,0.000000,2.873418,9.265823,4.253165,2.443038,3.037975,31.126582
1,1974.0,C,26.0,MIL,81.0,0.0,3548.0,24.4,0.564,0.000,...,14.543210,1.382716,3.493827,0.000000,2.938272,21.716049,11.703704,3.641975,5.185185,43.802469
2,1974.0,SF,26.0,DET,74.0,0.0,2298.0,10.9,0.457,0.000,...,6.054054,1.486486,0.162162,0.000000,3.270270,10.027027,4.094595,2.067568,2.716216,31.054054
3,1974.0,PG,27.0,CHI,55.0,0.0,618.0,10.0,0.447,0.000,...,1.254545,0.654545,0.018182,0.000000,1.145455,3.090909,1.163636,0.981818,1.381818,11.236364
4,1974.0,PG,26.0,MIL,72.0,0.0,2388.0,18.8,0.536,0.000,...,4.041667,1.902778,0.305556,0.000000,2.986111,14.750000,7.305556,3.000000,3.805556,33.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20792,2017.0,PF,24.0,CHO,62.0,58.0,1725.0,16.7,0.604,0.002,...,6.532258,1.000000,0.935484,1.048387,3.048387,7.145161,4.080645,2.145161,3.161290,27.822581
20793,2017.0,C,27.0,BOS,51.0,5.0,525.0,13.0,0.508,0.006,...,2.431373,0.137255,0.411765,0.392157,1.196078,3.098039,1.529412,0.431373,0.764706,10.294118
20794,2017.0,C,20.0,ORL,19.0,0.0,108.0,7.3,0.346,0.000,...,1.842105,0.105263,0.263158,0.157895,0.894737,1.631579,0.526316,0.157895,0.263158,5.684211
20795,2017.0,SF,22.0,CHI,44.0,18.0,843.0,6.9,0.503,0.448,...,2.840909,0.340909,0.363636,0.909091,1.772727,5.022727,2.000000,0.704545,0.909091,19.159091


# Modeling

In [7]:
stats_df = stats_df.drop(columns = ['Pos', 'Name', 'Tm'])
#stats_df = stats_df.drop(columns = ['Pos', 'Name', 'Tm', 'Year'])
X = stats_df.drop(columns = ['was_mvp'])
y = stats_df['was_mvp']

In [8]:
stats_df

Unnamed: 0,Year,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,...,TRBPG,STLPG,BLKPG,TOVPG,PFPG,FGAPG,FGPG,FTPG,FTAPG,MPPG
0,1974.0,27.0,79.0,0.0,2459.0,15.9,0.516,0.000,0.328,11.4,...,11.683544,1.012658,1.316456,0.000000,2.873418,9.265823,4.253165,2.443038,3.037975,31.126582
1,1974.0,26.0,81.0,0.0,3548.0,24.4,0.564,0.000,0.239,8.8,...,14.543210,1.382716,3.493827,0.000000,2.938272,21.716049,11.703704,3.641975,5.185185,43.802469
2,1974.0,26.0,74.0,0.0,2298.0,10.9,0.457,0.000,0.271,6.0,...,6.054054,1.486486,0.162162,0.000000,3.270270,10.027027,4.094595,2.067568,2.716216,31.054054
3,1974.0,27.0,55.0,0.0,618.0,10.0,0.447,0.000,0.447,2.6,...,1.254545,0.654545,0.018182,0.000000,1.145455,3.090909,1.163636,0.981818,1.381818,11.236364
4,1974.0,26.0,72.0,0.0,2388.0,18.8,0.536,0.000,0.258,4.1,...,4.041667,1.902778,0.305556,0.000000,2.986111,14.750000,7.305556,3.000000,3.805556,33.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20792,2017.0,24.0,62.0,58.0,1725.0,16.7,0.604,0.002,0.442,8.6,...,6.532258,1.000000,0.935484,1.048387,3.048387,7.145161,4.080645,2.145161,3.161290,27.822581
20793,2017.0,27.0,51.0,5.0,525.0,13.0,0.508,0.006,0.247,9.2,...,2.431373,0.137255,0.411765,0.392157,1.196078,3.098039,1.529412,0.431373,0.764706,10.294118
20794,2017.0,20.0,19.0,0.0,108.0,7.3,0.346,0.000,0.161,10.8,...,1.842105,0.105263,0.263158,0.157895,0.894737,1.631579,0.526316,0.157895,0.263158,5.684211
20795,2017.0,22.0,44.0,18.0,843.0,6.9,0.503,0.448,0.181,1.9,...,2.840909,0.340909,0.363636,0.909091,1.772727,5.022727,2.000000,0.704545,0.909091,19.159091


In [9]:
stats_df['Year'].unique()
train_years = np.random.choice(a=stats_df['Year'].unique(), size=round(len(stats_df['Year'].unique()) * .7))
train_years

array([1982., 1987., 2015., 1996., 2007., 2012., 2014., 2014., 2003.,
       1977., 1978., 2000., 1983., 2013., 2013., 1987., 2012., 2013.,
       2016., 2015., 1983., 1982., 1990., 2013., 2007., 1984., 2002.,
       1978., 1978., 1977., 1980.])

In [10]:
train = stats_df[stats_df['Year'].isin(train_years)]
test = stats_df[~stats_df['Year'].isin(train_years)]

In [11]:
train = train.drop(columns = ['Year'])
test = test.drop(columns = ['Year'])

In [12]:
X_train = train.drop(columns = ['was_mvp'])
y_train = train['was_mvp']

In [13]:
X_test = test.drop(columns = ['was_mvp'])
y_test = test['was_mvp']

In [14]:
def score_models(X_train, y_train, X_test, y_test):
    models = {'RandomForestClassifier':{'model':RFC()}, 
        'KNeighborsClassifier':{'model':KNeighborsClassifier()},
         'GradientBoostingClassifier': {'model':GBC()},
         'logistic': {'model':LogisticRegression(max_iter=10000)},
         'adaboost': {'model':ABC()},
         'ScaledRandomForestClassifier':{'model':make_pipeline(StandardScaler(), RFC())}, 
        'ScaledKNeighborsClassifier':{'model':make_pipeline(StandardScaler(), KNeighborsClassifier())},
         'ScaledGradientBoostingClassifier': {'model':make_pipeline(StandardScaler(), GBC())},
         'Scaledlogistic': {'model':make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))},
         'Scaledadaboost': {'model':make_pipeline(StandardScaler(), ABC())}}
    for model_name in models:
        print(model_name)
        model = models[model_name]['model']
        fitted_model = model.fit(X_train, y_train)
        models[model_name]['train_f1_score'] = f1_score(y_train, fitted_model.predict(X_train), average='macro')
        models[model_name]['train_accuracy_score'] = accuracy_score(y_train, fitted_model.predict(X_train))
        models[model_name]['test_f1_score'] = f1_score(y_test, fitted_model.predict(X_test), average='macro')
        models[model_name]['test_accuracy_score'] = accuracy_score(y_test, fitted_model.predict(X_test))
        models[model_name]['fitted_model'] = fitted_model
        models[model_name]['y_train_pred'] = fitted_model.predict(X_train)
        models[model_name]['y_test_pred'] = fitted_model.predict(X_test) 
        tn, fp, fn, tp = confusion_matrix(y_test, models[model_name]['y_test_pred']).ravel()
        models[model_name]['tp'] = tp
        models[model_name]['fp'] = fp
        models[model_name]['fn'] = fn
        models[model_name]['tn'] = tn
        print(confusion_matrix(y_test, models[model_name]['y_test_pred']))
    return models


In [15]:
models = score_models(X_train, y_train, X_test, y_test)

RandomForestClassifier
[[12231     4]
 [   19     7]]
KNeighborsClassifier
[[12234     1]
 [   24     2]]
GradientBoostingClassifier
[[12219    16]
 [   18     8]]
logistic
[[12228     7]
 [   16    10]]
adaboost
[[12226     9]
 [   14    12]]
ScaledRandomForestClassifier
[[12233     2]
 [   19     7]]
ScaledKNeighborsClassifier
[[12235     0]
 [   26     0]]
ScaledGradientBoostingClassifier
[[12224    11]
 [   20     6]]
Scaledlogistic
[[12230     5]
 [   16    10]]
Scaledadaboost
[[12226     9]
 [   14    12]]


In [16]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
#model = models['ScaledGradientBoostingClassifier']['fitted_model']
selector = RFECV(DecisionTreeClassifier(), step=1, cv=5)
selector = selector.fit(X_train, y_train)
selector.support_
selector.ranking_

array([61, 60, 59, 58, 57,  5, 20, 21, 19, 16, 15, 14, 13, 23, 25, 26, 28,
       27,  1,  4, 31, 34, 36,  6, 40, 42, 49, 44, 46, 50, 52, 54, 56, 55,
       53, 48, 47, 45, 43, 39, 18, 17, 12, 11, 10, 22,  9,  3,  8, 24,  7,
        2, 29, 30, 32, 33, 35, 37, 38, 41, 51])

In [17]:
to_drop = []
for i in range(X_train.shape[1]):
    if selector.support_[i] == False:
        to_drop.append(X_train.iloc[:, i].name)
        print('Column: %s, Selected %s, Rank: %.3f' % (X_train.iloc[:, i].name, selector.support_[i], selector.ranking_[i]))

Column: Age, Selected False, Rank: 61.000
Column: G, Selected False, Rank: 60.000
Column: GS, Selected False, Rank: 59.000
Column: MP, Selected False, Rank: 58.000
Column: PER, Selected False, Rank: 57.000
Column: TS%, Selected False, Rank: 5.000
Column: 3PAr, Selected False, Rank: 20.000
Column: FTr, Selected False, Rank: 21.000
Column: ORB%, Selected False, Rank: 19.000
Column: DRB%, Selected False, Rank: 16.000
Column: TRB%, Selected False, Rank: 15.000
Column: AST%, Selected False, Rank: 14.000
Column: STL%, Selected False, Rank: 13.000
Column: BLK%, Selected False, Rank: 23.000
Column: TOV%, Selected False, Rank: 25.000
Column: USG%, Selected False, Rank: 26.000
Column: OWS, Selected False, Rank: 28.000
Column: DWS, Selected False, Rank: 27.000
Column: WS/48, Selected False, Rank: 4.000
Column: OBPM, Selected False, Rank: 31.000
Column: DBPM, Selected False, Rank: 34.000
Column: BPM, Selected False, Rank: 36.000
Column: VORP, Selected False, Rank: 6.000
Column: FG, Selected False,

In [18]:
for i in range(X_train.shape[1]):
    if selector.support_[i] == True:
        print('Column: %s, Selected %s, Rank: %.3f' % (X_train.iloc[:, i].name, selector.support_[i], selector.ranking_[i]))

Column: WS, Selected True, Rank: 1.000


In [19]:
X_selected_train = train.drop(columns = to_drop + ['was_mvp'])
y_selected_train = train['was_mvp']

In [20]:
X_selected_test = test.drop(columns = to_drop + ['was_mvp'])
y_selected_test = test['was_mvp']

In [21]:
feat_select_models = score_models(X_selected_train, y_selected_train, X_selected_test, y_selected_test)

RandomForestClassifier
[[12226     9]
 [   19     7]]
KNeighborsClassifier
[[12222    13]
 [   17     9]]
GradientBoostingClassifier
[[12199    36]
 [   16    10]]
logistic
[[12228     7]
 [   17     9]]
adaboost
[[12230     5]
 [   20     6]]
ScaledRandomForestClassifier
[[12225    10]
 [   18     8]]
ScaledKNeighborsClassifier
[[12222    13]
 [   17     9]]
ScaledGradientBoostingClassifier
[[12199    36]
 [   16    10]]
Scaledlogistic
[[12229     6]
 [   17     9]]
Scaledadaboost
[[12229     6]
 [   19     7]]


In [22]:
metrics = pd.DataFrame.from_dict(feat_select_models, 'index')[['train_f1_score', 'train_accuracy_score', 'test_f1_score', 'test_accuracy_score', 'tp', 'fp', 'fn', 'tn']]

In [23]:
metrics.sort_values(by='tp', ascending=False)

Unnamed: 0,train_f1_score,train_accuracy_score,test_f1_score,test_accuracy_score,tp,fp,fn,tn
GradientBoostingClassifier,0.754644,0.997306,0.637825,0.995759,10,36,16,12199
ScaledGradientBoostingClassifier,0.754644,0.997306,0.637825,0.995759,10,36,16,12199
KNeighborsClassifier,0.772287,0.998243,0.686887,0.997553,9,13,17,12222
logistic,0.740939,0.998243,0.713796,0.998043,9,7,17,12228
ScaledKNeighborsClassifier,0.772287,0.998243,0.686887,0.997553,9,13,17,12222
Scaledlogistic,0.749589,0.99836,0.719042,0.998124,9,6,17,12229
ScaledRandomForestClassifier,0.899824,0.999297,0.681246,0.997716,8,10,18,12225
RandomForestClassifier,0.899824,0.999297,0.666095,0.997716,7,9,19,12226
Scaledadaboost,0.856908,0.999063,0.678977,0.997961,7,6,19,12229
adaboost,0.856908,0.999063,0.661652,0.997961,6,5,20,12230


In [24]:
metrics = pd.DataFrame.from_dict(feat_select_models, 'index')[['train_f1_score', 'train_accuracy_score', 'test_f1_score', 'test_accuracy_score', 'tp', 'fp', 'fn', 'tn']]

In [25]:
metrics.sort_values(by='tp', ascending=False)

Unnamed: 0,train_f1_score,train_accuracy_score,test_f1_score,test_accuracy_score,tp,fp,fn,tn
GradientBoostingClassifier,0.754644,0.997306,0.637825,0.995759,10,36,16,12199
ScaledGradientBoostingClassifier,0.754644,0.997306,0.637825,0.995759,10,36,16,12199
KNeighborsClassifier,0.772287,0.998243,0.686887,0.997553,9,13,17,12222
logistic,0.740939,0.998243,0.713796,0.998043,9,7,17,12228
ScaledKNeighborsClassifier,0.772287,0.998243,0.686887,0.997553,9,13,17,12222
Scaledlogistic,0.749589,0.99836,0.719042,0.998124,9,6,17,12229
ScaledRandomForestClassifier,0.899824,0.999297,0.681246,0.997716,8,10,18,12225
RandomForestClassifier,0.899824,0.999297,0.666095,0.997716,7,9,19,12226
Scaledadaboost,0.856908,0.999063,0.678977,0.997961,7,6,19,12229
adaboost,0.856908,0.999063,0.661652,0.997961,6,5,20,12230


In [26]:
X_selected_train = train[['PTSPG', 'ASTPG', 'WS', 'BLKPG', 'DRBPG', 'VORP', 'BPM', 'USG%', 'FGPG']]
y_selected_train = train['was_mvp']

In [27]:
X_selected_test = test[['PTSPG', 'ASTPG', 'WS', 'BLKPG', 'DRBPG', 'VORP', 'BPM', 'USG%', 'FGPG']]
y_selected_test = test['was_mvp']

In [28]:
hand_select_models = score_models(X_selected_train, y_selected_train, X_selected_test, y_selected_test)

RandomForestClassifier
[[12229     6]
 [   17     9]]
KNeighborsClassifier
[[12228     7]
 [   21     5]]
GradientBoostingClassifier
[[12224    11]
 [   17     9]]
logistic
[[12225    10]
 [   15    11]]
adaboost
[[12225    10]
 [   18     8]]
ScaledRandomForestClassifier
[[12229     6]
 [   17     9]]
ScaledKNeighborsClassifier
[[12229     6]
 [   19     7]]
ScaledGradientBoostingClassifier
[[12225    10]
 [   19     7]]
Scaledlogistic
[[12225    10]
 [   14    12]]
Scaledadaboost
[[12225    10]
 [   19     7]]


In [29]:
metrics = pd.DataFrame.from_dict(hand_select_models, 'index')[['train_f1_score', 'train_accuracy_score', 'test_f1_score', 'test_accuracy_score', 'tp', 'fp', 'fn', 'tn']]

In [30]:
metrics.sort_values(by='tp', ascending=False)

Unnamed: 0,train_f1_score,train_accuracy_score,test_f1_score,test_accuracy_score,tp,fp,fn,tn
Scaledlogistic,0.766256,0.99836,0.74951,0.998043,12,10,14,12225
logistic,0.766256,0.99836,0.733532,0.997961,11,10,15,12225
RandomForestClassifier,1.0,1.0,0.719042,0.998124,9,6,17,12229
GradientBoostingClassifier,0.879105,0.99918,0.69508,0.997716,9,11,17,12224
ScaledRandomForestClassifier,1.0,1.0,0.719042,0.998124,9,6,17,12229
adaboost,1.0,1.0,0.681246,0.997716,8,10,18,12225
ScaledKNeighborsClassifier,0.691838,0.998126,0.678977,0.997961,7,6,19,12229
ScaledGradientBoostingClassifier,0.879105,0.99918,0.662198,0.997635,7,10,19,12225
Scaledadaboost,1.0,1.0,0.662198,0.997635,7,10,19,12225
KNeighborsClassifier,0.758878,0.998477,0.631007,0.997716,5,7,21,12228


In [104]:
df_2020 = pd.read_csv('../data/2020stats.csv')
df_2020_advanced = pd.read_csv('../data/2020advancedstats.csv')
l = [column for column in df_2020.columns if column in df_2020_advanced.columns]
l.remove('Player')
df_2020_advanced = df_2020_advanced.drop(columns = l)
df_2020['Player'] = df_2020['Player'].str.split('\\').str[0]
df_2020 = pd.merge(df_2020, df_2020_advanced, on='Player', how="inner")
df_2020.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'Unnamed: 19', 'OWS', 'DWS', 'WS', 'WS/48',
       'Unnamed: 24', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Player-additional'],
      dtype='object')

In [106]:
per_game_cols = ['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FGA', 'FG', 'FT', 'FTA', 'MP']
df_2020 = per_game(df_2020, per_game_cols)

In [107]:
X = df_2020[['PTSPG', 'ASTPG', 'WS', 'BLKPG', 'DRBPG', 'VORP', 'BPM', 'USG%', 'FGPG']]

In [108]:
results_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict(X)
results_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict(X)


In [110]:
df_2020['results_scaled_log'] = results_scaled_log
df_2020['results_RF'] = results_RF

In [113]:
df_2020[df_2020['results_RF'] == 1]

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,BLKPG,TOVPG,PFPG,FGAPG,FGPG,FTPG,FTAPG,MPPG,results_scaled_log,results_RF
559,260,Nikola Jokić,C,25,DEN,64,64,2243,654,1152,...,0.671875,3.125,2.625,18.0,10.21875,4.40625,5.125,35.046875,1,1


In [178]:
df_2020 = pd.read_csv('../data/2020stats.csv')
df_2020_advanced = pd.read_csv('../data/2020advancedstats.csv')
l = [column for column in df_2020.columns if column in df_2020_advanced.columns]
l.remove('Player')
df_2020_advanced = df_2020_advanced.drop(columns = l)
df_2020['Player'] = df_2020['Player'].str.split('\\').str[0]
df_2020 = pd.merge(df_2020, df_2021_advanced, on='Player', how="inner")
df_2020.columns
per_game_cols = ['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FGA', 'FG', 'FT', 'FTA', 'MP']
df_2020 = per_game(df_2020, per_game_cols)
X = df_2020[['PTSPG', 'ASTPG', 'WS', 'BLKPG', 'DRBPG', 'VORP', 'BPM', 'USG%', 'FGPG']]
results_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict(X)
results_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict(X)
proba_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist()
proba_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict_proba(X).tolist()
mvp_log_probs = []
mvp_rf_probs = []
for prob in proba_scaled_log:
    #print(prob[1])
    mvp_log_probs.append(prob[1])
for prob in proba_RF:
    mvp_rf_probs.append(prob[1])
# print(hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist())
# print(proba_RF)
df_2020['results_scaled_log'] = results_scaled_log
df_2020['results_RF'] = results_RF
df_2020['proba_scaled_log'] = mvp_log_probs
df_2020['proba_RF'] = mvp_rf_probs

In [179]:
df_2020.sort_values(by='proba_scaled_log', ascending=False)[['Player', 'proba_scaled_log', 'proba_RF']].head(10)

Unnamed: 0,Player,proba_scaled_log,proba_RF
391,Nikola Jokić,0.672389,0.61
14,Giannis Antetokounmpo,0.387908,0.33
211,Joel Embiid,0.053502,0.07
255,Rudy Gobert,0.025989,0.0
289,James Harden,0.024044,0.0
705,Karl-Anthony Towns,0.019022,0.01
283,James Harden,0.016661,0.0
187,Luka Dončić,0.015007,0.06
372,LeBron James,0.009092,0.04
632,Domantas Sabonis,0.005398,0.0


In [161]:
df_2021 = pd.read_csv('../data/2021stats.csv')
df_2021_advanced = pd.read_csv('../data/2021advancedstats.csv')
l = [column for column in df_2021.columns if column in df_2021_advanced.columns]
l.remove('Player')
df_2021_advanced = df_2021_advanced.drop(columns = l)
df_2021['Player'] = df_2021['Player'].str.split('\\').str[0]
df_2021 = pd.merge(df_2021, df_2021_advanced, on='Player', how="inner")
df_2021.columns
per_game_cols = ['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FGA', 'FG', 'FT', 'FTA', 'MP']
df_2021 = per_game(df_2021, per_game_cols)
X = df_2021[['PTSPG', 'ASTPG', 'WS', 'BLKPG', 'DRBPG', 'VORP', 'BPM', 'USG%', 'FGPG']]
results_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict(X)
results_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict(X)
proba_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist()
proba_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict_proba(X).tolist()
mvp_log_probs = []
mvp_rf_probs = []
for prob in proba_scaled_log:
    #print(prob[1])
    mvp_log_probs.append(prob[1])
for prob in proba_RF:
    mvp_rf_probs.append(prob[1])
# print(hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist())
# print(proba_RF)
df_2021['results_scaled_log'] = results_scaled_log
df_2021['results_RF'] = results_RF
df_2021['proba_scaled_log'] = mvp_log_probs
df_2021['proba_RF'] = mvp_rf_probs

In [162]:
df_2021[df_2021['results_RF'] == 1]
df_2021.sort_values(by='proba_RF', ascending=False)[['Player', 'proba_scaled_log', 'proba_RF']].head(10)

Unnamed: 0,Player,proba_scaled_log,proba_RF
715,Nikola Jokić,0.879997,0.69
27,Giannis Antetokounmpo,0.402524,0.33
292,Luka Dončić,0.022445,0.09
668,LeBron James,0.008517,0.09
344,Joel Embiid,0.148009,0.05
336,Kevin Durant,0.011616,0.04
261,Stephen Curry,0.001319,0.02
297,Sekou Doumbouya,5e-06,0.02
1350,Karl-Anthony Towns,0.007687,0.02
966,Ja Morant,0.001115,0.01


In [152]:
df_2022 = pd.read_csv('../data/2022stats.csv')
df_2022_advanced = pd.read_csv('../data/2022advancedstats.csv')
l = [column for column in df_2022.columns if column in df_2022_advanced.columns]
l.remove('Player')
df_2022_advanced = df_2022_advanced.drop(columns = l)
df_2022['Player'] = df_2022['Player'].str.split('\\').str[0]
df_2022 = pd.merge(df_2022, df_2022_advanced, on='Player', how="inner")
df_2022.columns
per_game_cols = ['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FGA', 'FG', 'FT', 'FTA', 'MP']
df_2022 = per_game(df_2022, per_game_cols)
X = df_2022[['PTSPG', 'ASTPG', 'WS', 'BLKPG', 'DRBPG', 'VORP', 'BPM', 'USG%', 'FGPG']]
results_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict(X)
results_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict(X)
proba_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist()
proba_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict_proba(X).tolist()
mvp_log_probs = []
mvp_rf_probs = []
for prob in proba_scaled_log:
    #print(prob[1])
    mvp_log_probs.append(prob[1])
for prob in proba_RF:
    mvp_rf_probs.append(prob[1])
# print(hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist())
# print(proba_RF)
df_2022['results_scaled_log'] = results_scaled_log
df_2022['results_RF'] = results_RF
df_2022['proba_scaled_log'] = mvp_log_probs
df_2022['proba_RF'] = mvp_rf_probs

In [160]:
df_2022.sort_values(by='proba_RF', ascending=False)[['Player', 'proba_scaled_log', 'proba_RF']].head(10)

Unnamed: 0,Player,proba_scaled_log,proba_RF
114,Luka Dončić,0.049949,0.13
227,Nikola Jokić,0.083889,0.09
401,Domantas Sabonis,0.004905,0.08
98,Stephen Curry,0.001574,0.07
130,Joel Embiid,0.004275,0.07
10,Giannis Antetokounmpo,0.004763,0.06
124,Kevin Durant,0.002561,0.05
432,Jayson Tatum,0.00114,0.04
308,Donovan Mitchell,0.000122,0.04
100,Anthony Davis,0.003674,0.04


In [170]:
df_2019 = pd.read_csv('../data/2019stats.csv')
df_2019_advanced = pd.read_csv('../data/2019advancedstats.csv')
l = [column for column in df_2019.columns if column in df_2019_advanced.columns]
l.remove('Player')
df_2019_advanced = df_2019_advanced.drop(columns = l)
df_2019['Player'] = df_2019['Player'].str.split('\\').str[0]
df_2019 = pd.merge(df_2019, df_2019_advanced, on='Player', how="inner")
df_2019.columns
per_game_cols = ['PTS', 'AST', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV', 'PF', 'FGA', 'FG', 'FT', 'FTA', 'MP']
df_2019 = per_game(df_2019, per_game_cols)
X = df_2019[['PTSPG', 'ASTPG', 'WS', 'BLKPG', 'DRBPG', 'VORP', 'BPM', 'USG%', 'FGPG']]
results_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict(X)
results_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict(X)
proba_scaled_log = hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist()
proba_RF = hand_select_models['RandomForestClassifier']['fitted_model'].predict_proba(X).tolist()
mvp_log_probs = []
mvp_rf_probs = []
for prob in proba_scaled_log:
    #print(prob[1])
    mvp_log_probs.append(prob[1])
for prob in proba_RF:
    mvp_rf_probs.append(prob[1])
# print(hand_select_models['Scaledlogistic']['fitted_model'].predict_proba(X).tolist())
# print(proba_RF)
df_2019['results_scaled_log'] = results_scaled_log
df_2019['results_RF'] = results_RF
df_2019['proba_scaled_log'] = mvp_log_probs
df_2019['proba_RF'] = mvp_rf_probs

In [171]:
df_2019.sort_values(by='proba_scaled_log', ascending=False)[['Player', 'proba_scaled_log', 'proba_RF']].head(10)

Unnamed: 0,Player,proba_scaled_log,proba_RF
12,Giannis Antetokounmpo,0.415438,0.36
438,James Harden,0.111726,0.07
530,LeBron James,0.063813,0.05
309,Luka Dončić,0.048897,0.09
282,Anthony Davis,0.020311,0.03
616,Damian Lillard,0.01773,0.02
557,Nikola Jokić,0.016673,0.0
392,Rudy Gobert,0.011102,0.0
613,Kawhi Leonard,0.004987,0.02
834,Domantas Sabonis,0.004146,0.0


In [180]:
joblib.dump(hand_select_models['RandomForestClassifier']['fitted_model'], "mvp_random_forest.joblib")

['mvp_random_forest.joblib']

In [181]:
joblib.dump(hand_select_models['Scaledlogistic']['fitted_model'], "mvp_scaled_log.joblib")

['mvp_scaled_log.joblib']