In [1]:
import pandas as pd

from DataCleaning import NBA_data
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [2]:
NBA_data.corr()["Share"]

Age        0.008057
G          0.088301
GS         0.165025
MP         0.157000
FG         0.266440
FGA        0.240522
FG%        0.058825
3P         0.114873
3PA        0.114308
3P%        0.034410
2P         0.262198
2PA        0.238222
2P%        0.056144
eFG%       0.054662
FT         0.314490
FTA        0.319017
FT%        0.041911
ORB        0.086719
DRB        0.208287
TRB        0.179629
AST        0.223678
STL        0.165989
BLK        0.126691
TOV        0.245557
PF         0.064903
PTS        0.280857
Year      -0.007246
Pts Won    0.995744
Pts Max    0.534968
Share      1.000000
W          0.117644
L         -0.117014
W/L%       0.119169
GB        -0.093470
PS/G       0.040734
PA/G      -0.032792
SRS        0.114493
Name: Share, dtype: float64

In [3]:
# numerical data that contributes to MVP voting data
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
              '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
              'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
              'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [4]:
# initialize training and testing data for machine learning predictions
train = NBA_data[NBA_data["Year"] < 2022]
test = NBA_data[NBA_data["Year"] == 2022]

In [5]:
# ridge regression model
regression = Ridge(alpha=.1)
regression.fit(train[predictors], train["Share"])

Ridge(alpha=0.1)

In [6]:
# mvp prediction
predictions = regression.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [7]:
# concatenate with player's data
mvp_pred = pd.concat([test[["Player", "Share"]], predictions], axis=1)
mvp_pred = mvp_pred.sort_values("Share", ascending=False)

In [8]:
# Add real MVP voting outcome & predicted MVP voting outcome
actual = mvp_pred.sort_values("Share", ascending=False)
predicted = mvp_pred.sort_values("predictions", ascending=False)
actual["Rk"] = list(range(1,actual.shape[0]+1))
predicted["Predicted_Rk"] = list(range(1,predicted.shape[0]+1))

actual.merge(predicted, on = "Player")

Unnamed: 0,Player,Share_x,predictions_x,Rk,Share_y,predictions_y,Predicted_Rk
0,Nikola Jokić,0.875,0.186786,1,0.875,0.186786,2
1,Joel Embiid,0.706,0.176424,2,0.706,0.176424,3
2,Giannis Antetokounmpo,0.595,0.210251,3,0.595,0.210251,1
3,Devin Booker,0.216,0.085588,4,0.216,0.085588,15
4,Luka Dončić,0.146,0.162405,5,0.146,0.162405,4
...,...,...,...,...,...,...,...
600,Marvin Bagley III,0.000,0.003000,601,0.000,0.003000,224
601,Micah Potter,0.000,-0.022344,602,0.000,-0.022344,538
602,Rodney McGruder,0.000,-0.011705,603,0.000,-0.011705,441
603,Saben Lee,0.000,0.002713,604,0.000,0.002713,235


In [9]:
# error metric to show accuracy of top 5 MVP vote getters 
# average precision: show how close predicted ranking is to actual ranking
def find_ap(mvp_pred):
    actual = mvp_pred.sort_values("Share", ascending=False).head(5)
    predicted = mvp_pred.sort_values("predictions", ascending=False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found +=1
            ps.append(found/seen)
        seen +=1
    return sum(ps) / len(ps)

In [10]:
ap = find_ap(mvp_pred)
ap

0.8666666666666666

In [11]:
years = list(range(2000,2023))

In [12]:
aps = []
all_predictions = []
for year in years[10:]:
    train = NBA_data[NBA_data["Year"] < year]
    test = NBA_data[NBA_data["Year"] == year]
    regression.fit(train[predictors], train["Share"])
    predictions = regression.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    mvp_pred = pd.concat([test[["Player", "Share"]], predictions], axis=1)
    all_predictions.append(mvp_pred)
    aps.append(find_ap(mvp_pred))

In [13]:
# mean of average precision
sum(aps) / len(aps)

0.7285632229863

In [14]:
# show difference in predicted vs actual MVP rankings
def add_ranks(predictions):
    predictions = predictions.sort_values("predictions", ascending = False)
    predictions["Predicted_Rk"] = list(range(1, predictions.shape[0]+1))
    predictions = predictions.sort_values("Share", ascending = False)
    predictions["Rk"] = list(range(1, predictions.shape[0]+1))
    predictions["Diff"] = (predictions["Rk"] - predictions["Predicted_Rk"])
    return predictions                            

In [15]:
add_ranks(all_predictions[1])

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
9566,Derrick Rose,0.977,0.090340,6,1,-5
9053,Dwight Howard,0.531,0.168686,1,2,1
10195,LeBron James,0.431,0.150792,2,3,1
5689,Kobe Bryant,0.354,0.083423,7,4,-3
9827,Kevin Durant,0.157,0.082588,8,5,-3
...,...,...,...,...,...,...
2370,Willie Warren,0.000,0.008235,155,448,293
5691,Luke Walton,0.000,0.008305,154,449,295
1236,Lou Amundson,0.000,0.008375,153,450,297
9830,Nazr Mohammed,0.000,0.008387,152,451,299


In [16]:
def backtest(NBA_data, model, years, predictors):
    aps = []
    all_predictions = []
    for year in years:
        train = NBA_data[NBA_data["Year"] < year] 
        test = NBA_data[NBA_data["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        mvp_pred = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        mvp_pred = add_ranks(mvp_pred)
        all_predictions.append(mvp_pred)
        aps.append(find_ap(mvp_pred))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [17]:
mean_ap, aps, all_predictions = backtest(NBA_data, regression, years[10:], predictors)

mean_ap

0.7285632229863

In [30]:
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(20)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
464,Nikola Jokić,0.961,0.275712,5,1,-4
486,Nikola Jokić,0.875,0.367632,5,1,-4
3079,LeBron James,0.746,0.259336,5,2,-3
3010,Stephen Curry,0.449,0.277384,4,3,-1
8570,Giannis Antetokounmpo,0.595,0.406792,4,3,-1
9685,Giannis Antetokounmpo,0.952,0.806851,1,1,0
7377,Luka Dončić,0.198,0.275413,4,4,0
6968,Joel Embiid,0.58,0.307704,2,2,0
660,Joel Embiid,0.706,0.415694,2,2,0
7698,James Harden,0.363,0.461627,2,3,1


In [19]:
# impact of each predictor on the regression model
pd.concat([pd.Series(regression.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.070053,eFG%
18,0.037101,DRB
29,0.027605,W/L%
17,0.026402,ORB
15,0.011755,FTA
4,0.009191,FG
25,0.008862,PTS
22,0.00836,BLK
21,0.008246,STL
20,0.007772,AST


In [20]:
# additional predictors to increase average precision of MVP predictions
# ratio of player stats to league average that year
stat_ratios = NBA_data[["PTS", "AST", "STL", "BLK", "eFG%", "Year"]].groupby("Year").apply(lambda x:x/x.mean())

NBA_data[["PTS_R", "AST_R", "STL_R", "BLK_R", "eFG%_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "eFG%"]]

predictors += ["PTS_R", "AST_R", "STL_R", "BLK_R", "eFG%_R"]

In [21]:
mean_ap, aps, all_predictions = backtest(NBA_data, regression, years[5:], predictors)

mean_ap

0.69009426361904

In [22]:
mean_ap, aps, all_predictions = backtest(NBA_data, regression, years[10:], predictors)

mean_ap

0.7286092949829673

In [23]:
mean_ap, aps, all_predictions = backtest(NBA_data, regression, years[15:], predictors)

mean_ap

0.7930789752362334

In [24]:
mean_ap, aps, all_predictions = backtest(NBA_data, regression, years[18:], predictors)

mean_ap

0.7713708048224176

In [32]:
mean_ap, aps, all_predictions = backtest(NBA_data, regression, years[20:], predictors)

mean_ap

0.8233452807646356

In [33]:
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(20)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
730,Devin Booker,0.216,0.091041,12,4,-8
464,Nikola Jokić,0.961,0.126432,5,1,-4
3079,LeBron James,0.746,0.14566,4,2,-2
660,Joel Embiid,0.706,0.160338,4,2,-2
6968,Joel Embiid,0.58,0.128922,3,2,-1
3010,Stephen Curry,0.449,0.128749,4,3,-1
486,Nikola Jokić,0.875,0.19306,2,1,-1
9685,Giannis Antetokounmpo,0.952,0.218117,1,1,0
7698,James Harden,0.363,0.17281,2,3,1
7377,Luka Dončić,0.198,0.162684,3,4,1


In [26]:
rf = RandomForestRegressor(n_estimators = 500, random_state = 1, min_samples_split = 5)
mean_ap, aps, all_predictions = backtest(NBA_data, rf, years[15:], predictors)

mean_ap

0.7699305025040319

In [27]:
rf = RandomForestRegressor(n_estimators = 500, random_state = 1, min_samples_split = 5)
mean_ap, aps, all_predictions = backtest(NBA_data, rf, years[18:], predictors)

mean_ap

0.7949812409812409

In [28]:
rf = RandomForestRegressor(n_estimators = 500, random_state = 1, min_samples_split = 5)
mean_ap, aps, all_predictions = backtest(NBA_data, rf, years[20:], predictors)

mean_ap

0.8723232323232323

In [31]:
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(20)

Unnamed: 0,Player,Share,predictions,Predicted_Rk,Rk,Diff
464,Nikola Jokić,0.961,0.275712,5,1,-4
486,Nikola Jokić,0.875,0.367632,5,1,-4
3079,LeBron James,0.746,0.259336,5,2,-3
3010,Stephen Curry,0.449,0.277384,4,3,-1
8570,Giannis Antetokounmpo,0.595,0.406792,4,3,-1
9685,Giannis Antetokounmpo,0.952,0.806851,1,1,0
7377,Luka Dončić,0.198,0.275413,4,4,0
6968,Joel Embiid,0.58,0.307704,2,2,0
660,Joel Embiid,0.706,0.415694,2,2,0
7698,James Harden,0.363,0.461627,2,3,1
