In [2]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, SCORERS
import matplotlib as plt
%matplotlib inline
from matplotlib import rcParams
import pandas as pd
import numpy as np
import pprint as pp
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

In [3]:
# Loading in the dataframe, then removing the useless column
stats = pd.read_csv('stats_1.6.csv')
stats.drop(labels='Unnamed: 0', axis=1, inplace=True)
stats.head()

Unnamed: 0,Player,MVP,Year,Age,TmWin,G,GS,MP,PER,TS%,...,ORB/G,DRB/G,TRB/G,AST/G,STL/G,BLK/G,TOV/G,PF/G,PPG,Impact
0,A.C. Green,0,1986,22.0,62.0,82.0,1.0,1542.0,11.8,0.564,...,1.95,2.7,4.65,0.66,0.6,0.6,1.21,2.79,6.35,292.78725
1,A.C. Green,0,1987,23.0,65.0,79.0,72.0,2240.0,15.7,0.599,...,2.66,5.13,7.78,1.06,0.89,1.01,1.29,2.16,10.78,429.586585
2,A.C. Green,0,1988,24.0,62.0,82.0,64.0,2636.0,14.5,0.581,...,2.99,5.67,8.66,1.13,1.06,0.55,1.46,2.49,11.43,500.5105
3,A.C. Green,0,1989,25.0,57.0,82.0,82.0,2510.0,17.8,0.594,...,3.15,5.87,9.01,1.26,1.15,0.67,1.45,2.1,13.27,506.70625
4,A.C. Green,0,1990,26.0,63.0,82.0,82.0,2709.0,14.7,0.548,...,3.2,5.49,8.68,1.1,0.8,0.61,1.41,2.52,12.94,608.001188


In [4]:
stats.shape

(17148, 51)

In [402]:
# Preparing the predictors for Logistic Regression
# Naming the predictors and target variables X1, y1
X1 = stats[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
y1 = stats['MVP']

In [403]:
# Scaling the predictors using StandardScaler()
ss = StandardScaler()
ss.fit_transform(X1)

# Creating a training and testing set
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2)

# Part 1. Random Forest with SMOTE

In [404]:
# Oversampling the data with SMOTE
sm_rfc = SMOTE(sampling_strategy=0.6, random_state=7, k_neighbors=9)
smote_rfc_X1, smote_rfc_y1 = sm_rfc.fit_sample(X1_train, y1_train)


rfc1 = RandomForestClassifier(n_estimators=17, criterion='entropy', max_features=7, n_jobs=-1, random_state=0, max_depth=13)

# Fitting the model
rfct1 = rfc1.fit(smote_rfc_X1, smote_rfc_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(rfc1, smote_rfc_X1, smote_rfc_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(rfc1, smote_rfc_X1, smote_rfc_y1, cv=5)))

# Constructing the confusion matrix
predictions_rfc1 = rfc1.predict(X1_test)
predictions_proba_rfc1 = rfc1.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_rfc1))
print(classification_report(y1_test, predictions_rfc1))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_rfc1[:,1]))

Cross-Validation Scores: [0.96756126 0.97036173 0.96824656 0.96357693 0.96661219]
Mean Cross-Validation Score: 0.9672717326185231
[[3233  122]
 [  15   60]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      3355
           1       0.33      0.80      0.47        75

   micro avg       0.96      0.96      0.96      3430
   macro avg       0.66      0.88      0.72      3430
weighted avg       0.98      0.96      0.97      3430

AUC Score 0.9636164927968207


# Part 1a. Testing for NBA's 2019 MVPs using Basketball-reference.com's list as a benchmark

In [405]:
testing_2019 = stats[stats['Year'] == 2019]
test_2019_X = testing_2019[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2019_y = testing_2019[['MVP']]

In [406]:
test_2019_X.head()

Unnamed: 0,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER
44,5.1,2.08,2.21,0.538,3.71,2.0,0.72,15.97,7.36,477.745006,15.1
55,0.9,0.8,1.42,0.518,1.74,-0.1,0.26,5.88,1.34,86.264634,11.9
89,0.9,0.43,1.11,0.522,0.33,-0.5,0.2,3.95,1.9,79.580606,8.8
194,7.5,1.5,1.85,0.605,4.16,3.4,1.26,13.6,6.74,315.673985,20.2
230,5.8,0.89,1.77,0.568,1.28,1.7,0.41,9.38,7.53,342.484546,13.2


In [407]:
predictions_rfc1_2019 = rfc1.predict(test_2019_X)
predictions_proba_rfc1_2019 = rfc1.predict_proba(test_2019_X)
print(confusion_matrix(test_2019_y, predictions_rfc1_2019))
print(classification_report(test_2019_y, predictions_rfc1_2019))
print('AUC Score', roc_auc_score(test_2019_y, predictions_proba_rfc1_2019[:,1]))

[[509  11]
 [  0  10]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       520
           1       0.48      1.00      0.65        10

   micro avg       0.98      0.98      0.98       530
   macro avg       0.74      0.99      0.82       530
weighted avg       0.99      0.98      0.98       530

AUC Score 0.9992307692307693


In [171]:
probability_rfc_2019 = pd.DataFrame(predictions_proba_rfc1_2019)
probability_rfc_2019 = probability_rfc_2019.loc[:,1:]
probability_rfc_2019.columns = ['Predicted MVP Probability']

predictions_rfc_2019 = pd.DataFrame(predictions_rfc1_2019)
predictions_rfc_2019.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2019_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_rfc_2019 = test_2019_X.copy(deep=True)
predictors_rfc_2019.reset_index(inplace=True)
predictors_rfc_2019.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [408]:
MVP_2019 = pd.concat([predictors_rfc_2019, real_mvp, predictions_rfc_2019, probability_rfc_2019], axis=1)
MVP_2019.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,44.0,5.1,2.08,2.21,0.538,3.71,2.0,0.72,15.97,7.36,477.745006,15.1,0,0.0,0.096648
1,55.0,0.9,0.8,1.42,0.518,1.74,-0.1,0.26,5.88,1.34,86.264634,11.9,0,0.0,0.0
2,89.0,0.9,0.43,1.11,0.522,0.33,-0.5,0.2,3.95,1.9,79.580606,8.8,0,0.0,0.0
3,194.0,7.5,1.5,1.85,0.605,4.16,3.4,1.26,13.6,6.74,315.673985,20.2,0,0.0,0.0
4,230.0,5.8,0.89,1.77,0.568,1.28,1.7,0.41,9.38,7.53,342.484546,13.2,0,0.0,0.0


In [409]:
# Checking for who was predicted as MVP by the model
MVP_2019[MVP_2019['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False)

# Players: 1249, 1789, 2885, 3350, 3452, 4138, 6065, 7150, 8247, 9039, 9160, 9402, 9856, 9877, 12329, 12348, 12597
# 12714, 14232, 15101

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
304,9402.0,11.5,2.88,1.99,0.631,5.86,4.3,1.08,25.99,6.37,885.110945,24.2,0,1.0,0.994624
454,15101.0,9.7,2.78,2.41,0.641,5.23,4.9,0.36,27.26,5.35,708.08389,24.4,0,1.0,0.994271
87,3452.0,12.1,2.65,1.85,0.588,6.89,5.4,0.43,25.84,4.64,895.758171,23.7,0,1.0,0.993744
407,12714.0,11.9,2.66,2.78,0.583,4.13,5.3,0.44,28.04,8.16,803.387306,23.3,0,1.0,0.985174
295,9039.0,9.5,2.02,1.45,0.606,3.32,3.6,0.4,26.6,7.32,546.508537,25.8,0,1.0,0.983634
323,9877.0,9.3,1.78,2.21,0.576,2.4,2.5,1.32,21.32,9.19,713.988329,22.9,0,1.0,0.98264
225,7150.0,15.2,4.96,3.13,0.616,7.51,9.9,0.74,36.13,6.64,1219.545755,30.6,0,1.0,0.981747
179,6065.0,14.4,3.72,3.22,0.644,5.89,7.6,1.53,27.69,12.47,835.939756,30.9,0,1.0,0.964024
388,12329.0,11.8,3.1,2.85,0.589,7.25,7.3,0.69,20.05,10.81,753.032195,26.3,0,1.0,0.95577
322,9856.0,9.1,2.57,2.49,0.592,6.93,4.7,0.51,23.82,5.0,546.6195,24.3,0,1.0,0.946983


# 2019 MVPs as predicted by Random Forest with SMOTE


# Elite Candidates where P => 0.9 
1. Kevin Durant (0.995)
2. Stephen Curry (0.994)
3. Damian Lillard (0.993)
4. Paul George (0.986)
5. Kawhi Leonard (0.984)
6. LaMarcus Aldridge (0.983)
7. James Harden (0.982)
8. Giannis Antetokounmpo (0.964)
9. Nikola Vucevic (0.956)
10. Kyrie Irving (0.947)
11. Joel Embid (0.905)


# Mid-tier Candidates  (0.6 <= P <= 0.89)
12. Kemba Walker (0.812)
13. DeMar Derozan (0.734)
14. Nikola Vucevic (0.703)
15. Chris Paul (0.692)
16. Ben Simmons (0.680)

# Pleb-tier Candidates (0.6 < P)
17. Bradley Beal
18. Pascal Siakam
19. D'Angelo Russell
20. Russell Westbrook

# Part 1b. Testing for 2018 MVPs using actual winners as a benchmark

In [204]:
testing_2018 = stats[stats['Year'] == 2018]
test_2018_X = testing_2018[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2018_y = testing_2018[['MVP']]

In [205]:
predictions_rfc1_2018 = rfc1.predict(test_2018_X)
predictions_proba_rfc1_2018 = rfc1.predict_proba(test_2018_X)
print(confusion_matrix(test_2018_y, predictions_rfc1_2018))
print(classification_report(test_2018_y, predictions_rfc1_2018))
print('AUC Score', roc_auc_score(test_2018_y, predictions_proba_rfc1_2018[:,1]))

[[514  10]
 [  0  13]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       524
           1       0.57      1.00      0.72        13

   micro avg       0.98      0.98      0.98       537
   macro avg       0.78      0.99      0.86       537
weighted avg       0.99      0.98      0.98       537

AUC Score 0.9995596007046389


In [206]:
probability_rfc_2018 = pd.DataFrame(predictions_proba_rfc1_2018)
probability_rfc_2018 = probability_rfc_2018.loc[:,1:]
probability_rfc_2018.columns = ['Predicted MVP Probability']

predictions_rfc_2018 = pd.DataFrame(predictions_rfc1_2018)
predictions_rfc_2018.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2018_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_rfc_2018 = test_2018_X.copy(deep=True)
predictors_rfc_2018.reset_index(inplace=True)
predictors_rfc_2018.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [207]:
MVP_2018 = pd.concat([predictors_rfc_2018, real_mvp, predictions_rfc_2018, probability_rfc_2018], axis=1)
MVP_2018.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,39,0.1,0.34,0.88,0.508,0.63,-0.1,0.0,2.34,0.53,14.371683,9.8,0,0,0.0
1,43,2.9,1.84,1.91,0.53,2.34,1.0,0.78,17.62,7.88,173.70639,16.5,0,0,0.0
2,54,-0.1,0.33,3.0,0.392,1.22,-0.1,0.22,6.67,2.67,1.981921,5.1,0,0,0.0
3,56,0.0,1.0,4.0,0.405,1.0,-0.1,0.0,8.0,3.0,0.079186,2.4,0,0,0.0
4,88,-0.1,0.71,0.94,0.439,0.54,-0.5,0.21,3.04,1.48,59.870854,5.1,0,0,0.0


In [212]:
# Checking for who was predicted as MVP by the model
MVP_2018[MVP_2018['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False)

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
324,10170,14.0,4.23,1.66,0.621,9.11,8.9,0.87,27.45,8.65,996.058333,28.6,1,1,0.994551
86,3451,12.6,2.82,1.6,0.594,6.59,5.9,0.37,26.88,4.45,742.500137,25.2,1,1,0.994059
513,16542,8.2,2.92,2.33,0.577,4.31,4.5,0.76,23.13,5.2,702.578049,23.1,1,1,0.992636
299,9401,10.4,3.04,1.96,0.64,5.38,4.5,1.75,26.35,6.82,708.236585,26.0,1,1,0.992606
217,7149,15.4,4.38,2.35,0.619,8.75,8.3,0.69,30.43,5.4,1094.98564,29.8,1,1,0.989958
29,821,13.7,2.16,2.12,0.612,2.32,4.9,2.57,28.13,11.09,748.262195,28.9,1,1,0.988426
111,4137,9.6,2.19,1.89,0.555,5.21,2.6,0.28,23.0,3.94,962.294797,21.0,1,1,0.981508
173,6064,11.9,2.97,3.08,0.598,4.81,5.4,1.41,26.85,10.04,720.929268,27.3,1,1,0.980331
317,9876,10.9,1.48,2.15,0.57,2.03,3.3,1.2,23.13,8.47,653.87944,25.0,1,1,0.968208
444,14231,10.1,4.76,2.5,0.524,10.25,7.5,0.25,25.35,10.05,969.438049,24.7,1,1,0.965856


# 2018 MVP List
Actual Rankings (In order):
1. James Harden
2. LeBron James
3. Anthony Davis
4. Damian Lillard
5. Russell Westbrook
6. Giannis Antetokounmpo
7. Kevin Durant
8. DeMar DeRozan
9. LaMarcus Aldridge
10. Jimmy Butler
11. Stephen Curry
12. Joel Embiid
13. Victor Oladipo

# 2018 MVPs as predicted by Random Forest with SMOTE

# Elite candidates (P >= 0.9)
1. LeBron James (0.995)
2. Damian Lillard (0.994)
3. Victor Oladipo (0.99263)
4. Kevin Durant (0.99260)
5. James Harden (0.990)
6. Anthony Davis (0.988)
7. DeMar Derozan (0.982)
8. Giannis Antetokounmpo (0.980)
9. LaMarcus Aldridge (0.968)
10. Rusell Westbrook (0.966)
11. Stephen Curry (0.965)
12. Jimmy Butler (0.923)

# Mid-tier Candidates  (0.6 <= P <= 0.89)
13. Chris Paul (0.874)
14. Ben Simmons (0.850)
15. Kyle Lowry (0.620)
16. Joel Embiid (0.618)
17. Paul George (0.609)
18. Nikola Jokic (0.602)

# Pleb-tier Candidates (0.6 < P)
19. Kemba Walker (0.598)
20. Kyrie Irving (0.577) [Surprising given that he sat out almost the whole season]
21. Draymond Green (0.537)
22. Bradley Beal (0.504)
23. Karl-Anthony Towns (0.502)




# Part 2. Logistic Regression with SMOTE

In [252]:
# Oversampling the data with SMOTE
sm_log = SMOTE(sampling_strategy=0.6, random_state=7, k_neighbors=9)
smote_log_X1, smote_log_y1 = sm_log.fit_sample(X1_train, y1_train)

In [253]:
# Specifying the optimal hyperparameters
log1 = LogisticRegression(penalty='l2', C=0.2)

# Fitting the model
logt1 = log1.fit(smote_log_X1, smote_log_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(logt1, smote_log_X1, smote_log_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(logt1, smote_log_X1, smote_log_y1, cv=5)))

# Constructing the confusion matrix
predictions_logt1 = logt1.predict(X1_test)
predictions_proba_logt1 = logt1.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_logt1))
print(classification_report(y1_test, predictions_logt1))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_logt1[:,1]))



Cross-Validation Scores: [0.92783746 0.92875496 0.93317757 0.92663551 0.9228972 ]




Mean Cross-Validation Score: 0.9278605406597171
[[3119  246]
 [   8   57]]
              precision    recall  f1-score   support

           0       1.00      0.93      0.96      3365
           1       0.19      0.88      0.31        65

   micro avg       0.93      0.93      0.93      3430
   macro avg       0.59      0.90      0.64      3430
weighted avg       0.98      0.93      0.95      3430

AUC Score 0.9629214767401989


# Part 2a. Testing for NBA's 2019 MVPs using Basketball-reference.com's list as a benchmark

In [254]:
testing_2019 = stats[stats['Year'] == 2019]
test_2019_X = testing_2019[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2019_y = testing_2019[['MVP']]

In [255]:
test_2019_X.head()

Unnamed: 0,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER
44,5.1,2.08,2.21,0.538,3.71,2.0,0.72,15.97,7.36,477.745006,15.1
55,0.9,0.8,1.42,0.518,1.74,-0.1,0.26,5.88,1.34,86.264634,11.9
89,0.9,0.43,1.11,0.522,0.33,-0.5,0.2,3.95,1.9,79.580606,8.8
194,7.5,1.5,1.85,0.605,4.16,3.4,1.26,13.6,6.74,315.673985,20.2
230,5.8,0.89,1.77,0.568,1.28,1.7,0.41,9.38,7.53,342.484546,13.2


In [256]:
predictions_log1_2019 = logt1.predict(test_2019_X)
predictions_proba_log1_2019 = logt1.predict_proba(test_2019_X)
print(confusion_matrix(test_2019_y, predictions_log1_2019))
print(classification_report(test_2019_y, predictions_log1_2019))
print('AUC Score', roc_auc_score(test_2019_y, predictions_proba_log1_2019[:,1]))

[[492  28]
 [  0  10]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       520
           1       0.26      1.00      0.42        10

   micro avg       0.95      0.95      0.95       530
   macro avg       0.63      0.97      0.69       530
weighted avg       0.99      0.95      0.96       530

AUC Score 0.9973076923076922


In [257]:
probability_log_2019 = pd.DataFrame(predictions_proba_log1_2019)
probability_log_2019 = probability_log_2019.loc[:,1:]
probability_log_2019.columns = ['Predicted MVP Probability']

predictions_log_2019 = pd.DataFrame(predictions_log1_2019)
predictions_log_2019.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2019_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_log_2019 = test_2019_X.copy(deep=True)
predictors_log_2019.reset_index(inplace=True)
predictors_log_2019.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [260]:
MVP_2019_log = pd.concat([predictors_log_2019, real_mvp, predictions_log_2019, probability_log_2019], axis=1)
MVP_2019_log.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,44,5.1,2.08,2.21,0.538,3.71,2.0,0.72,15.97,7.36,477.745006,15.1,0,0,0.334005
1,55,0.9,0.8,1.42,0.518,1.74,-0.1,0.26,5.88,1.34,86.264634,11.9,0,0,0.003992
2,89,0.9,0.43,1.11,0.522,0.33,-0.5,0.2,3.95,1.9,79.580606,8.8,0,0,0.002118
3,194,7.5,1.5,1.85,0.605,4.16,3.4,1.26,13.6,6.74,315.673985,20.2,0,0,0.455162
4,230,5.8,0.89,1.77,0.568,1.28,1.7,0.41,9.38,7.53,342.484546,13.2,0,0,0.06813


In [302]:
# Checking for who was predicted as MVP by the model
MVP_2019_log[MVP_2019_log['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False).head(38)

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
225,7150,15.2,4.96,3.13,0.616,7.51,9.9,0.74,36.13,6.64,1219.545755,30.6,1,1,0.999643
179,6065,14.4,3.72,3.22,0.644,5.89,7.6,1.53,27.69,12.47,835.939756,30.9,1,1,0.998346
304,9402,11.5,2.88,1.99,0.631,5.86,4.3,1.08,25.99,6.37,885.110945,24.2,1,1,0.984991
388,12329,11.8,3.1,2.85,0.589,7.25,7.3,0.69,20.05,10.81,753.032195,26.3,1,1,0.984024
87,3452,12.1,2.65,1.85,0.588,6.89,5.4,0.43,25.84,4.64,895.758171,23.7,1,1,0.984008
433,14232,6.8,4.45,3.36,0.501,10.74,5.6,0.45,22.95,11.05,738.546898,21.1,1,1,0.983903
330,10171,7.2,3.58,1.71,0.588,8.25,4.9,0.6,27.36,8.45,316.465249,25.6,0,1,0.980628
258,8247,8.7,3.53,3.3,0.593,3.66,3.3,1.91,27.52,13.61,594.81922,26.1,1,1,0.979722
407,12714,11.9,2.66,2.78,0.583,4.13,5.3,0.44,28.04,8.16,803.387306,23.3,1,1,0.969254
26,822,9.5,2.0,2.36,0.597,3.89,4.9,2.41,25.93,12.0,256.23628,30.3,0,1,0.968562


# 2019 MVPs as predicted by Logistic Regression with SMOTE


# Elite Candidates where P => 0.9 
1. James Harden (0.999)
2. Giannis Antetokounmpo (0.998)
3. Kevin Durant (0.985)
4. Nikola Jokic (0.984)
5. Damian Lillard (0.984)
6. Russell Westbrook (0.983)
7. LeBron James (0.980)
8. Joel Embiid (0.979)
9. Paul George (0.969)
10. Anthony Davis (0.968)
11. Nikola Vucevic (0.950)
12. Karl-Anthony Towns (0.947)
13. Rudy Gobert (0.943)
14. Ben Simmons (0.926)
15. Kawhi Leonard (0.920)
16. Stephen Curry (0.919)
17. Blake Griffin (0.094)

# Mid-tier Candidates  (0.6 <= P <= 0.89)
18. Kemba Walker (0.895)
19. Kyrie Irving (0.894)
20. Andre Drummond (0.876)
21. LaMarcus Aldridge (0.852)
22. Luka Doncic (0.817)
23. Bradley Beal (0.800)
24. Jrue Holiday (0.790)
25. Clint Capela (0.771)
26. D'Angelo Russell (0.766)
27. DeMar DeRozen (0.761)
28. Trae Young (0.739)
29. Mike Conley (0.715)
30. Eric Bledsoe (0.635)
31. Kyle Lowry (0.630)
32. Lou Williams (0.628)
33. Devin Booker (0.620)
34. Chris Paul (0.602)

# Pleb-tier Candidates (0.6 < P)
35. Jusuf Nurkic (0.593)
36. De'Aaron Fox (0.561)
37. John Wall (0.554) [What the hell though, guy breaks his leg in his own bathroom and missed almost the whole season]
38. Donovan Mitchell (0.506)

# Part 2b. Testing for 2018 MVPs using actual winners as a benchmark

In [297]:
testing_2018 = stats[stats['Year'] == 2018]
test_2018_X = testing_2018[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2018_y = testing_2018[['MVP']]

In [298]:
predictions_log1_2018 = logt1.predict(test_2018_X)
predictions_proba_log1_2018 = logt1.predict_proba(test_2018_X)
print(confusion_matrix(test_2018_y, predictions_log1_2018))
print(classification_report(test_2018_y, predictions_log1_2018))
print('AUC Score', roc_auc_score(test_2018_y, predictions_proba_log1_2018[:,1]))

[[506  18]
 [  0  13]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       524
           1       0.42      1.00      0.59        13

   micro avg       0.97      0.97      0.97       537
   macro avg       0.71      0.98      0.79       537
weighted avg       0.99      0.97      0.97       537

AUC Score 0.9951556077510275


In [299]:
probability_log_2018 = pd.DataFrame(predictions_proba_log1_2018)
probability_log_2018 = probability_log_2018.loc[:,1:]
probability_log_2018.columns = ['Predicted MVP Probability']

predictions_log_2018 = pd.DataFrame(predictions_log1_2018)
predictions_log_2018.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2018_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_log_2018 = test_2018_X.copy(deep=True)
predictors_log_2018.reset_index(inplace=True)
predictors_log_2018.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [300]:
MVP_2018_log = pd.concat([predictors_log_2018, real_mvp, predictions_log_2018, probability_log_2018], axis=1)
MVP_2018_log.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,39,0.1,0.34,0.88,0.508,0.63,-0.1,0.0,2.34,0.53,14.371683,9.8,0,0,0.001206
1,43,2.9,1.84,1.91,0.53,2.34,1.0,0.78,17.62,7.88,173.70639,16.5,0,0,0.16741
2,54,-0.1,0.33,3.0,0.392,1.22,-0.1,0.22,6.67,2.67,1.981921,5.1,0,0,0.000835
3,56,0.0,1.0,4.0,0.405,1.0,-0.1,0.0,8.0,3.0,0.079186,2.4,0,0,0.000613
4,88,-0.1,0.71,0.94,0.439,0.54,-0.5,0.21,3.04,1.48,59.870854,5.1,0,0,0.002213


In [313]:
# Checking for who was predicted as MVP by the model
MVP_2018_log[MVP_2018_log['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False)

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
324,10170,14.0,4.23,1.66,0.621,9.11,8.9,0.87,27.45,8.65,996.058333,28.6,1,1,0.999538
217,7149,15.4,4.38,2.35,0.619,8.75,8.3,0.69,30.43,5.4,1094.98564,29.8,1,1,0.999495
444,14231,10.1,4.76,2.5,0.524,10.25,7.5,0.25,25.35,10.05,969.438049,24.7,1,1,0.998079
29,821,13.7,2.16,2.12,0.612,2.32,4.9,2.57,28.13,11.09,748.262195,28.9,1,1,0.995179
86,3451,12.6,2.82,1.6,0.594,6.59,5.9,0.37,26.88,4.45,742.500137,25.2,1,1,0.989263
173,6064,11.9,2.97,3.08,0.598,4.81,5.4,1.41,26.85,10.04,720.929268,27.3,1,1,0.986206
299,9401,10.4,3.04,1.96,0.64,5.38,4.5,1.75,26.35,6.82,708.236585,26.0,1,1,0.983938
39,1248,9.2,3.43,2.6,0.557,8.16,4.6,0.86,15.79,8.14,651.956817,20.0,0,1,0.949986
289,9029,14.0,1.94,3.48,0.646,2.43,5.5,1.4,21.26,12.34,654.300708,24.9,0,1,0.94754
396,12328,10.7,2.8,2.83,0.603,6.11,5.6,0.81,18.47,10.71,518.206479,24.4,0,1,0.947365


# 2018 MVPs as predicted by Logistic Regression with SMOTE


# Elite Candidates where P => 0.9 
1. LeBron James (0.999)
2. James Harden (0.999)
3. Rusell Westbrook (0.998)
4. Anthony Davis (0.995)
5. Damian Lillard (0.989)
6. Giannis Antetokounmpo (0.986)
7. Kevin Durant (0.984)
8. Ben Simmons (0.949)
9. Karl-Anthony Towns (0.947)
10. Nikola Jokic (0.940)
11. DeMarcus Cousins (0.925)
12. Stephen Curry (0.908)


# Mid-tier Candidates  (0.6 <= P <= 0.89)
13. LaMarcus Aldrige (0.895)
14. DeMar DeRozan (0.892)
15. Joel Embiid (0.875)
16. Chris Paul (0.866)
17. Victor Oladipo (0.856)
18. Kemba Walker (0.850)
19. Jimmy Butler (0.838)
18. Kyrie Irving (0.820)
19. Lou Williams (0.811)
20. Kyle Lowry (0.809)
21. John Wall (0.790)
22. Bradley Beal (0.738)
23. Paul George (0.717)
24. Jrue Holiday (0.683)
25. Clint Capela (0.644)
26. Draymond Green (0.623)
27. Dwight Howard (0.618) [What the hell lol]


# Pleb-tier Candidates (0.6 < P)
28. Al Horford (0.573)

# Part 3. Balanced Bagging Classifer with Logistic Regression Base

In [346]:
bbc1 = BalancedBaggingClassifier(base_estimator=LogisticRegression(penalty='l2', C=0.2), bootstrap=False,
                                 max_features=9, max_samples=900, n_estimators=11, replacement=True,
                                 random_state=0, sampling_strategy=0.51)

# Fitting the model
bbc1 = bbc1.fit(X1_train, y1_train)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(bbc1, X1_train, y1_train, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(bbc1, X1_train, y1_train, cv=5)))

# Constructing the confusion matrix
predictions_bbc1 = bbc1.predict(X1_test)
predictions_proba_bbc1 = bbc1.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_bbc1))
print(classification_report(y1_test, predictions_bbc1))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_bbc1[:,1]))



Cross-Validation Scores: [0.89180328 0.89540816 0.9274517  0.89281808 0.88844331]
Mean Cross-Validation Score: 0.8991849059627676
[[3000  365]
 [  13   52]]
              precision    recall  f1-score   support

           0       1.00      0.89      0.94      3365
           1       0.12      0.80      0.22        65

   micro avg       0.89      0.89      0.89      3430
   macro avg       0.56      0.85      0.58      3430
weighted avg       0.98      0.89      0.93      3430

AUC Score 0.9301863070065151




# Part 3a. Testing for NBA's 2019 MVPs using Basketball-reference.com's list as a benchmark

In [347]:
testing_2019 = stats[stats['Year'] == 2019]
test_2019_X = testing_2019[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2019_y = testing_2019[['MVP']]

In [348]:
predictions_bbc1_2019 = bbc1.predict(test_2019_X)
predictions_proba_bbc1_2019 = bbc1.predict_proba(test_2019_X)
print(confusion_matrix(test_2019_y, predictions_bbc1_2019))
print(classification_report(test_2019_y, predictions_bbc1_2019))
print('AUC Score', roc_auc_score(test_2019_y, predictions_proba_bbc1_2019[:,1]))

[[478  42]
 [  0  10]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       520
           1       0.19      1.00      0.32        10

   micro avg       0.92      0.92      0.92       530
   macro avg       0.60      0.96      0.64       530
weighted avg       0.98      0.92      0.95       530

AUC Score 0.9876923076923076


In [349]:
probability_bbc_2019 = pd.DataFrame(predictions_proba_bbc1_2019)
probability_bbc_2019 = probability_bbc_2019.loc[:,1:]
probability_bbc_2019.columns = ['Predicted MVP Probability']

predictions_bbc_2019 = pd.DataFrame(predictions_bbc1_2019)
predictions_bbc_2019.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2019_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_bbc_2019 = test_2019_X.copy(deep=True)
predictors_bbc_2019.reset_index(inplace=True)
predictors_bbc_2019.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [350]:
MVP_2019_bbc = pd.concat([predictors_bbc_2019, real_mvp, predictions_bbc_2019, probability_bbc_2019], axis=1)
MVP_2019_bbc.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,44,5.1,2.08,2.21,0.538,3.71,2.0,0.72,15.97,7.36,477.745006,15.1,0,1,0.564416
1,55,0.9,0.8,1.42,0.518,1.74,-0.1,0.26,5.88,1.34,86.264634,11.9,0,0,0.060062
2,89,0.9,0.43,1.11,0.522,0.33,-0.5,0.2,3.95,1.9,79.580606,8.8,0,0,0.078163
3,194,7.5,1.5,1.85,0.605,4.16,3.4,1.26,13.6,6.74,315.673985,20.2,0,0,0.400248
4,230,5.8,0.89,1.77,0.568,1.28,1.7,0.41,9.38,7.53,342.484546,13.2,0,0,0.415323


In [359]:
# Checking for who was predicted as MVP by the model
MVP_2019_bbc[MVP_2019_bbc['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False).head(20)

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
225,7150,15.2,4.96,3.13,0.616,7.51,9.9,0.74,36.13,6.64,1219.545755,30.6,1,1,0.987668
87,3452,12.1,2.65,1.85,0.588,6.89,5.4,0.43,25.84,4.64,895.758171,23.7,1,1,0.960631
136,4798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030401,-38.1,0,1,0.941156
179,6065,14.4,3.72,3.22,0.644,5.89,7.6,1.53,27.69,12.47,835.939756,30.9,1,1,0.930541
407,12714,11.9,2.66,2.78,0.583,4.13,5.3,0.44,28.04,8.16,803.387306,23.3,1,1,0.930245
304,9402,11.5,2.88,1.99,0.631,5.86,4.3,1.08,25.99,6.37,885.110945,24.2,1,1,0.922092
236,7531,-0.1,0.43,0.57,0.1155,0.715,-0.05,0.0,0.43,0.855,0.299832,-18.7,0,1,0.894184
388,12329,11.8,3.1,2.85,0.589,7.25,7.3,0.69,20.05,10.81,753.032195,26.3,1,1,0.878897
465,15481,-0.1,0.5,0.5,0.225,0.0,-0.1,0.0,1.0,2.0,0.063557,-15.8,0,1,0.864574
454,15101,9.7,2.78,2.41,0.641,5.23,4.9,0.36,27.26,5.35,708.08389,24.4,1,1,0.856034


# 2019 MVPs as predicted by Balanced Bagging Classifier


# Elite Candidates where P => 0.9 
1. James Harden (0.987)
2. Damian Lillard (0.961)
3. Donte Grantham (0.941) [Negative Statistics and stats inflated by low attempts]
4. Giannis Antetokounmpo (0.931)
5. Paul George (0.930)
6. Kevin Durant (0.922)

# Mid-tier Candidates  (0.6 <= P <= 0.89)
7. Jawun Evans (0.894) [Negative Statistics and stats inflated by low attempts]
8. Nikola Jokic (0.878)
9. Terrance Jones (0.865) [Negative Statistics and stats inflated by low attempts]
10. Stephen Curry (0.856)
11. Andre Ingram (0.856) [Negative Statistics and stats inflated by low attempts]
12. Kemba Walker (0.834)
13. Russell Westbrook (0.827)
14. Blake Griffin (0.817)
15. Zach Lofton (0.811) [Negative Statistics and stats inflated by low attempts]

# Pleb-tier Candidates (0.6 < P)
Model is not performing good enough for me to continue


In summary, BBC approach does not produce good results. It is unable to differentiate between positive and negative player stats. The increase in number of false negatives likely attributed to the aforementioned weakness of BBC.

Moving forward, I should probably try to remove these outliers in the dataset during the cleaning process and try BBC again.

# Part 3b. Testing for 2018 MVPs using actual winners as a benchmark

In [371]:
testing_2018 = stats[stats['Year'] == 2018]
test_2018_X = testing_2018[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2018_y = testing_2018[['MVP']]

In [372]:
predictions_bbc1_2018 = bbc1.predict(test_2018_X)
predictions_proba_bbc1_2018 = bbc1.predict_proba(test_2018_X)
print(confusion_matrix(test_2018_y, predictions_bbc1_2018))
print(classification_report(test_2018_y, predictions_bbc1_2018))
print('AUC Score', roc_auc_score(test_2018_y, predictions_proba_bbc1_2018[:,1]))

[[483  41]
 [  1  12]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       524
           1       0.23      0.92      0.36        13

   micro avg       0.92      0.92      0.92       537
   macro avg       0.61      0.92      0.66       537
weighted avg       0.98      0.92      0.94       537

AUC Score 0.9743100411039343


In [373]:
probability_bbc_2018 = pd.DataFrame(predictions_proba_bbc1_2018)
probability_bbc_2018 = probability_bbc_2018.loc[:,1:]
probability_bbc_2018.columns = ['Predicted MVP Probability']

predictions_bbc_2018 = pd.DataFrame(predictions_bbc1_2018)
predictions_bbc_2018.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2018_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_bbc_2018 = test_2018_X.copy(deep=True)
predictors_bbc_2018.reset_index(inplace=True)
predictors_bbc_2018.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [374]:
MVP_2018_bbc = pd.concat([predictors_bbc_2018, real_mvp, predictions_bbc_2018, probability_bbc_2018], axis=1)
MVP_2018_bbc.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,39,0.1,0.34,0.88,0.508,0.63,-0.1,0.0,2.34,0.53,14.371683,9.8,0,0,0.057577
1,43,2.9,1.84,1.91,0.53,2.34,1.0,0.78,17.62,7.88,173.70639,16.5,0,0,0.198948
2,54,-0.1,0.33,3.0,0.392,1.22,-0.1,0.22,6.67,2.67,1.981921,5.1,0,0,0.116255
3,56,0.0,1.0,4.0,0.405,1.0,-0.1,0.0,8.0,3.0,0.079186,2.4,0,0,0.196672
4,88,-0.1,0.71,0.94,0.439,0.54,-0.5,0.21,3.04,1.48,59.870854,5.1,0,0,0.130965


In [380]:
# Checking for who was predicted as MVP by the model
MVP_2018_bbc[MVP_2018_bbc['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False).head(10)

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
324,10170,14.0,4.23,1.66,0.621,9.11,8.9,0.87,27.45,8.65,996.058333,28.6,1,1,0.989801
217,7149,15.4,4.38,2.35,0.619,8.75,8.3,0.69,30.43,5.4,1094.98564,29.8,1,1,0.988864
444,14231,10.1,4.76,2.5,0.524,10.25,7.5,0.25,25.35,10.05,969.438049,24.7,1,1,0.960055
86,3451,12.6,2.82,1.6,0.594,6.59,5.9,0.37,26.88,4.45,742.500137,25.2,1,1,0.956113
379,11951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00644,-41.1,0,1,0.941822
71,2703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006469,-28.5,0,1,0.935312
331,10485,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026317,-15.5,0,1,0.88693
313,9823,10.2,2.35,2.46,0.598,6.88,5.0,0.24,16.24,5.56,636.832149,19.5,0,1,0.879561
29,821,13.7,2.16,2.12,0.612,2.32,4.9,2.57,28.13,11.09,748.262195,28.9,1,1,0.865721
275,8812,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.005561,-12.5,0,1,0.855061


In [377]:
# In addition, checking who got left out by the model
# Joel Embiid got left out for some reason
MVP_2018_bbc[(MVP_2018_bbc['Predicted MVP'] == 0) & (MVP_2018_bbc['Real MVP'] == 1)]

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
251,8246,6.2,3.71,3.32,0.573,3.16,2.2,1.76,22.94,10.95,531.524341,22.9,1,0,0.334877


# 2018 MVPs as predicted by Logistic Regression with SMOTE


# Elite Candidates where P => 0.9 
1. LeBron James (0.989)
2. James Harden (0.988)
3. Russell Westbrook (0.960)
4. Damiam Lillard (0.956)
5. Mindaugas Kuzminskas (0.942) [Negative Statistics and stats inflated by low attempts]
6. Chris Boucher (0.935) [Negative Statistics and stats inflated by low attempts]


# Mid-tier Candidates  (0.6 <= P <= 0.89)
7. Luis Montero (0.887) [Negative Statistics and stats inflated by low attempts]
8. Kyle Lowry (0.880)
9. Anthony Davis (0.865)
10. Josh McRoberts (0.855) [Negative Statistics and stats inflated by low attempts]



# Pleb-tier Candidates (0.6 < P)



# Left out by model
Joel Embiid


Taking a deeper look at the stats of the mispredicted players, most of them either have highly negative stats or have alot of 0s in their stats record cause of lack of playing time/attempts. BBC seems to attach undue importance to negative numbers or zeroes.

# Part 4. Balanced Bagging Classifer with XGBoost

In [413]:
bbc_xgb1 = BalancedBaggingClassifier(base_estimator=XGBClassifier(learning_rate=0.09, n_estimators=500, max_depth=2, min_child_weight=6,
                                                objective='binary:logistic', subsample=0.5, colsample_by_tree = 0.6,
                                                nthread=4, scale_post_weight=1, seed=27, gamma=0, reg_alpha=0.1), 
                                sampling_strategy='auto', replacement=True, random_state=0)


bbc_xgb1.fit(X1_train, y1_train)

predictions_bbc_xgb1 = bbc_xgb1.predict(X1_test)
predict_proba_bbc_xgb1 = bbc_xgb1.predict_proba(X1_test)

print("Accuracy : %.4g" % accuracy_score(y1_test, predictions_bbc_xgb1))
print("AUC Score : %f" % roc_auc_score(y1_test, predict_proba_bbc_xgb1[:,1]))


print(confusion_matrix(y1_test, predictions_bbc_xgb1))
print(classification_report(y1_test, predictions_bbc_xgb1))
print('AUC Score', roc_auc_score(y1_test, predict_proba_bbc_xgb1[:,1]))

Accuracy : 0.9169
AUC Score : 0.972050
[[3076  279]
 [   6   69]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      3355
           1       0.20      0.92      0.33        75

   micro avg       0.92      0.92      0.92      3430
   macro avg       0.60      0.92      0.64      3430
weighted avg       0.98      0.92      0.94      3430

AUC Score 0.9720496770988575


# Part 4a. Testing for NBA's 2019 MVPs using Basketball-reference.com's list as a benchmark

In [414]:
testing_2019 = stats[stats['Year'] == 2019]
test_2019_X = testing_2019[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2019_y = testing_2019[['MVP']]

In [415]:
predictions_bbc_xgb_2019 = bbc_xgb1.predict(test_2019_X)
predictions_proba_bbc_xgb_2019 = bbc_xgb1.predict_proba(test_2019_X)
print(confusion_matrix(test_2019_y, predictions_bbc_xgb_2019))
print(classification_report(test_2019_y, predictions_bbc_xgb_2019))
print('AUC Score', roc_auc_score(test_2019_y, predictions_proba_bbc_xgb_2019[:,1]))

[[487  33]
 [  0  10]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       520
           1       0.23      1.00      0.38        10

   micro avg       0.94      0.94      0.94       530
   macro avg       0.62      0.97      0.67       530
weighted avg       0.99      0.94      0.96       530

AUC Score 0.9959615384615383


In [416]:
probability_bbc_xgb_2019 = pd.DataFrame(predictions_proba_bbc_xgb_2019)
probability_bbc_xgb_2019 = probability_bbc_xgb_2019.loc[:,1:]
probability_bbc_xgb_2019.columns = ['Predicted MVP Probability']

predictions_bbc_xgb_2019 = pd.DataFrame(predictions_bbc_xgb_2019)
predictions_bbc_xgb_2019.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2019_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_bbc_xgb_2019 = test_2019_X.copy(deep=True)
predictors_bbc_xgb_2019.reset_index(inplace=True)
predictors_bbc_xgb_2019.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [418]:
MVP_2019_bbc_xgb = pd.concat([predictors_bbc_xgb_2019, real_mvp, predictions_bbc_xgb_2019, probability_bbc_xgb_2019], axis=1)
MVP_2019_bbc_xgb.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,44,5.1,2.08,2.21,0.538,3.71,2.0,0.72,15.97,7.36,477.745006,15.1,0,0,0.279497
1,55,0.9,0.8,1.42,0.518,1.74,-0.1,0.26,5.88,1.34,86.264634,11.9,0,0,0.006903
2,89,0.9,0.43,1.11,0.522,0.33,-0.5,0.2,3.95,1.9,79.580606,8.8,0,0,0.007339
3,194,7.5,1.5,1.85,0.605,4.16,3.4,1.26,13.6,6.74,315.673985,20.2,0,1,0.573858
4,230,5.8,0.89,1.77,0.568,1.28,1.7,0.41,9.38,7.53,342.484546,13.2,0,0,0.050289


In [443]:
# Checking for who was predicted as MVP by the model
MVP_2019_bbc_xgb[MVP_2019_bbc_xgb['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False).head(43)

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
433,14232,6.8,4.45,3.36,0.501,10.74,5.6,0.45,22.95,11.05,738.546898,21.1,1,1,0.990738
304,9402,11.5,2.88,1.99,0.631,5.86,4.3,1.08,25.99,6.37,885.110945,24.2,1,1,0.989396
179,6065,14.4,3.72,3.22,0.644,5.89,7.6,1.53,27.69,12.47,835.939756,30.9,1,1,0.986852
388,12329,11.8,3.1,2.85,0.589,7.25,7.3,0.69,20.05,10.81,753.032195,26.3,1,1,0.98228
454,15101,9.7,2.78,2.41,0.641,5.23,4.9,0.36,27.26,5.35,708.08389,24.4,1,1,0.981205
225,7150,15.2,4.96,3.13,0.616,7.51,9.9,0.74,36.13,6.64,1219.545755,30.6,1,1,0.978103
87,3452,12.1,2.65,1.85,0.588,6.89,5.4,0.43,25.84,4.64,895.758171,23.7,1,1,0.974
299,9160,7.4,2.57,1.6,0.558,5.9,3.9,0.41,25.63,4.4,732.749063,21.7,0,1,0.966839
390,12348,10.1,1.99,1.96,0.573,3.84,5.3,1.11,20.81,12.0,599.95122,25.5,0,1,0.963551
258,8247,8.7,3.53,3.3,0.593,3.66,3.3,1.91,27.52,13.61,594.81922,26.1,1,1,0.954142


# 2019 MVPs as predicted by Balanced Bagging Classifier with XGBoost


# Elite Candidates where P => 0.9 
1. Russell Westbrook (0.990)
2. Kevin Durant (0.989)
3. Giannis Antetokounmpo (0.986)
4. Nikola Jokic (0.982)
5. Stephen Curry (0.981)
6. James Harden (0.978)
7. Damian Lillard (0.974)
8. Kemba Walker (0.966)
9. Nikola Vucevic (0.963)
10. Joel Embiid (0.954)
11. Karl-Anthony Towns (0.953)
12. Kyrie Irving (0.952)
13. LeBron James (0.950)
14. Paul George (0.944)
15. LaMarcus Aldridge (0.942)
16. Jrue Holiday (0.939)
17. D'Angelo Russell (0.933)
18. Anthony Davis (0.932)
19. Bradley Beal (0.924)
20. DeMar DeRozan (0.920)
21. Blake Griffin (0.920)
22. Ben Simmons (0.917)
23. Andre Drummond (0.910)
24. Kawhi Leonard (0.901)


# Mid-tier Candidates  (0.6 <= P <= 0.89)
25. Eric Bledsoe (0.899)
26. Mike Conley (0.857)
27. Julius Randle (0.855)
28. Lou Williams (0.843)
29. Luka Doncic (0.839)
30. Pascal Siakam (0.783)
31. Khris Middleton (0.776)
32. Chris Paul (0.740)
33. Jusuf Nurkic (0.733)
34. Motrezl Harrell (0.704)
35. Clint Capela (0.703)
36. John Wall (0.682)
37. Danilo Gallinari (0.654)
38. De'Aaron Fox (0.642)
39. Rudy Gobert (0.636)
40. John Collins (0.622)

# Pleb-tier Candidates (0.6 < P)
41. Victor Oladipo (0.577)
42. Al Horford (0.573)
43. Jamal Murray (0.535)

# Part 4b. Testing for 2018 MVPs using actual winners as a benchmark

In [468]:
testing_2018 = stats[stats['Year'] == 2018]
test_2018_X = testing_2018[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
test_2018_y = testing_2018[['MVP']]

In [469]:
predictions_bbc_xgb_2018 = bbc_xgb1.predict(test_2018_X)
predictions_proba_bbc_xgb_2018 = bbc_xgb1.predict_proba(test_2018_X)
print(confusion_matrix(test_2018_y, predictions_bbc_xgb_2018))
print(classification_report(test_2018_y, predictions_bbc_xgb_2018))
print('AUC Score', roc_auc_score(test_2018_y, predictions_proba_bbc_xgb_2018[:,1]))

[[501  23]
 [  0  13]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       524
           1       0.36      1.00      0.53        13

   micro avg       0.96      0.96      0.96       537
   macro avg       0.68      0.98      0.75       537
weighted avg       0.98      0.96      0.97       537

AUC Score 0.9975044039929536


In [470]:
probability_bbc_xgb_2018 = pd.DataFrame(predictions_proba_bbc_xgb_2018)
probability_bbc_xgb_2018 = probability_bbc_xgb_2018.loc[:,1:]
probability_bbc_xgb_2018.columns = ['Predicted MVP Probability']

predictions_bbc_xgb_2018 = pd.DataFrame(predictions_bbc_xgb_2018)
predictions_bbc_xgb_2018.columns = ['Predicted MVP']

real_mvp = pd.DataFrame(test_2018_y)
real_mvp.columns = ['Real MVP']
real_mvp.reset_index(inplace=True)
real_mvp = real_mvp[['Real MVP']]

predictors_bbc_xgb_2018 = test_2018_X.copy(deep=True)
predictors_bbc_xgb_2018.reset_index(inplace=True)
predictors_bbc_xgb_2018.columns = ['Player', 'WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']

In [471]:
MVP_2018_bbc_xgb = pd.concat([predictors_bbc_xgb_2018, real_mvp, predictions_bbc_xgb_2018, probability_bbc_xgb_2018], axis=1)
MVP_2018_bbc_xgb.head()

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
0,39,0.1,0.34,0.88,0.508,0.63,-0.1,0.0,2.34,0.53,14.371683,9.8,0,0,0.007753
1,43,2.9,1.84,1.91,0.53,2.34,1.0,0.78,17.62,7.88,173.70639,16.5,0,0,0.164629
2,54,-0.1,0.33,3.0,0.392,1.22,-0.1,0.22,6.67,2.67,1.981921,5.1,0,0,0.005795
3,56,0.0,1.0,4.0,0.405,1.0,-0.1,0.0,8.0,3.0,0.079186,2.4,0,0,0.006538
4,88,-0.1,0.71,0.94,0.439,0.54,-0.5,0.21,3.04,1.48,59.870854,5.1,0,0,0.007451


In [506]:
# Checking for who was predicted as MVP by the model
MVP_2018_bbc_xgb[MVP_2018_bbc_xgb['Predicted MVP'] == 1].sort_values(by=['Predicted MVP Probability'], ascending=False).head(36)

Unnamed: 0,Player,WS,TOV/G,PF/G,TS%,AST/G,VORP,BLK/G,PPG,TRB/G,Impact,PER,Real MVP,Predicted MVP,Predicted MVP Probability
444,14231,10.1,4.76,2.5,0.524,10.25,7.5,0.25,25.35,10.05,969.438049,24.7,1,1,0.992021
324,10170,14.0,4.23,1.66,0.621,9.11,8.9,0.87,27.45,8.65,996.058333,28.6,1,1,0.99123
299,9401,10.4,3.04,1.96,0.64,5.38,4.5,1.75,26.35,6.82,708.236585,26.0,1,1,0.98981
217,7149,15.4,4.38,2.35,0.619,8.75,8.3,0.69,30.43,5.4,1094.98564,29.8,1,1,0.988605
173,6064,11.9,2.97,3.08,0.598,4.81,5.4,1.41,26.85,10.04,720.929268,27.3,1,1,0.986134
86,3451,12.6,2.82,1.6,0.594,6.59,5.9,0.37,26.88,4.45,742.500137,25.2,1,1,0.980997
513,16542,8.2,2.92,2.33,0.577,4.31,4.5,0.76,23.13,5.2,702.578049,23.1,1,1,0.976609
29,821,13.7,2.16,2.12,0.612,2.32,4.9,2.57,28.13,11.09,748.262195,28.9,1,1,0.971383
464,15100,9.1,3.0,2.24,0.675,6.08,4.4,0.16,26.39,5.12,379.978247,28.2,1,1,0.970744
396,12328,10.7,2.8,2.83,0.603,6.11,5.6,0.81,18.47,10.71,518.206479,24.4,0,1,0.968242


# 2018 MVPs as predicted by Logistic Regression with SMOTE


# Elite Candidates where P => 0.9 
1. Russell Westbrook (0.992)
2. LeBron James (0.991)
3. Kevin Durant (0.989)
4. James Harden (0.988)
5. Giannis Antetokounmpo (0.986)
6. Damian Lillard (0.980)
7. Victor Oladipo (0.976)
8. Anthony Davis (0.971)
9. Stephen Curry (0.970)
10. Nikola Jokic (0.968)
11. DeMar DeRozan (0.962)
12. Kemba Walker (0.955)
13. Karl-Anthony Towns (0.953)
14. Lou Williams (0.947)
15. Joel Embiid (0.939)
16. LaMarcus Aldridge (0.936)
17. Chris Paul (0.932)
18. Bradley Beal (0.930)
19. Ben Simmons (0.925)
20. Kyle Lowry (0.919)
21. Jimmy Butler (0.909)
22. Kyrie Irving (0.904)


# Mid-tier Candidates  (0.6 <= P <= 0.89)
23. Paul George (0.836)
24. Jrue Holiday (0.833)
25. Andre Drummond (0.791)
26. DeMarcus Cousins (0.787)
27. Khris Middleton (0.744)
28. Tyreke Evans (0.734)
29. John Wall (0.729)
30. Dwight Howard (0.665)


# Pleb-tier Candidates (0.6 < P)
31. Jusuf Nurkic (0.562)
32. CJ McCollum (0.550)
33. Will Barton (0.540)
34. Blake Griffin (0.529)
35. Steven Adams (0.528)
36. Julius Randle (0.513)
