In [684]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, SCORERS
import matplotlib as plt
%matplotlib inline
from matplotlib import rcParams
import pandas as pd
import numpy as np
import pprint as pp
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

In [427]:
# Loading in the dataframe, then removing the useless column
stats = pd.read_csv('stats_1.6.csv')
stats.drop(labels='Unnamed: 0', axis=1, inplace=True)
stats.head()

Unnamed: 0,Player,MVP,Year,Age,TmWin,G,GS,MP,PER,TS%,...,ORB/G,DRB/G,TRB/G,AST/G,STL/G,BLK/G,TOV/G,PF/G,PPG,Impact
0,A.C. Green,0,1986,22.0,62.0,82.0,1.0,1542.0,11.8,0.564,...,1.95,2.7,4.65,0.66,0.6,0.6,1.21,2.79,6.35,292.78725
1,A.C. Green,0,1987,23.0,65.0,79.0,72.0,2240.0,15.7,0.599,...,2.66,5.13,7.78,1.06,0.89,1.01,1.29,2.16,10.78,429.586585
2,A.C. Green,0,1988,24.0,62.0,82.0,64.0,2636.0,14.5,0.581,...,2.99,5.67,8.66,1.13,1.06,0.55,1.46,2.49,11.43,500.5105
3,A.C. Green,0,1989,25.0,57.0,82.0,82.0,2510.0,17.8,0.594,...,3.15,5.87,9.01,1.26,1.15,0.67,1.45,2.1,13.27,506.70625
4,A.C. Green,0,1990,26.0,63.0,82.0,82.0,2709.0,14.7,0.548,...,3.2,5.49,8.68,1.1,0.8,0.61,1.41,2.52,12.94,608.001188


In [428]:
# Preparing the predictors for Logistic Regression
# Naming the predictors and target variables X1, y1
X1 = stats[['WS', 'TOV/G', 'PF/G', 'TS%', 'AST/G', 'VORP', 'BLK/G', 'PPG', 'TRB/G', 'Impact', 'PER']]
y1 = stats['MVP']

In [429]:
# Scaling the predictors using StandardScaler()
ss = StandardScaler()
ss.fit_transform(X1)

# Creating a training and testing set
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2)

Part 1a. Testing with base Random Forest Classifier

In [432]:
rfc = RandomForestClassifier()

# Fitting the model
rfc1 = rfc.fit(X1_train, y1_train)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(rfc1, X1_train, y1_train, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(rfc1, X1_train, y1_train, cv=5)))

# Constructing the confusion matrix
predictions_rfc1 = rfc1.predict(X1_test)
predictions_proba_rfc1 = rfc1.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_rfc1))
print(classification_report(y1_test, predictions_rfc1))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_rfc1[:,1]))

# Results are pretty awful for predicting the minority class



Cross-Validation Scores: [0.97740525 0.97849854 0.97485423 0.97885527 0.9795844 ]
Mean Cross-Validation Score: 0.9786410731158772
[[3344   15]
 [  54   17]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3359
           1       0.53      0.24      0.33        71

   micro avg       0.98      0.98      0.98      3430
   macro avg       0.76      0.62      0.66      3430
weighted avg       0.97      0.98      0.98      3430

AUC Score 0.9026516946274251


Part 1b. Testing with some manually inputed parameters for Random Forest Classifier

In [435]:
rfc2 = RandomForestClassifier(n_estimators=10, criterion='entropy', max_features=2, n_jobs=-1, random_state=0)

# Fitting the model
rfct2 = rfc2.fit(X1_train, y1_train)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(rfct2, X1_train, y1_train, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(rfct2, X1_train, y1_train, cv=5)))

# Constructing the confusion matrix
predictions_rfct2 = rfct2.predict(X1_test)
predictions_proba_rfct2 = rfct2.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_rfct2))
print(classification_report(y1_test, predictions_rfct2))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_rfct2[:,1]))

# Results are still pretty awful for predicting the minority class
# Tested both 'gini'/'entropy' entropy can detect 1 more minority class

Cross-Validation Scores: [0.9803207  0.97849854 0.97886297 0.98031353 0.9795844 ]
Mean Cross-Validation Score: 0.9795160275453341
[[3348   11]
 [  53   18]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3359
           1       0.62      0.25      0.36        71

   micro avg       0.98      0.98      0.98      3430
   macro avg       0.80      0.63      0.68      3430
weighted avg       0.98      0.98      0.98      3430

AUC Score 0.912314614091216


Part 1c. Trying SMOTE with Random Forest Classifier

In [504]:
# Oversampling the data with SMOTE
sm = SMOTE(sampling_strategy=0.6, random_state=7, k_neighbors=9)
smote_X1, smote_y1 = sm.fit_sample(X1_train, y1_train)

In [505]:
rfc3 = RandomForestClassifier(n_estimators=10, criterion='entropy', max_features=2, n_jobs=-1, random_state=0)

# Fitting the model
rfct3 = rfc3.fit(smote_X1, smote_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(rfct3, smote_X1, smote_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(rfct3, smote_X1, smote_y1, cv=5)))

# Constructing the confusion matrix
predictions_rfct3 = rfct3.predict(X1_test)
predictions_proba_rfct3 = rfct3.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_rfct3))
print(classification_report(y1_test, predictions_rfct3))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_rfct3[:,1]))

Cross-Validation Scores: [0.96731263 0.97104833 0.96777959 0.96870621 0.96986685]
Mean Cross-Validation Score: 0.9689427242549208
[[3281   78]
 [  20   51]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3359
           1       0.40      0.72      0.51        71

   micro avg       0.97      0.97      0.97      3430
   macro avg       0.69      0.85      0.75      3430
weighted avg       0.98      0.97      0.98      3430

AUC Score 0.9532095819932995


In [509]:
# Lets try to optimzie with GridSearch
rfc3_params = {'n_estimators':[3,6,9,10,11], 'criterion':['gini', 'entropy'],
              'max_depth':[1,3,5,7,9,11], 'max_features':[1,3,5,7,9]}

rfc3_gridsearch = GridSearchCV(rfc, rfc3_params, cv=5, verbose=1, scoring='recall', n_jobs=-1)

rfc3_gridsearch.fit(smote_X1, smote_y1)

print(rfc3_gridsearch.best_params_)
print(rfc3_gridsearch.best_score_)
pp.pprint(rfc3_gridsearch.cv_results_)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 313 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 662 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 1012 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done 1462 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  1.4min finished


{'criterion': 'entropy', 'max_depth': 11, 'max_features': 7, 'n_estimators': 11}
0.9865488128648268
{'mean_fit_time': array([0.02792511, 0.0482707 , 0.06343184, 0.0710084 , 0.07340474,
       0.03610487, 0.06343174, 0.08956051, 0.09773893, 0.11449313,
       0.05126333, 0.08397527, 0.11050491, 0.12326994, 0.13543787,
       0.05126305, 0.10013213, 0.13503895, 0.1490015 , 0.16894755,
       0.05944142, 0.11050553, 0.16416311, 0.18191428, 0.20026412,
       0.03470712, 0.06283121, 0.09155459, 0.10172858, 0.10831099,
       0.05904236, 0.11628795, 0.1531899 , 0.17552977, 0.19587545,
       0.08098383, 0.15259223, 0.22460003, 0.25850778, 0.28543577,
       0.10571694, 0.197472  , 0.30498433, 0.34068837, 0.3528553 ,
       0.1308495 , 0.25152688, 0.37499585, 0.40890679, 0.46615334,
       0.03969388, 0.07480001, 0.1140944 , 0.12287145, 0.13304467,
       0.08856297, 0.15658092, 0.22818961, 0.2463407 , 0.27985172,
       0.11688681, 0.22519684, 0.33370624, 0.38297544, 0.40351954,
       0.16

             'max_depth': 5,
             'max_features': 3,
             'n_estimators': 10},
            {'criterion': 'entropy',
             'max_depth': 5,
             'max_features': 3,
             'n_estimators': 11},
            {'criterion': 'entropy',
             'max_depth': 5,
             'max_features': 5,
             'n_estimators': 3},
            {'criterion': 'entropy',
             'max_depth': 5,
             'max_features': 5,
             'n_estimators': 6},
            {'criterion': 'entropy',
             'max_depth': 5,
             'max_features': 5,
             'n_estimators': 9},
            {'criterion': 'entropy',
             'max_depth': 5,
             'max_features': 5,
             'n_estimators': 10},
            {'criterion': 'entropy',
             'max_depth': 5,
             'max_features': 5,
             'n_estimators': 11},
            {'criterion': 'entropy',
             'max_depth': 5,
             'max_features': 7,
             'n_es

In [510]:
rfc4 = RandomForestClassifier(n_estimators=11, criterion='entropy', max_features=7, n_jobs=-1, random_state=0, max_depth=11)

# Fitting the model
rfct4 = rfc4.fit(smote_X1, smote_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(rfct4, smote_X1, smote_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(rfct4, smote_X1, smote_y1, cv=5)))

# Constructing the confusion matrix
predictions_rfct4 = rfct4.predict(X1_test)
predictions_proba_rfct4 = rfct4.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_rfct4))
print(classification_report(y1_test, predictions_rfct4))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_rfct4[:,1]))

Cross-Validation Scores: [0.96100864 0.96848004 0.9603082  0.96707146 0.96566223]
Mean Cross-Validation Score: 0.9645061123473656
[[3225  134]
 [  16   55]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      3359
           1       0.29      0.77      0.42        71

   micro avg       0.96      0.96      0.96      3430
   macro avg       0.64      0.87      0.70      3430
weighted avg       0.98      0.96      0.97      3430

AUC Score 0.9663841938202601


In [511]:
# Lets try to optimzie with GridSearch

rfc5_params = {'n_estimators':[9,10,11,12,13,14,15,16,17,18,19,20],
              'max_depth':[7,9,11,12,13,14,15,16,17,18,19,20]}

rfc5_gridsearch = GridSearchCV(RandomForestClassifier(criterion='entropy', max_features=7, n_jobs=-1,
                                                      random_state=0), rfc5_params, cv=5, verbose=1, scoring='recall', n_jobs=-1)

rfc5_gridsearch.fit(smote_X1, smote_y1)

print(rfc5_gridsearch.best_params_)
print(rfc5_gridsearch.best_score_)
pp.pprint(rfc5_gridsearch.cv_results_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  3.1min finished


{'max_depth': 13, 'n_estimators': 17}
0.9885416006496551
{'mean_fit_time': array([0.60138202, 0.70600429, 0.82060242, 0.63868613, 0.91475863,
       0.841748  , 0.6754971 , 1.00594077, 0.89739842, 0.95375876,
       1.23589401, 0.80096312, 0.87772279, 0.74081736, 0.62822428,
       0.81239672, 0.74141588, 0.72695684, 1.16811666, 0.82634506,
       1.19078999, 1.31264315, 1.33188739, 1.3484478 , 0.91096234,
       0.764747  , 0.78569841, 0.84339166, 0.92307873, 1.14871778,
       1.22745004, 1.02247767, 1.3762382 , 1.26767969, 1.22452431,
       1.19340739, 0.78190827, 0.71042671, 0.74420948, 0.90657473,
       0.94950237, 0.97068748, 1.27919664, 1.07193165, 1.39245706,
       1.7529099 , 1.45750136, 1.33841996, 1.06096191, 0.899194  ,
       0.98524771, 1.04300966, 1.00072427, 1.06155963, 1.15630693,
       1.43176894, 1.31029506, 1.56800556, 1.41361957, 1.40125151,
       0.90178795, 0.86588383, 0.92233338, 0.89121537, 1.08270321,
       0.98835602, 1.10464468, 1.20098729, 1.38848572,

In [512]:
rfc5 = RandomForestClassifier(n_estimators=17, criterion='entropy', max_features=7, n_jobs=-1, random_state=0, max_depth=13)

# Fitting the model
rfct5 = rfc5.fit(smote_X1, smote_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(rfct5, smote_X1, smote_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(rfct5, smote_X1, smote_y1, cv=5)))

# Constructing the confusion matrix
predictions_rfct5 = rfct5.predict(X1_test)
predictions_proba_rfct5 = rfct5.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_rfct5))
print(classification_report(y1_test, predictions_rfct5))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_rfct5[:,1]))

Cross-Validation Scores: [0.96661219 0.97174877 0.96777959 0.97174218 0.9686989 ]
Mean Cross-Validation Score: 0.9693163268727902
[[3247  112]
 [  18   53]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3359
           1       0.32      0.75      0.45        71

   micro avg       0.96      0.96      0.96      3430
   macro avg       0.66      0.86      0.71      3430
weighted avg       0.98      0.96      0.97      3430

AUC Score 0.9519663380700997


# END
Experiment with random forest ends here. Conclusion is it performs even worse than Logistic Regression with SMOTE

I wanna try Logistic Regression with some SMOTE tinkering before I conclude logistic regression.

In [678]:
# Oversampling the data with SMOTE
sm = SMOTE(sampling_strategy=0.6, random_state=7, k_neighbors=8)
smote_X1, smote_y1 = sm.fit_sample(X1_train, y1_train)

In [679]:
# Specifying the optimal hyperparameters
log1 = LogisticRegression(penalty='l2', C=0.2)

# Fitting the model
logt1 = log1.fit(smote_X1, smote_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(logt1, smote_X1, smote_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(logt1, smote_X1, smote_y1, cv=5)))

# Constructing the confusion matrix
predictions_logt1 = logt1.predict(X1_test)
predictions_proba_logt1 = logt1.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_logt1))
print(classification_report(y1_test, predictions_logt1))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_logt1[:,1]))



Cross-Validation Scores: [0.92108335 0.9266869  0.92411861 0.93017282 0.92128007]




Mean Cross-Validation Score: 0.9246683508272676
[[3154  205]
 [  10   61]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      3359
           1       0.23      0.86      0.36        71

   micro avg       0.94      0.94      0.94      3430
   macro avg       0.61      0.90      0.66      3430
weighted avg       0.98      0.94      0.95      3430

AUC Score 0.9673863364767348


In [680]:
bbc = BalancedBaggingClassifier(base_estimator=LogisticRegression(penalty='l2', C=0.2))

bbc1_params = {'n_estimators':[1,3,5,7,9,11], 'max_samples':[500,700,900,1100,1300],
              'max_features':[1,3,5,7,9,11], 'bootstrap':['True', 'False'],
              'replacement':['True', 'False'], 
              'sampling_strategy':['majority', 'not minority', 'all', 'auto', 'not majority']}

bbc1_gridsearch = GridSearchCV(bbc, bbc1_params, cv=5, verbose=1, scoring='recall', n_jobs=-1)

bbc1_gridsearch.fit(smote_X1, smote_y1)

print(bbc1_gridsearch.best_params_)
print(bbc1_gridsearch.best_score_)
pp.pprint(bbc1_gridsearch.cv_results_)

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    1.0s


KeyboardInterrupt: 

In [681]:
bbc2 = BalancedBaggingClassifier(base_estimator=LogisticRegression(penalty='l2', C=0.2), bootstrap=False,
                                 max_features=1, max_samples=500, n_estimators=9, replacement=True,
                                 sampling_strategy='majority', random_state=0)

# Fitting the model
bbc2 = bbc2.fit(smote_X1, smote_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(bbc2, smote_X1, smote_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(bbc2, smote_X1, smote_y1, cv=5)))

# Constructing the confusion matrix
predictions_bbc2 = bbc2.predict(X1_test)
predictions_proba_bbc2 = bbc2.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_bbc2))
print(classification_report(y1_test, predictions_bbc2))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_bbc2[:,1]))



Cross-Validation Scores: [0.88209199 0.89259865 0.89003035 0.89070528 0.88764307]
Mean Cross-Validation Score: 0.8886138683831895
[[2804  555]
 [   3   68]]
              precision    recall  f1-score   support

           0       1.00      0.83      0.91      3359
           1       0.11      0.96      0.20        71

   micro avg       0.84      0.84      0.84      3430
   macro avg       0.55      0.90      0.55      3430
weighted avg       0.98      0.84      0.89      3430

AUC Score 0.9614950794376259




In [687]:
# Optimizing for roc_auc
bc1 = BaggingClassifier(base_estimator=LogisticRegression(penalty='l2', C=0.2), n_jobs=-1)

bc1_params = {'n_estimators':[1,3,5,7,9,11], 'max_samples':[500,700,900,1100,1300],
              'max_features':[1,3,5,7,9,11]}

bc1_gridsearch = GridSearchCV(bc1, bc1_params, cv=5, verbose=1, scoring='recall')

bc1_gridsearch.fit(smote_X1, smote_y1)

print(bc1_gridsearch.best_params_)
print(bc1_gridsearch.best_score_)
pp.pprint(bc1_gridsearch.cv_results_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.






[Parallel(n_jobs=1)]: Done 900 out of 900 | elapsed:  4.5min finished


{'max_features': 3, 'max_samples': 500, 'n_estimators': 7}
0.9245215521303327
{'mean_fit_time': array([0.01297593, 0.09909425, 0.06201358, 0.26459398, 0.24459038,
       0.06062608, 0.01115108, 0.11558266, 0.23583269, 0.28156009,
       0.2401794 , 0.07909021, 0.01077857, 0.132723  , 0.25717335,
       0.25640135, 0.24466658, 0.05926943, 0.01136017, 0.11204786,
       0.25628514, 0.2632247 , 0.24612594, 0.06260533, 0.01216927,
       0.11512589, 0.25472636, 0.26225452, 0.24551535, 0.06281366,
       0.01873837, 0.12982554, 0.28662448, 0.30810084, 0.27911448,
       0.10131569, 0.01955786, 0.12465582, 0.27829952, 0.284551  ,
       0.10723   , 0.09795108, 0.01916256, 0.12565908, 0.27913957,
       0.28761411, 0.28418374, 0.09795847, 0.01980352, 0.14258709,
       0.0852829 , 0.291675  , 0.28292184, 0.11197886, 0.02244968,
       0.12398996, 0.28638954, 0.29269652, 0.28583283, 0.11317201,
       0.02430782, 0.13976445, 0.1063293 , 0.12878213, 0.33209662,
       0.1760004 , 0.02673659, 0.

In [696]:
# Oversampling the data with SMOTE
sm = SMOTE(sampling_strategy='auto', random_state=7)
smote_X1, smote_y1 = sm.fit_sample(X1_train, y1_train)

In [697]:
bc1 = BalancedBaggingClassifier(base_estimator=LogisticRegression(penalty='l2', C=0.2),
                                 max_features=3, max_samples=500, n_estimators=7, random_state=0)

# Fitting the model
bc1 = bc1.fit(smote_X1, smote_y1)


# Checking cross-validation values
print('Cross-Validation Scores:', cross_val_score(bc1, smote_X1, smote_y1, cv=5))
print('Mean Cross-Validation Score:', np.mean(cross_val_score(bc1, smote_X1, smote_y1, cv=5)))

# Constructing the confusion matrix
predictions_bc1 = bc1.predict(X1_test)
predictions_proba_bc1 = bc1.predict_proba(X1_test)
print(confusion_matrix(y1_test, predictions_bc1))
print(classification_report(y1_test, predictions_bc1))
print('AUC Score', roc_auc_score(y1_test, predictions_proba_bc1[:,1]))



Cross-Validation Scores: [0.91781845 0.92304819 0.92379529 0.92769058 0.92488789]
Mean Cross-Validation Score: 0.9234480820676382
[[2924  435]
 [   4   67]]
              precision    recall  f1-score   support

           0       1.00      0.87      0.93      3359
           1       0.13      0.94      0.23        71

   micro avg       0.87      0.87      0.87      3430
   macro avg       0.57      0.91      0.58      3430
weighted avg       0.98      0.87      0.92      3430

AUC Score 0.9602916696367548


