In [66]:
import pandas as pd

In [67]:
# Custom models
import evaluation

In [68]:
# Load the experiment results for each training-set variant.
imbalanced_results = pd.read_csv("results.csv")
oversampling_results = pd.read_csv("results_oversampling.csv")
undersampling_results = pd.read_csv("results_undersampling.csv")

# Tag every row with the variant it came from so rows stay identifiable
# after the three frames are stacked.
imbalanced_results["data"] = "Imbalanced"
oversampling_results["data"] = "oversampling"
undersampling_results["data"] = "undersampling"

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# frames are stacked with a single pd.concat instead. The order
# (imbalanced, undersampling, oversampling) is kept exactly as before because
# later cells look rows up by position with .iloc.
all_results = pd.concat(
    [imbalanced_results, undersampling_results, oversampling_results],
    ignore_index=True,
)

# F1 at each cutoff k is the harmonic mean of precision@k and recall@k:
#     f1 = 2 * (p * r) / (p + r)
# Rows where precision and recall are both 0 evaluate to NaN (0 / 0).
for k in (1000, 2000):
    precision = all_results["p_at_{}".format(k)]
    recall = all_results["rec_at_{}".format(k)]
    all_results["f1_at_{}".format(k)] = 2 * ((precision * recall)
                                             / (precision + recall))

# Which model from which dataset maximizes auc-roc?

In [80]:
# Leaderboard of runs ordered by AUC-ROC, best first (top 50 distinct rows)
auc_columns = ["data", "model_type", "clf", "parameters",
               "train_time", "predict_time", "auc-roc"]
ranked_by_auc = all_results.sort_values(by="auc-roc", ascending=False)
ranked_by_auc[auc_columns].drop_duplicates().head(50)

Unnamed: 0,data,model_type,clf,parameters,train_time,predict_time,auc-roc
2354,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.1, 's...",20.698193,0.07229,0.867293
2336,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.1, 's...",18.196204,0.07922,0.865196
1969,Imbalanced,AB,"AdaBoostClassifier(algorithm='SAMME.R',\n ...","{'algorithm': 'SAMME.R', 'n_estimators': 100}",6.123125,0.30734,0.861385
2523,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 10, 'learning_rate': 0.5, 'su...",2.854698,0.014692,0.860898
88,Imbalanced,RF,"RandomForestClassifier(bootstrap=True, class_w...","{'min_samples_split': 10, 'n_estimators': 100,...",2.252153,0.122245,0.860759
227,Imbalanced,RF,"RandomForestClassifier(bootstrap=True, class_w...","{'min_samples_split': 10, 'n_estimators': 100,...",1.549831,0.125107,0.860704
2079,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.001, ...",14.136692,0.108299,0.860604
38,Imbalanced,RF,"RandomForestClassifier(bootstrap=True, class_w...","{'min_samples_split': 2, 'n_estimators': 100, ...",1.751771,0.124931,0.860467
96,Imbalanced,RF,"RandomForestClassifier(bootstrap=True, class_w...","{'min_samples_split': 10, 'n_estimators': 100,...",2.454828,0.120252,0.860438
2275,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 10, 'learning_rate': 0.1, 'su...",2.319708,0.016018,0.860352


In [70]:
# Full repr of the classifier in the top AUC-ROC row
auc_leaderboard = all_results.sort_values(by="auc-roc", ascending=False)
auc_unique_configs = auc_leaderboard[["data", "clf", "parameters"]].drop_duplicates()
auc_unique_configs.iloc[0]["clf"]

"GradientBoostingClassifier(criterion='friedman_mse', init=None,\n              learning_rate=0.5, loss='deviance', max_depth=50,\n              max_features=None, max_leaf_nodes=None,\n              min_impurity_split=1e-07, min_samples_leaf=1,\n              min_samples_split=2, min_weight_fraction_leaf=0.0,\n              n_estimators=100, presort='auto', random_state=None,\n              subsample=1.0, verbose=0, warm_start=False)"

# Which model maximizes f1 score at 1000?

In [71]:
# Top 10 (model, threshold) rows ranked by F1 score at 1000
f1_1000_columns = [
    "data", "model_type", "clf", "parameters",
    "train_time", "predict_time", "threshold", "f1_at_1000",
    "p_at_1000", "p_at_2000", "rec_at_1000", "rec_at_2000",
]
ranked_by_f1_1000 = all_results.sort_values(by="f1_at_1000", ascending=False)
ranked_by_f1_1000[f1_1000_columns].head(10)

Unnamed: 0,data,model_type,clf,parameters,train_time,predict_time,threshold,f1_at_1000,p_at_1000,p_at_2000,rec_at_1000,rec_at_2000
9622,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'adaptive',...",19.980878,0.043651,0.5,1.0,1.0,0.9385,1.0,1.0
4928,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.45,1.0,1.0,0.597,1.0,1.0
4936,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.85,1.0,1.0,0.597,1.0,1.0
4935,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.8,1.0,1.0,0.597,1.0,1.0
4934,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.75,1.0,1.0,0.597,1.0,1.0
4933,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.7,1.0,1.0,0.597,1.0,1.0
4932,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.65,1.0,1.0,0.597,1.0,1.0
4931,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.6,1.0,1.0,0.597,1.0,1.0
4930,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.55,1.0,1.0,0.597,1.0,1.0
4929,undersampling,KNN,"KNeighborsClassifier(algorithm='kd_tree', leaf...","{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.011912,3.713394,0.5,1.0,1.0,0.597,1.0,1.0


In [72]:
# Full repr of the classifier in the top F1@1000 row
f1_1000_leaderboard = all_results.sort_values(by="f1_at_1000", ascending=False)
f1_1000_unique_configs = f1_1000_leaderboard[["data", "clf", "parameters"]].drop_duplicates()
f1_1000_unique_configs.iloc[0]["clf"]

"MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,\n       beta_2=0.999, early_stopping=False, epsilon=1e-08,\n       hidden_layer_sizes=(100,), learning_rate='adaptive',\n       learning_rate_init=0.1, max_iter=200, momentum=0.9,\n       nesterovs_momentum=True, power_t=0.5, random_state=None,\n       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,\n       verbose=False, warm_start=False)"

# Which model maximizes f1 score at 2000?

In [73]:
# Top 10 (model, threshold) rows ranked by F1 score at 2000
f1_2000_columns = [
    "data", "model_type", "clf", "parameters",
    "train_time", "predict_time", "threshold", "f1_at_2000",
    "p_at_2000", "rec_at_2000",
]
ranked_by_f1_2000 = all_results.sort_values(by="f1_at_2000", ascending=False)
ranked_by_f1_2000[f1_2000_columns].head(10)

Unnamed: 0,data,model_type,clf,parameters,train_time,predict_time,threshold,f1_at_2000,p_at_2000,rec_at_2000
9501,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.15,0.980892,0.9625,1.0
9502,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.2,0.980892,0.9625,1.0
9503,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.25,0.980892,0.9625,1.0
9504,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.3,0.980892,0.9625,1.0
9505,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.35,0.980892,0.9625,1.0
9506,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.4,0.980892,0.9625,1.0
9507,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.45,0.980892,0.9625,1.0
9508,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.5,0.980892,0.9625,1.0
9500,oversampling,NN,"MLPClassifier(activation='relu', alpha=0.1, ba...","{'solver': 'sgd', 'learning_rate': 'constant',...",12.426291,0.036052,0.1,0.980892,0.9625,1.0
7205,oversampling,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'gini', 'max_features': 'sqrt', ...",0.034088,0.006964,0.3,0.97855,0.958,1.0


In [74]:
# Full repr of the classifier in the top F1@2000 row
f1_2000_leaderboard = all_results.sort_values(by="f1_at_2000", ascending=False)
f1_2000_unique_configs = f1_2000_leaderboard[["data", "clf", "parameters"]].drop_duplicates()
f1_2000_unique_configs.iloc[0]["clf"]

"MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,\n       beta_2=0.999, early_stopping=False, epsilon=1e-08,\n       hidden_layer_sizes=(100,), learning_rate='adaptive',\n       learning_rate_init=0.1, max_iter=200, momentum=0.9,\n       nesterovs_momentum=True, power_t=0.5, random_state=None,\n       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,\n       verbose=False, warm_start=False)"

In [75]:
# Full repr of the classifier in the second distinct (data, clf, parameters)
# row when ranked by F1@2000
f1_2000_sorted = all_results.sort_values(by="f1_at_2000", ascending=False)
f1_2000_distinct = f1_2000_sorted[["data", "clf", "parameters"]].drop_duplicates()
f1_2000_distinct.iloc[1]["clf"]

"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,\n            max_features='log2', max_leaf_nodes=None,\n            min_impurity_split=1e-07, min_samples_leaf=1,\n            min_samples_split=10, min_weight_fraction_leaf=0.0,\n            presort=False, random_state=None, splitter='best')"

In [81]:
# Show every stored column for the row at position 9501
row_9501 = all_results.iloc[9501]
row_9501

model_type                                                     NN
clf             MLPClassifier(activation='relu', alpha=0.1, ba...
parameters      {'solver': 'sgd', 'learning_rate': 'constant',...
train_time                                                12.4263
predict_time                                             0.036052
threshold                                                    0.15
auc-roc                                                  0.560933
p_at_1                                                          1
p_at_20                                                         1
p_at_50                                                         1
p_at_100                                                        1
p_at_150                                                        1
p_at_200                                                        1
p_at_500                                                        1
p_at_1000                                                       1
p_at_2000 

In [76]:
# Show every stored column for the row at position 4333
row_4333 = all_results.iloc[4333]
row_4333

model_type                                                     DT
clf             DecisionTreeClassifier(class_weight=None, crit...
parameters      {'criterion': 'gini', 'max_features': 'log2', ...
train_time                                             0.00177312
predict_time                                           0.00566387
threshold                                                    0.15
auc-roc                                                  0.515911
p_at_1                                                          1
p_at_20                                                         1
p_at_50                                                         1
p_at_100                                                        1
p_at_150                                                        1
p_at_200                                                        1
p_at_500                                                        1
p_at_1000                                                       1
p_at_2000 

In [77]:
# Show every stored column for the row at position 3762
row_3762 = all_results.iloc[3762]
row_3762

model_type                                                     NN
clf             MLPClassifier(activation='relu', alpha=0.1, ba...
parameters      {'activation': 'relu', 'alpha': 0.1, 'learning...
train_time                                                24.2991
predict_time                                            0.0395038
threshold                                                     0.1
auc-roc                                                  0.704976
p_at_1                                                          1
p_at_20                                                      0.45
p_at_50                                                       0.4
p_at_100                                                     0.33
p_at_150                                                     0.28
p_at_200                                                     0.27
p_at_500                                                    0.192
p_at_1000                                                    0.51
p_at_2000 

# Time is Money...

In [78]:
# How long did it take to train the models in sec?
all_results.sort_values("train_time", ascending=False)[["data","model_type", "clf", "parameters", 
                                           "train_time", "predict_time", "auc-roc"]].drop_duplicates()

Unnamed: 0,data,model_type,clf,parameters,train_time,predict_time,auc-roc
2213,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.001, ...",1975.834438,0.610510,0.832422
8704,oversampling,AB,"AdaBoostClassifier(algorithm='SAMME.R',\n ...","{'n_estimators': 10000, 'algorithm': 'SAMME.R'}",1564.039607,31.578097,0.845949
8608,oversampling,AB,"AdaBoostClassifier(algorithm='SAMME.R',\n ...","{'n_estimators': 10000, 'algorithm': 'SAMME'}",1479.859586,17.727232,0.851498
2440,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.1, 's...",1372.701670,0.533428,0.836818
2232,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.001, ...",1148.768571,0.463050,0.644936
2461,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.1, 's...",703.460013,0.386249,0.831913
2013,Imbalanced,AB,"AdaBoostClassifier(algorithm='SAMME.R',\n ...","{'algorithm': 'SAMME.R', 'n_estimators': 10000}",589.627495,32.327498,0.846693
2203,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.001, ...",550.745391,0.455056,0.848767
1904,Imbalanced,AB,"AdaBoostClassifier(algorithm='SAMME.R',\n ...","{'algorithm': 'SAMME', 'n_estimators': 10000}",532.266432,20.243453,0.854007
2428,Imbalanced,GB,GradientBoostingClassifier(criterion='friedman...,"{'n_estimators': 100, 'learning_rate': 0.1, 's...",484.534586,0.511159,0.546483


In [79]:
# Train and predict times are summarised per data set rather than on the
# combined frame, because each data set had a different number of training
# examples.
# time in min

# (header text, results frame) pairs, in the order they are reported
timing_sections = [
    ("----Imbalanced Data-----", imbalanced_results),
    ("\n\n----Undersampled Data-----\n", undersampling_results),
    ("\n\n----Oversampled Data-----\n", oversampling_results),
]

for section_header, section_results in timing_sections:
    print(section_header)
    evaluation.print_mean_max(section_results, "model_type", "train_time")
    print()
    evaluation.print_mean_max(section_results, "model_type", "predict_time")

----Imbalanced Data-----
Mean values for train_time by model_type is:
model_type
GB     3.674252
AB     2.092771
NN     0.380912
RF     0.035437
LR     0.034066
KNN    0.005090
DT     0.001760
NB     0.001162
Name: train_time, dtype: float64

Max values for train_time by model_type is:
model_type
GB     32.930574
AB      9.827125
NN      0.946762
RF      0.076133
LR      0.042702
KNN     0.007146
DT      0.003748
NB      0.001162
Name: train_time, dtype: float64

Mean values for predict_time by model_type is:
model_type
KNN    1.350624
AB     0.097694
RF     0.002920
GB     0.002409
NN     0.001089
NB     0.000566
DT     0.000176
LR     0.000126
Name: predict_time, dtype: float64

Max values for predict_time by model_type is:
model_type
KNN    3.399300
AB     0.538792
GB     0.010175
RF     0.005476
NN     0.001700
NB     0.000566
DT     0.000320
LR     0.000145
Name: predict_time, dtype: float64


----Undersampled Data-----

Mean values for train_time by model_type is:
model_type
AB  