In [1]:
import os
import pandas as pd
# Let pandas display up to 250 columns so wide feature tables are not truncated.
pd.options.display.max_columns = 250

# Project helpers (presumably local modules shipped with this notebook — confirm).
import machine_learning as ml
from preprocessing import separate_features_target

# Suppress scikit-learn ConvergenceWarning messages for the rest of the session.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
# Directory holding the CSV feature files, resolved against the working directory
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# CSV file names per feature family.
# Position 0 is the balanced dataset, position 1 the imbalanced one.
train = dict(
    stylometric=['style_train_balanced.csv', 'style_train_imbalanced.csv'],
    word2vec=['word2vec_train_balanced.csv', 'word2vec_train_imbalanced.csv'],
)
test = dict(
    stylometric=['style_test_balanced.csv', 'style_test_imbalanced.csv'],
    word2vec=['word2vec_test_balanced.csv', 'word2vec_test_imbalanced.csv'],
)

## Import Data

Since the Word2Vec features outperformed the TF-IDF features, only Word2Vec will be used as the content feature set when testing the combination with the stylometric features.

### Balanced Dataset

In [3]:
# Options shared by every balanced CSV: the first column becomes the index and
# the class / id columns get explicit dtypes.
balanced_read_kwargs = dict(index_col=0, dtype={'email_class': 'bool', 'email_id': 'int16'})

# Stylometric features (balanced files are at position 0 of each list)
style_train_balanced_complete = pd.read_csv(
    os.path.join(csv_path, train['stylometric'][0]), **balanced_read_kwargs
)
style_test_balanced_complete = pd.read_csv(
    os.path.join(csv_path, test['stylometric'][0]), **balanced_read_kwargs
)

# Word2Vec features
word2vec_train_balanced_complete = pd.read_csv(
    os.path.join(csv_path, train['word2vec'][0]), **balanced_read_kwargs
)
word2vec_test_balanced_complete = pd.read_csv(
    os.path.join(csv_path, test['word2vec'][0]), **balanced_read_kwargs
)

In [4]:
# Split each balanced frame; the results are later indexed with
# 'features' and 'target'.
style_train_balanced, style_test_balanced = map(
    separate_features_target,
    (style_train_balanced_complete, style_test_balanced_complete),
)

word2vec_train_balanced, word2vec_test_balanced = map(
    separate_features_target,
    (word2vec_train_balanced_complete, word2vec_test_balanced_complete),
)

### Imbalanced Dataset

In [5]:
# Options shared by every imbalanced CSV: the first column becomes the index and
# the class / id columns get explicit dtypes.
imbalanced_read_kwargs = dict(index_col=0, dtype={'email_class': 'bool', 'email_id': 'int16'})

# Stylometric features (imbalanced files are at position 1 of each list)
style_train_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, train['stylometric'][1]), **imbalanced_read_kwargs
)
style_test_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, test['stylometric'][1]), **imbalanced_read_kwargs
)

# Word2Vec features
word2vec_train_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, train['word2vec'][1]), **imbalanced_read_kwargs
)
word2vec_test_imbalanced_complete = pd.read_csv(
    os.path.join(csv_path, test['word2vec'][1]), **imbalanced_read_kwargs
)

In [6]:
# Split each imbalanced frame; the results are later indexed with
# 'features' and 'target'.
style_train_imbalanced, style_test_imbalanced = map(
    separate_features_target,
    (style_train_imbalanced_complete, style_test_imbalanced_complete),
)

word2vec_train_imbalanced, word2vec_test_imbalanced = map(
    separate_features_target,
    (word2vec_train_imbalanced_complete, word2vec_test_imbalanced_complete),
)

# Merging Feature Sets

The simplest way of combining the information of the two feature sets is to concatenate them into a single set and then perform the predictions on this merged set.

## Balanced Dataset

In [7]:
# Column-wise merge: Word2Vec columns first, stylometric columns second.
# NOTE(review): concat with axis=1 aligns rows on the index; this assumes both
# feature frames share the same index — confirm.
balanced_train_parts = [word2vec_train_balanced['features'], style_train_balanced['features']]
balanced_test_parts = [word2vec_test_balanced['features'], style_test_balanced['features']]

style_content_train_balanced = pd.concat(balanced_train_parts, axis=1)
style_content_test_balanced = pd.concat(balanced_test_parts, axis=1)

### Train

#### Logistic Regression

In [8]:
%%time
# Fit Logistic Regression on the merged balanced features. The helper's result
# is read through its 'model' and 'scaler' entries.
lr_fit_balanced = ml.train_logistic_regression(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)
lr_style_content_balanced = lr_fit_balanced['model']
lr_style_content_balanced_scaler = lr_fit_balanced['scaler']

Train accuracy: 0.9988864142538976
CPU times: user 460 ms, sys: 645 ms, total: 1.1 s
Wall time: 78.2 ms


#### Decision Tree

In [9]:
%%time
# Decision Tree on the merged balanced features (train accuracy is reported).
dt_style_content_balanced = ml.train_decision_tree(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.985894580549369
CPU times: user 443 ms, sys: 858 ms, total: 1.3 s
Wall time: 156 ms


#### Random Forest

In [10]:
%%time
# Random Forest on the merged balanced features (train accuracy is reported).
rf_style_content_balanced = ml.train_random_forest(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9855233853006682
CPU times: user 150 ms, sys: 2.05 ms, total: 153 ms
Wall time: 151 ms


#### Gradient Boosting

In [11]:
%%time
# Gradient Boosting on the merged balanced features (train accuracy is reported).
gb_style_content_balanced = ml.train_gradient_boost(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9977728285077951
CPU times: user 7.03 s, sys: 72 ms, total: 7.1 s
Wall time: 7.1 s


#### Naive Bayes

In [12]:
%%time
# Fit Naive Bayes on the merged balanced features. The helper's result is read
# through its 'model' and 'scaler' entries.
nb_fit_balanced = ml.train_naive_bayes(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
    remove_negatives=True,
)
nb_style_content_balanced = nb_fit_balanced['model']
nb_style_content_balanced_scaler = nb_fit_balanced['scaler']

Train accuracy: 0.9665924276169265
CPU times: user 73.9 ms, sys: 117 ms, total: 191 ms
Wall time: 20 ms


### Results

In [13]:
# Pair every fitted balanced model with its display label, then split the pairs
# into the two parallel lists that are passed to ml.multi_model_results.
labelled_models_balanced = [
    ('Logistic Regression', lr_style_content_balanced),
    ('Decision Tree', dt_style_content_balanced),
    ('Random Forest', rf_style_content_balanced),
    ('Gradient Boosting Tree', gb_style_content_balanced),
    ('Naive Bayes', nb_style_content_balanced),
]
models = [fitted for _, fitted in labelled_models_balanced]
names = [label for label, _ in labelled_models_balanced]

In [14]:
%%time
# Evaluate all five models on the merged balanced test set. The LR and NB
# scalers obtained at training time are passed along.
results_style_content_balanced = ml.multi_model_results(
    models,
    names,
    style_content_test_balanced,
    style_test_balanced['target'],
    lr_style_content_balanced_scaler,
    nb_style_content_balanced_scaler,
)

CPU times: user 312 ms, sys: 571 ms, total: 884 ms
Wall time: 55.2 ms


In [15]:
# Show the test-set metrics table of the merged-feature models (balanced dataset).
results_style_content_balanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.964392,0.962857,0.968391,0.965616,0.039877,0.031609,0.980771
Decision Tree,0.951039,0.959184,0.945402,0.952243,0.042945,0.054598,0.936535
Random Forest,0.973294,0.971429,0.977011,0.974212,0.030675,0.022989,0.995231
Gradient Boosting Tree,0.979228,0.977143,0.982759,0.979943,0.02454,0.017241,0.996271
Naive Bayes,0.961424,0.962644,0.962644,0.962644,0.039877,0.037356,0.987607


## Imbalanced Dataset

In [16]:
# Column-wise merge: Word2Vec columns first, stylometric columns second.
# NOTE(review): concat with axis=1 aligns rows on the index; this assumes both
# feature frames share the same index — confirm.
imbalanced_train_parts = [word2vec_train_imbalanced['features'], style_train_imbalanced['features']]
imbalanced_test_parts = [word2vec_test_imbalanced['features'], style_test_imbalanced['features']]

style_content_train_imbalanced = pd.concat(imbalanced_train_parts, axis=1)
style_content_test_imbalanced = pd.concat(imbalanced_test_parts, axis=1)

### Train

#### Logistic Regression

In [17]:
%%time
# Fit Logistic Regression on the merged imbalanced features. The helper's result
# is read through its 'model' and 'scaler' entries.
lr_fit_imbalanced = ml.train_logistic_regression(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)
lr_style_content_imbalanced = lr_fit_imbalanced['model']
lr_style_content_imbalanced_scaler = lr_fit_imbalanced['scaler']

Train accuracy: 0.9955399378294364
CPU times: user 11.2 s, sys: 13.1 s, total: 24.3 s
Wall time: 1.55 s


#### Decision Tree

In [18]:
%%time
# Decision Tree on the merged imbalanced features (train accuracy is reported).
dt_style_content_imbalanced = ml.train_decision_tree(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9853358561967833
CPU times: user 1.21 s, sys: 826 ms, total: 2.03 s
Wall time: 903 ms


#### Random Forest

In [19]:
%%time
# Random Forest on the merged imbalanced features (train accuracy is reported).
rf_style_content_imbalanced = ml.train_random_forest(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9833085552101636
CPU times: user 899 ms, sys: 2.58 ms, total: 901 ms
Wall time: 898 ms


#### Gradient Boosting

In [20]:
%%time
# Gradient Boosting on the merged imbalanced features (train accuracy is reported).
gb_style_content_imbalanced = ml.train_gradient_boost(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9966211650223004
CPU times: user 1min, sys: 54.8 ms, total: 1min
Wall time: 1min


#### Naive Bayes

In [21]:
%%time
# Fit Naive Bayes on the merged imbalanced features. The helper's result is read
# through its 'model' and 'scaler' entries.
nb_fit_imbalanced = ml.train_naive_bayes(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
    remove_negatives=True,
)
nb_style_content_imbalanced = nb_fit_imbalanced['model']
nb_style_content_imbalanced_scaler = nb_fit_imbalanced['scaler']

Train accuracy: 0.9080281119070145
CPU times: user 165 ms, sys: 11.2 ms, total: 176 ms
Wall time: 31.7 ms


### Results

In [22]:
# Pair every fitted imbalanced model with its display label, then split the
# pairs into the two parallel lists that are passed to ml.multi_model_results.
labelled_models_imbalanced = [
    ('Logistic Regression', lr_style_content_imbalanced),
    ('Decision Tree', dt_style_content_imbalanced),
    ('Random Forest', rf_style_content_imbalanced),
    ('Gradient Boosting Tree', gb_style_content_imbalanced),
    ('Naive Bayes', nb_style_content_imbalanced),
]
models = [fitted for _, fitted in labelled_models_imbalanced]
names = [label for label, _ in labelled_models_imbalanced]

In [23]:
%%time
# Evaluate all five models on the merged imbalanced test set. The LR and NB
# scalers obtained at training time are passed along.
results_style_content_imbalanced = ml.multi_model_results(
    models,
    names,
    style_content_test_imbalanced,
    style_test_imbalanced['target'],
    lr_style_content_imbalanced_scaler,
    nb_style_content_imbalanced_scaler,
)

CPU times: user 542 ms, sys: 1.3 s, total: 1.84 s
Wall time: 127 ms


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Show the test-set metrics table of the merged-feature models (imbalanced dataset).
results_style_content_imbalanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.991622,0.971061,0.932099,0.951181,0.002666,0.067901,0.993486
Decision Tree,0.98027,0.911475,0.858025,0.883943,0.007998,0.141975,0.960973
Random Forest,0.981351,0.973978,0.808642,0.883642,0.002073,0.191358,0.994397
Gradient Boosting Tree,0.991081,0.967846,0.929012,0.948031,0.002962,0.070988,0.998247
Naive Bayes,0.912432,0.0,0.0,0.0,0.0,1.0,0.988985


Comparing these with the content-only baseline, it is clear that there is a small improvement with GB and RF on the balanced dataset, while all algorithms except RF improved on the imbalanced dataset.<br>
It seems that the extra features help the more robust algorithms achieve better accuracy on the bigger dataset.

# Stacking

In machine learning, stacking refers to the process of using different learners (each one working best at learning a different part of the problem), called level 0 models, as intermediate steps and then using their outputs to train another learner, called the level 1 model. Thus, the final model is sometimes able to outperform the individual ones.

In this specific case, the different initial classifiers will be trained on both of the feature sets, and thus the final classifier will essentially combine information from both of them.

#### Final Classifiers

Only the three best classifiers will be used as level 1 classifiers, namely Logistic Regression (which is used by default), Random Forest and Gradient Boosting.

In [25]:
# Alternative final (level 1) estimators, both seeded with the shared random
# state exposed by the ml module.
rf = ml.RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    random_state=ml.alg_random_state,
)
gb = ml.GradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.1,
    max_depth=3,
    random_state=ml.alg_random_state,
)

## Balanced Dataset

#### Train Initial Models

In [26]:
# Describe each balanced feature set as a {'name', 'features'} record.
# Stylometric comes first, Word2Vec second, in both lists.
train_feature_sets_balanced = [
    {'name': label, 'features': split['features']}
    for label, split in (('style', style_train_balanced), ('word2vec', word2vec_train_balanced))
]
test_feature_sets_balanced = [
    {'name': label, 'features': split['features']}
    for label, split in (('style', style_test_balanced), ('word2vec', word2vec_test_balanced))
]

In [27]:
%%time
# Train the level 0 models on the balanced feature sets, using the stylometric
# split's target as labels.
stacking_models_balanced = ml.train_models(
    train_feature_sets_balanced,
    style_train_balanced['target'],
)

CPU times: user 11.4 s, sys: 7.81 s, total: 19.2 s
Wall time: 8.13 s


### Single-algorithm

First, stacking will be done with a single algorithm at a time, combining its models from the different feature sets, while also testing different final classifiers.

In [28]:
# Start from an empty frame; the single-algorithm stacking cells append their
# result rows to it. Re-run this cell before re-running them to avoid duplicates.
results_stacking_balanced_single = pd.DataFrame()

#### Logistic Regression

In [29]:
%%time
# Single-algorithm stacking with the default final classifier (no
# `final_classifier` argument is passed).
# Each entry lists the level 0 algorithms to EXCLUDE, so exactly one of
# lr / dt / rf / gb / nb takes part in each run.
single_algorithm_exclusions = [
    ('dt', 'rf', 'gb', 'nb'),  # keeps lr
    ('lr', 'rf', 'gb', 'nb'),  # keeps dt
    ('lr', 'dt', 'gb', 'nb'),  # keeps rf
    ('lr', 'dt', 'rf', 'nb'),  # keeps gb
    ('lr', 'dt', 'rf', 'gb'),  # keeps nb
]

new_result_rows = []
for excluded in single_algorithm_exclusions:
    # A fresh list is handed to each call, as in the unrolled version.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        exclude_models=list(excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
    )
    new_result_rows.append(stacked_preds['results'])

# Append all rows of this cell in one concat instead of growing the frame
# once per run.
results_stacking_balanced_single = pd.concat([results_stacking_balanced_single, *new_result_rows])

CPU times: user 36.9 s, sys: 16.4 s, total: 53.3 s
Wall time: 30.9 s


#### Random Forest

In [30]:
%%time
# Single-algorithm stacking with Random Forest (`rf`) as the final classifier.
# Each entry lists the level 0 algorithms to EXCLUDE, so exactly one of
# lr / dt / rf / gb / nb takes part in each run.
single_algorithm_exclusions = [
    ('dt', 'rf', 'gb', 'nb'),  # keeps lr
    ('lr', 'rf', 'gb', 'nb'),  # keeps dt
    ('lr', 'dt', 'gb', 'nb'),  # keeps rf
    ('lr', 'dt', 'rf', 'nb'),  # keeps gb
    ('lr', 'dt', 'rf', 'gb'),  # keeps nb
]

new_result_rows = []
for excluded in single_algorithm_exclusions:
    # A fresh list is handed to each call, as in the unrolled version.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=rf,
        exclude_models=list(excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
    )
    new_result_rows.append(stacked_preds['results'])

# Append all rows of this cell in one concat instead of growing the frame
# once per run.
results_stacking_balanced_single = pd.concat([results_stacking_balanced_single, *new_result_rows])

CPU times: user 38.4 s, sys: 19 s, total: 57.4 s
Wall time: 31.2 s


#### Gradient Boosting

In [31]:
%%time
# Single-algorithm stacking with Gradient Boosting (`gb`) as the final classifier.
# Each entry lists the level 0 algorithms to EXCLUDE, so exactly one of
# lr / dt / rf / gb / nb takes part in each run.
single_algorithm_exclusions = [
    ('dt', 'rf', 'gb', 'nb'),  # keeps lr
    ('lr', 'rf', 'gb', 'nb'),  # keeps dt
    ('lr', 'dt', 'gb', 'nb'),  # keeps rf
    ('lr', 'dt', 'rf', 'nb'),  # keeps gb
    ('lr', 'dt', 'rf', 'gb'),  # keeps nb
]

new_result_rows = []
for excluded in single_algorithm_exclusions:
    # A fresh list is handed to each call, as in the unrolled version.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=gb,
        exclude_models=list(excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
    )
    new_result_rows.append(stacked_preds['results'])

# Append all rows of this cell in one concat instead of growing the frame
# once per run.
results_stacking_balanced_single = pd.concat([results_stacking_balanced_single, *new_result_rows])

CPU times: user 38.9 s, sys: 18 s, total: 56.9 s
Wall time: 31.4 s


In [32]:
# Show the accumulated single-algorithm stacking metrics (balanced dataset).
results_stacking_balanced_single

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, with LogisticRegression",0.976261,0.979769,0.974138,0.976945,0.021472,0.025862,0.996739
"Algorithms: dt, with LogisticRegression",0.952522,0.961988,0.945402,0.953623,0.039877,0.054598,0.983142
"Algorithms: rf, with LogisticRegression",0.97181,0.971347,0.974138,0.97274,0.030675,0.025862,0.99561
"Algorithms: gb, with LogisticRegression",0.976261,0.971591,0.982759,0.977143,0.030675,0.017241,0.99762
"Algorithms: nb, with LogisticRegression",0.962908,0.965418,0.962644,0.964029,0.03681,0.037356,0.987712
"Algorithms: lr, with RandomForestClassifier",0.949555,0.951149,0.951149,0.951149,0.052147,0.048851,0.992027
"Algorithms: dt, with RandomForestClassifier",0.952522,0.961988,0.945402,0.953623,0.039877,0.054598,0.98368
"Algorithms: rf, with RandomForestClassifier",0.964392,0.960227,0.971264,0.965714,0.042945,0.028736,0.99561
"Algorithms: gb, with RandomForestClassifier",0.976261,0.974286,0.979885,0.977077,0.027607,0.020115,0.997043
"Algorithms: nb, with RandomForestClassifier",0.961424,0.949721,0.977011,0.963173,0.055215,0.022989,0.991736


These results are on par with the baseline models, except for Gradient Boosting, which performed somewhat worse. Random Forest achieved better results. This is probably the result of overfitting.

The 6 best performing models will be kept to compare with other stacking configurations and the complete single-algorithm results dataset will be archived.

In [33]:
# Archive every single-algorithm result in an independent copy; later stacking
# results are concatenated onto this frame.
results_stacking_balanced_full = results_stacking_balanced_single.copy()

In [34]:
# Keep the six single-algorithm configurations with the highest F1 score.
results_stacking_balanced_best = (
    results_stacking_balanced_single
    .sort_values(by='F1 Score', ascending=False)
    .head(6)
)

### Multi-algorithm

Of course, it is possible to also use the outputs of more than one classifier, on both feature sets.

In [35]:
# Start from an empty frame; the multi-algorithm stacking cells append their
# result rows to it. Re-run this cell before re-running them to avoid duplicates.
results_stacking_balanced_multi = pd.DataFrame()

#### Logistic Regression

In [36]:
%%time
# Multi-algorithm stacking with the default final classifier (no
# `final_classifier` argument is passed).
# Each run is (algorithms to EXCLUDE, extra keyword arguments for the test call).
# Only the "all algorithms" run supplies an explicit result row label.
multi_algorithm_runs = [
    ((), {'result_row_name': "Algorithms: all, with LogisticRegression"}),
    (('dt', 'nb'), {}),        # lr + rf + gb
    (('dt', 'nb', 'lr'), {}),  # rf + gb
    (('dt', 'nb', 'rf'), {}),  # lr + gb
    (('dt', 'nb', 'gb'), {}),  # lr + rf
]

new_result_rows = []
for excluded, test_options in multi_algorithm_runs:
    # A fresh list is handed to each call, as in the unrolled version.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        exclude_models=list(excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
        **test_options,
    )
    new_result_rows.append(stacked_preds['results'])

# Append all rows of this cell in one concat instead of growing the frame
# once per run.
results_stacking_balanced_multi = pd.concat([results_stacking_balanced_multi, *new_result_rows])

CPU times: user 2min 26s, sys: 1min 7s, total: 3min 33s
Wall time: 1min 59s


#### Random Forest

In [37]:
%%time
# Multi-algorithm stacking with Random Forest (`rf`) as the final classifier.
# Each run is (algorithms to EXCLUDE, extra keyword arguments for the test call).
# Only the "all algorithms" run supplies an explicit result row label.
multi_algorithm_runs = [
    ((), {'result_row_name': "Algorithms: all, with RandomForestClassifier"}),
    (('dt', 'nb'), {}),        # lr + rf + gb
    (('dt', 'nb', 'lr'), {}),  # rf + gb
    (('dt', 'nb', 'rf'), {}),  # lr + gb
    (('dt', 'nb', 'gb'), {}),  # lr + rf
]

new_result_rows = []
for excluded, test_options in multi_algorithm_runs:
    # A fresh list is handed to each call, as in the unrolled version.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=rf,
        exclude_models=list(excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
        **test_options,
    )
    new_result_rows.append(stacked_preds['results'])

# Append all rows of this cell in one concat instead of growing the frame
# once per run.
results_stacking_balanced_multi = pd.concat([results_stacking_balanced_multi, *new_result_rows])

CPU times: user 2min 28s, sys: 1min 8s, total: 3min 36s
Wall time: 2min


#### Gradient Boosting

In [38]:
%%time
# Multi-algorithm stacking with Gradient Boosting (`gb`) as the final classifier.
# Each run is (algorithms to EXCLUDE, extra keyword arguments for the test call).
# Only the "all algorithms" run supplies an explicit result row label.
multi_algorithm_runs = [
    ((), {'result_row_name': "Algorithms: all, with GradientBoostingClassifier"}),
    (('dt', 'nb'), {}),        # lr + rf + gb
    (('dt', 'nb', 'lr'), {}),  # rf + gb
    (('dt', 'nb', 'rf'), {}),  # lr + gb
    (('dt', 'nb', 'gb'), {}),  # lr + rf
]

new_result_rows = []
for excluded, test_options in multi_algorithm_runs:
    # A fresh list is handed to each call, as in the unrolled version.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=gb,
        exclude_models=list(excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
        **test_options,
    )
    new_result_rows.append(stacked_preds['results'])

# Append all rows of this cell in one concat instead of growing the frame
# once per run.
results_stacking_balanced_multi = pd.concat([results_stacking_balanced_multi, *new_result_rows])

CPU times: user 2min 30s, sys: 1min 11s, total: 3min 41s
Wall time: 2min 2s


In [39]:
# Show the accumulated multi-algorithm stacking metrics (balanced dataset).
results_stacking_balanced_multi

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.998378
"Algorithms: lr, rf, gb, with LogisticRegression",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.998493
"Algorithms: rf, gb, with LogisticRegression",0.977745,0.974359,0.982759,0.978541,0.027607,0.017241,0.997602
"Algorithms: lr, gb, with LogisticRegression",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.998598
"Algorithms: lr, rf, with LogisticRegression",0.977745,0.971671,0.985632,0.978602,0.030675,0.014368,0.997576
"Algorithms: all, with RandomForestClassifier",0.977745,0.974359,0.982759,0.978541,0.027607,0.017241,0.997237
"Algorithms: lr, rf, gb, with RandomForestClassifier",0.98368,0.980057,0.988506,0.984263,0.021472,0.011494,0.997695
"Algorithms: rf, gb, with RandomForestClassifier",0.974777,0.966197,0.985632,0.975818,0.03681,0.014368,0.997104
"Algorithms: lr, gb, with RandomForestClassifier",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.99851
"Algorithms: lr, rf, with RandomForestClassifier",0.976261,0.974286,0.979885,0.977077,0.027607,0.020115,0.996928


As expected, using more than one classifier consistently improves the classification accuracy for both LR and RF. The best level 1 classifier seems to be Random Forest, and Logistic Regression gives better results when used as a level 0 classifier. However, all combinations performed quite well, except in some cases with Gradient Boosting.

On the other hand, the Naive Bayes and Decision Tree classifiers either do not affect the result at all or even reduce the accuracy, so from now on they will be excluded in order to reduce the execution time.

The top 10 models will be added to the best model results dataset.

In [40]:
# Add the ten multi-algorithm configurations with the highest F1 score to the
# best-model table. Re-running this cell appends them again.
top_multi_balanced = (
    results_stacking_balanced_multi
    .sort_values(by='F1 Score', ascending=False)
    .head(10)
)
results_stacking_balanced_best = pd.concat([results_stacking_balanced_best, top_multi_balanced])

In [41]:
# Archive all multi-algorithm results alongside the earlier ones.
# Re-running this cell appends the same rows again.
results_stacking_balanced_full = pd.concat(
    [
        results_stacking_balanced_full,
        results_stacking_balanced_multi,
    ]
)

### Appending all features

Another variation of stacking consists of appending the predictions to the original feature sets and then training the final classifier on all the features.<br>
Since using both feature sets has been shown to improve accuracy on the baselines, the predictions will be appended to the merged feature set.

In [42]:
# Start from an empty frame; the appended-features stacking cells append their
# result rows to it. Re-run this cell before re-running them to avoid duplicates.
results_stacking_balanced_append = pd.DataFrame()

#### Logistic Regression

In [43]:
%%time
# Stack the balanced level 0 models with the default final classifier of
# ml.train_stacked_models (no final_classifier is passed; presumably
# LogisticRegression, going by the row label below -- confirm in ml) and
# append the original features to the level 0 predictions.
#
# Each entry lists the level 0 models left out of one stacking run.
lr_append_exclusions = [
    [],                         # all level 0 models
    ['dt', 'nb'],               # keeps lr, rf, gb
    ['dt', 'nb', 'lr'],         # keeps rf, gb
    ['dt', 'nb', 'rf'],         # keeps lr, gb
    ['dt', 'nb', 'gb'],         # keeps lr, rf
    # Single level 0
    ['dt', 'nb', 'gb', 'rf'],   # keeps lr
    ['dt', 'nb', 'gb', 'lr'],   # keeps rf
    ['dt', 'nb', 'rf', 'lr'],   # keeps gb
]

lr_append_frames = []
for excluded in lr_append_exclusions:
    # Only the run that keeps every model gets an explicit row label; the
    # other runs leave result_row_name unset, exactly as before.
    row_label = {} if excluded else {'result_row_name': "Algorithms: all, with LogisticRegression (with appended features)"}
    # Each call receives its own copy of the list, in case ml mutates it.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        exclude_models=list(excluded),
        append_features=True
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
        append_features=True,
        **row_label
    )
    lr_append_frames.append(stacked_preds['results'])

# Add all new rows with a single concat instead of re-copying the table
# after every run; row order is the same as the run order above.
results_stacking_balanced_append = pd.concat([results_stacking_balanced_append] + lr_append_frames)

CPU times: user 3min 16s, sys: 1min 48s, total: 5min 4s
Wall time: 2min 31s


#### Random Forest

In [44]:
%%time
# Stack the balanced level 0 models with `rf` (defined outside this cell) as
# the final classifier, appending the original features to the level 0
# predictions.
#
# Each entry lists the level 0 models left out of one stacking run.
rf_append_exclusions = [
    [],                         # all level 0 models
    ['dt', 'nb'],               # keeps lr, rf, gb
    ['dt', 'nb', 'lr'],         # keeps rf, gb
    ['dt', 'nb', 'rf'],         # keeps lr, gb
    ['dt', 'nb', 'gb'],         # keeps lr, rf
    # Single level 0
    ['dt', 'nb', 'gb', 'rf'],   # keeps lr
    ['dt', 'nb', 'gb', 'lr'],   # keeps rf
    ['dt', 'nb', 'rf', 'lr'],   # keeps gb
]

rf_append_frames = []
for excluded in rf_append_exclusions:
    # Only the run that keeps every model gets an explicit row label; the
    # other runs leave result_row_name unset, exactly as before.
    row_label = {} if excluded else {'result_row_name': "Algorithms: all, with RandomForestClassifier (with appended features)"}
    # Each call receives its own copy of the list, in case ml mutates it.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=rf,
        exclude_models=list(excluded),
        append_features=True
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
        append_features=True,
        **row_label
    )
    rf_append_frames.append(stacked_preds['results'])

# Add all new rows with a single concat instead of re-copying the table
# after every run; row order is the same as the run order above.
results_stacking_balanced_append = pd.concat([results_stacking_balanced_append] + rf_append_frames)

CPU times: user 3min 22s, sys: 2min 4s, total: 5min 26s
Wall time: 2min 36s


#### Gradient Boosting

In [45]:
%%time
# Stack the balanced level 0 models with `gb` (defined outside this cell) as
# the final classifier, appending the original features to the level 0
# predictions.
#
# Each entry lists the level 0 models left out of one stacking run.
gb_append_exclusions = [
    [],                         # all level 0 models
    ['dt', 'nb'],               # keeps lr, rf, gb
    ['dt', 'nb', 'lr'],         # keeps rf, gb
    ['dt', 'nb', 'rf'],         # keeps lr, gb
    ['dt', 'nb', 'gb'],         # keeps lr, rf
    # Single level 0
    ['dt', 'nb', 'gb', 'rf'],   # keeps lr
    ['dt', 'nb', 'gb', 'lr'],   # keeps rf
    ['dt', 'nb', 'rf', 'lr'],   # keeps gb
]

gb_append_frames = []
for excluded in gb_append_exclusions:
    # Only the run that keeps every model gets an explicit row label; the
    # other runs leave result_row_name unset, exactly as before.
    row_label = {} if excluded else {'result_row_name': "Algorithms: all, with GradientBoostingClassifier (with appended features)"}
    # Each call receives its own copy of the list, in case ml mutates it.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=gb,
        exclude_models=list(excluded),
        append_features=True
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(excluded),
        append_features=True,
        **row_label
    )
    gb_append_frames.append(stacked_preds['results'])

# Add all new rows with a single concat instead of re-copying the table
# after every run; row order is the same as the run order above.
results_stacking_balanced_append = pd.concat([results_stacking_balanced_append] + gb_append_frames)

CPU times: user 4min 5s, sys: 1min 33s, total: 5min 39s
Wall time: 3min 29s


In [46]:
# Display the metrics gathered for the appended-features stacking runs.
results_stacking_balanced_append

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression (with appended features)",0.964392,0.962857,0.968391,0.965616,0.039877,0.031609,0.985778
"Algorithms: lr, rf, gb, with LogisticRegression (with appended features)",0.964392,0.962857,0.968391,0.965616,0.039877,0.031609,0.990044
"Algorithms: rf, gb, with LogisticRegression (with appended features)",0.962908,0.962751,0.965517,0.964132,0.039877,0.034483,0.982336
"Algorithms: lr, gb, with LogisticRegression (with appended features)",0.961424,0.957386,0.968391,0.962857,0.046012,0.031609,0.978558
"Algorithms: lr, rf, with LogisticRegression (with appended features)",0.961424,0.957386,0.968391,0.962857,0.046012,0.031609,0.981229
"Algorithms: lr, with LogisticRegression (with appended features)",0.964392,0.960227,0.971264,0.965714,0.042945,0.028736,0.990621
"Algorithms: rf, with LogisticRegression (with appended features)",0.961424,0.96,0.965517,0.962751,0.042945,0.034483,0.985456
"Algorithms: gb, with LogisticRegression (with appended features)",0.962908,0.965418,0.962644,0.964029,0.03681,0.037356,0.98498
"Algorithms: all, with RandomForestClassifier (with appended features)",0.976261,0.974286,0.979885,0.977077,0.027607,0.020115,0.996747
"Algorithms: lr, rf, gb, with RandomForestClassifier (with appended features)",0.979228,0.974432,0.985632,0.98,0.027607,0.014368,0.997611


Adding the initial feature sets to the final classifier seems to mostly harm performance on the balanced dataset. This is most likely due to overfitting, since the level 1 classifier becomes extremely specialized at recognizing the emails provided in the training set and fails to generalize for the test set.

However, when Gradient Boosting is used as the final classifier, it manages in some cases to outperform the model without the appended features.

The top 12 of these models will be added to the best result dataset, for comparison.

In [47]:
# Rank the appended-features runs by F1 score (highest first) and add the
# first 12 of them to the table of best results.
append_ranked_by_f1 = results_stacking_balanced_append.sort_values(by=['F1 Score'], ascending=[False])
results_stacking_balanced_best = pd.concat([results_stacking_balanced_best, append_ranked_by_f1.head(12)])

In [48]:
# Add every appended-features run (not only the best ones) to the complete
# results table for the balanced dataset.
results_stacking_balanced_full = pd.concat([results_stacking_balanced_full, results_stacking_balanced_append])

### Merged Classifiers

Finally, for the sake of completeness, try stacking the level 0 classifiers that were trained with the merged dataset.

In [49]:
# Feature-set lists with a single entry each: the merged train / test features.
# The name 'merge' is the same string used in the 'features' key of the
# entries in merged_models_balanced (presumably how ml pairs a model with its
# feature set -- confirm in ml).
train_feature_sets_balanced_merged = [{'name': 'merge', 'features': style_content_train_balanced}]
test_feature_sets_balanced_merged = [{'name': 'merge', 'features': style_content_test_balanced}]

In [50]:
# lr and nb are bundled with their scaler in a {'model', 'scaler'} dict;
# dt, rf and gb are registered below as bare model objects.
lr_merged_balanced = {'model' : lr_style_content_balanced, 'scaler': lr_style_content_balanced_scaler}
nb_merged_balanced = {'model' : nb_style_content_balanced, 'scaler': nb_style_content_balanced_scaler}

# Level 0 model registry for the merged feature set: one entry per algorithm,
# each pointing at the 'merge' feature set.
merged_models_balanced = [{'name' : 'lr', 'features' : 'merge', 'model' : lr_merged_balanced},
                          {'name' : 'dt', 'features' : 'merge', 'model' : dt_style_content_balanced},
                          {'name' : 'rf', 'features' : 'merge', 'model' : rf_style_content_balanced},
                          {'name' : 'gb', 'features' : 'merge', 'model' : gb_style_content_balanced},
                          {'name' : 'nb', 'features' : 'merge', 'model' : nb_merged_balanced}]

In [51]:
# Empty table that the following cells extend with the results of each
# stacking run built on the merged-feature level 0 models.
results_stacking_balanced_merged = pd.DataFrame()

#### Logistic Regression

In [52]:
%%time
# Stack the merged-feature level 0 models with the default final classifier
# of ml.train_stacked_models (no final_classifier is passed; presumably
# LogisticRegression, going by the row labels -- confirm in ml).
#
# Each entry: (level 0 models left out, label of the models that remain).
merged_lr_subsets = [
    (['dt', 'nb'], "lr, rf, gb"),
    (['dt', 'nb', 'lr'], "rf, gb"),
    (['dt', 'nb', 'rf'], "lr, gb"),
    (['dt', 'nb', 'gb'], "rf, lr"),
]

merged_lr_frames = []
# All subsets are run first without, then with, the original features appended.
for with_features in (False, True):
    label_suffix = " (with appended features)" if with_features else ""
    for excluded, kept in merged_lr_subsets:
        row_name = "Algorithms: " + kept + " merged, with LogisticRegression" + label_suffix
        # Each call receives its own copy of the list, in case ml mutates it.
        stacked_clf = ml.train_stacked_models(
            merged_models_balanced,
            train_feature_sets_balanced_merged,
            style_train_balanced['target'],
            exclude_models=list(excluded),
            append_features=with_features
        )
        stacked_preds = ml.test_stacked_models(
            merged_models_balanced,
            test_feature_sets_balanced_merged,
            style_test_balanced['target'],
            stacked_clf,
            exclude_models=list(excluded),
            append_features=with_features,
            result_row_name=row_name
        )
        merged_lr_frames.append(stacked_preds['results'])

# Add all new rows with a single concat instead of re-copying the table
# after every run; row order is the same as the run order above.
results_stacking_balanced_merged = pd.concat([results_stacking_balanced_merged] + merged_lr_frames)

CPU times: user 3min 3s, sys: 28.5 s, total: 3min 31s
Wall time: 2min 49s


#### Random Forest

In [53]:
%%time
# Stack the merged-feature level 0 models with `rf` (defined outside this
# cell) as the final classifier.
#
# Each entry: (level 0 models left out, label of the models that remain).
merged_rf_subsets = [
    (['dt', 'nb'], "lr, rf, gb"),
    (['dt', 'nb', 'lr'], "rf, gb"),
    (['dt', 'nb', 'rf'], "lr, gb"),
    (['dt', 'nb', 'gb'], "rf, lr"),
]

merged_rf_frames = []
# All subsets are run first without, then with, the original features appended.
for with_features in (False, True):
    label_suffix = " (with appended features)" if with_features else ""
    for excluded, kept in merged_rf_subsets:
        row_name = "Algorithms: " + kept + " merged, with RandomForestClassifier" + label_suffix
        # Each call receives its own copy of the list, in case ml mutates it.
        stacked_clf = ml.train_stacked_models(
            merged_models_balanced,
            train_feature_sets_balanced_merged,
            style_train_balanced['target'],
            final_classifier=rf,
            exclude_models=list(excluded),
            append_features=with_features
        )
        stacked_preds = ml.test_stacked_models(
            merged_models_balanced,
            test_feature_sets_balanced_merged,
            style_test_balanced['target'],
            stacked_clf,
            exclude_models=list(excluded),
            append_features=with_features,
            result_row_name=row_name
        )
        merged_rf_frames.append(stacked_preds['results'])

# Add all new rows with a single concat instead of re-copying the table
# after every run; row order is the same as the run order above.
results_stacking_balanced_merged = pd.concat([results_stacking_balanced_merged] + merged_rf_frames)

CPU times: user 2min 59s, sys: 20.1 s, total: 3min 19s
Wall time: 2min 50s


#### Gradient Boosting

In [54]:
%%time
# Stack the merged-feature level 0 models with `gb` (defined outside this
# cell) as the final classifier.
#
# Each entry: (level 0 models left out, label of the models that remain).
merged_gb_subsets = [
    (['dt', 'nb'], "lr, rf, gb"),
    (['dt', 'nb', 'lr'], "rf, gb"),
    (['dt', 'nb', 'rf'], "lr, gb"),
    (['dt', 'nb', 'gb'], "rf, lr"),
]

merged_gb_frames = []
# All subsets are run first without, then with, the original features appended.
for with_features in (False, True):
    label_suffix = " (with appended features)" if with_features else ""
    for excluded, kept in merged_gb_subsets:
        row_name = "Algorithms: " + kept + " merged, with GradientBoostingClassifier" + label_suffix
        # Each call receives its own copy of the list, in case ml mutates it.
        stacked_clf = ml.train_stacked_models(
            merged_models_balanced,
            train_feature_sets_balanced_merged,
            style_train_balanced['target'],
            final_classifier=gb,
            exclude_models=list(excluded),
            append_features=with_features
        )
        stacked_preds = ml.test_stacked_models(
            merged_models_balanced,
            test_feature_sets_balanced_merged,
            style_test_balanced['target'],
            stacked_clf,
            exclude_models=list(excluded),
            append_features=with_features,
            result_row_name=row_name
        )
        merged_gb_frames.append(stacked_preds['results'])

# Add all new rows with a single concat instead of re-copying the table
# after every run; row order is the same as the run order above.
results_stacking_balanced_merged = pd.concat([results_stacking_balanced_merged] + merged_gb_frames)

CPU times: user 3min 24s, sys: 18.9 s, total: 3min 43s
Wall time: 3min 16s


In [55]:
# Display the metrics gathered for the merged-classifier stacking runs.
results_stacking_balanced_merged

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, rf, gb merged, with LogisticRegression",0.980712,0.979943,0.982759,0.981349,0.021472,0.017241,0.996959
"Algorithms: rf, gb merged, with LogisticRegression",0.976261,0.974286,0.979885,0.977077,0.027607,0.020115,0.996421
"Algorithms: lr, gb merged, with LogisticRegression",0.979228,0.977143,0.982759,0.979943,0.02454,0.017241,0.996474
"Algorithms: rf, lr merged, with LogisticRegression",0.974777,0.97151,0.979885,0.97568,0.030675,0.020115,0.996509
"Algorithms: lr, rf, gb merged, with LogisticRegression (with appended features)",0.97181,0.968661,0.977011,0.972818,0.033742,0.022989,0.97866
"Algorithms: rf, gb merged, with LogisticRegression (with appended features)",0.964392,0.957627,0.974138,0.965812,0.046012,0.025862,0.979312
"Algorithms: lr, gb merged, with LogisticRegression (with appended features)",0.965875,0.965616,0.968391,0.967001,0.03681,0.031609,0.983067
"Algorithms: rf, lr merged, with LogisticRegression (with appended features)",0.964392,0.955056,0.977011,0.965909,0.04908,0.022989,0.989987
"Algorithms: lr, rf, gb merged, with RandomForestClassifier",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.997356
"Algorithms: rf, gb merged, with RandomForestClassifier",0.979228,0.974432,0.985632,0.98,0.027607,0.014368,0.99662


In general, the addition of the initial features on the level 1 classifier gives better results, but, while consistent, the results were not great. The results of the single level 0 classifiers were surprisingly good though for Logistic Regression and Gradient Boosting.

This is likely because the level 0 classifiers were more specialized compared to training on both feature sets separately.

The top 10 results will be added to the dataset for comparison.

In [56]:
# Rank the merged-classifier runs by F1 score (highest first) and add the
# first 10 of them to the table of best results.
merged_ranked_by_f1 = results_stacking_balanced_merged.sort_values(by=['F1 Score'], ascending=[False])
results_stacking_balanced_best = pd.concat([results_stacking_balanced_best, merged_ranked_by_f1.head(10)])

In [57]:
# Add every merged-classifier run (not only the best ones) to the complete
# results table for the balanced dataset.
results_stacking_balanced_full = pd.concat([results_stacking_balanced_full, results_stacking_balanced_merged])

In [58]:
# Rank the collected best results by F1 score (highest first) and show the
# first 13 rows.
results_stacking_balanced_best.sort_values(by=['F1 Score'], ascending = [False]).head(13)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, rf, gb, with RandomForestClassifier",0.98368,0.980057,0.988506,0.984263,0.021472,0.011494,0.997695
"Algorithms: lr, with GradientBoostingClassifier (with appended features)",0.98368,0.980057,0.988506,0.984263,0.021472,0.011494,0.997611
"Algorithms: rf, lr merged, with GradientBoostingClassifier (with appended features)",0.98368,0.982808,0.985632,0.984218,0.018405,0.014368,0.996783
"Algorithms: lr, gb, with GradientBoostingClassifier (with appended features)",0.982196,0.98,0.985632,0.982808,0.021472,0.014368,0.99829
"Algorithms: gb, with GradientBoostingClassifier (with appended features)",0.982196,0.98,0.985632,0.982808,0.021472,0.014368,0.997893
"Algorithms: rf, gb, with GradientBoostingClassifier (with appended features)",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.997461
"Algorithms: lr, gb merged, with RandomForestClassifier (with appended features)",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.995156
"Algorithms: all, with GradientBoostingClassifier (with appended features)",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.998413
"Algorithms: lr, rf, gb, with GradientBoostingClassifier (with appended features)",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.998131
"Algorithms: all, with LogisticRegression",0.980712,0.977208,0.985632,0.981402,0.02454,0.014368,0.998378


## Imbalanced Dataset

#### Train Initial Models

In [59]:
# Named feature sets for the imbalanced dataset. Both lists use the same
# order: stylometric features first, then word2vec features.
train_feature_sets_imbalanced = [
    {'name': 'style', 'features': style_train_imbalanced['features']},
    {'name': 'word2vec', 'features': word2vec_train_imbalanced['features']},
]
test_feature_sets_imbalanced = [
    {'name': 'style', 'features': style_test_imbalanced['features']},
    {'name': 'word2vec', 'features': word2vec_test_imbalanced['features']},
]

In [60]:
%%time
# Train the initial (level-0) models on the imbalanced feature sets.
# NOTE(review): the targets are taken from the stylometric frame only; this
# assumes the word2vec frame holds the same rows in the same order — confirm.
stacking_models_imbalanced = ml.train_models(train_feature_sets_imbalanced, style_train_imbalanced['target'])

CPU times: user 52.2 s, sys: 8.44 s, total: 1min
Wall time: 48.7 s


### Single-algorithm

In [61]:
# Empty accumulator; the single-algorithm cells below concatenate their results onto it.
results_stacking_imbalanced_single = pd.DataFrame()

#### Logistic Regression

In [62]:
%%time
# Single-algorithm stacks: every pass excludes four of the five level-0
# algorithms, so exactly one remains. No final_classifier is passed, so
# ml.train_stacked_models uses its default.
for excluded_models in (
    ('dt', 'rf', 'gb', 'nb'),  # lr only
    ('lr', 'rf', 'gb', 'nb'),  # dt only
    ('lr', 'dt', 'gb', 'nb'),  # rf only
    ('lr', 'dt', 'rf', 'nb'),  # gb only
    ('lr', 'dt', 'rf', 'gb'),  # nb only
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        exclude_models=list(excluded_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
    )
    # Accumulate the metrics of this configuration.
    results_stacking_imbalanced_single = pd.concat(
        [results_stacking_imbalanced_single, stacked_preds['results']]
    )

CPU times: user 3min 26s, sys: 37.8 s, total: 4min 4s
Wall time: 3min 10s


#### Random Forest

In [63]:
%%time
# Single-algorithm stacks with `rf` as the final (level-1) classifier.
# `rf` is expected to be defined in an earlier cell of this notebook.
# Every pass excludes four of the five level-0 algorithms, leaving one.
for excluded_models in (
    ('dt', 'rf', 'gb', 'nb'),  # lr only
    ('lr', 'rf', 'gb', 'nb'),  # dt only
    ('lr', 'dt', 'gb', 'nb'),  # rf only
    ('lr', 'dt', 'rf', 'nb'),  # gb only
    ('lr', 'dt', 'rf', 'gb'),  # nb only
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=rf,
        exclude_models=list(excluded_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
    )
    # Accumulate the metrics of this configuration.
    results_stacking_imbalanced_single = pd.concat(
        [results_stacking_imbalanced_single, stacked_preds['results']]
    )

CPU times: user 3min 27s, sys: 32.6 s, total: 3min 59s
Wall time: 3min 12s


#### Gradient Boosting

In [64]:
%%time
# Single-algorithm stacks with `gb` as the final (level-1) classifier.
# `gb` is expected to be defined in an earlier cell of this notebook.
# Every pass excludes four of the five level-0 algorithms, leaving one.
for excluded_models in (
    ('dt', 'rf', 'gb', 'nb'),  # lr only
    ('lr', 'rf', 'gb', 'nb'),  # dt only
    ('lr', 'dt', 'gb', 'nb'),  # rf only
    ('lr', 'dt', 'rf', 'nb'),  # gb only
    ('lr', 'dt', 'rf', 'gb'),  # nb only
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=gb,
        exclude_models=list(excluded_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
    )
    # Accumulate the metrics of this configuration.
    results_stacking_imbalanced_single = pd.concat(
        [results_stacking_imbalanced_single, stacked_preds['results']]
    )

CPU times: user 3min 33s, sys: 34.8 s, total: 4min 8s
Wall time: 3min 16s


In [65]:
# Display the accumulated single-algorithm results for the imbalanced dataset.
results_stacking_imbalanced_single

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, with LogisticRegression",0.99,0.970492,0.91358,0.941176,0.002666,0.08642,0.993451
"Algorithms: dt, with LogisticRegression",0.983514,0.925566,0.882716,0.903633,0.006813,0.117284,0.98128
"Algorithms: rf, with LogisticRegression",0.988108,0.954545,0.907407,0.93038,0.004147,0.092593,0.993587
"Algorithms: gb, with LogisticRegression",0.991351,0.962025,0.938272,0.95,0.003555,0.061728,0.997418
"Algorithms: nb, with LogisticRegression",0.972973,0.888889,0.790123,0.836601,0.009479,0.209877,0.989689
"Algorithms: lr, with RandomForestClassifier",0.989189,0.964052,0.910494,0.936508,0.003258,0.089506,0.993074
"Algorithms: dt, with RandomForestClassifier",0.984595,0.934853,0.885802,0.909667,0.005924,0.114198,0.983599
"Algorithms: rf, with RandomForestClassifier",0.985946,0.935897,0.901235,0.918239,0.005924,0.098765,0.992394
"Algorithms: gb, with RandomForestClassifier",0.99,0.944272,0.941358,0.942813,0.005332,0.058642,0.998435
"Algorithms: nb, with RandomForestClassifier",0.979189,0.88,0.882716,0.881356,0.011552,0.117284,0.991322


The results were somewhat consistent with, and in some cases slightly better than, the results of the imbalanced dataset. Random Forest showed significant improvement and even NB managed to classify something. Compared to the merged feature sets, RF performed significantly better and GB too managed to outperform it.

In [66]:
# Start the full imbalanced results table from an independent copy of the single-algorithm results.
results_stacking_imbalanced_full = results_stacking_imbalanced_single.copy()

In [67]:
# Seed the running "best" table with the six single-algorithm stacks that
# have the highest F1 score.
results_stacking_imbalanced_best = (
    results_stacking_imbalanced_single
    .sort_values(by='F1 Score', ascending=False)
    .head(6)
)

### Multi-algorithm

In [68]:
# Empty accumulator; the multi-algorithm cells below concatenate their results onto it.
results_stacking_imbalanced_multi = pd.DataFrame()

#### Logistic Regression

In [69]:
%%time
# Multi-algorithm stacks. No final_classifier is passed, so
# ml.train_stacked_models uses its default.

# Full ensemble: nothing excluded, with an explicit row label.
stacked_clf = ml.train_stacked_models(
    stacking_models_imbalanced,
    train_feature_sets_imbalanced,
    style_train_imbalanced['target'],
    exclude_models=[],
)
stacked_preds = ml.test_stacked_models(
    stacking_models_imbalanced,
    test_feature_sets_imbalanced,
    style_test_imbalanced['target'],
    stacked_clf,
    exclude_models=[],
    result_row_name="Algorithms: all, with LogisticRegression",
)
results_stacking_imbalanced_multi = pd.concat(
    [results_stacking_imbalanced_multi, stacked_preds['results']]
)

# Reduced ensembles: dt and nb are always excluded, optionally together with
# one of lr / rf / gb.
for excluded_models in (
    ('dt', 'nb'),
    ('dt', 'nb', 'lr'),
    ('dt', 'nb', 'rf'),
    ('dt', 'nb', 'gb'),
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        exclude_models=list(excluded_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
    )
    results_stacking_imbalanced_multi = pd.concat(
        [results_stacking_imbalanced_multi, stacked_preds['results']]
    )

CPU times: user 13min 52s, sys: 2min 49s, total: 16min 42s
Wall time: 12min 41s


#### Random Forest

In [70]:
%%time
# Multi-algorithm stacks with `rf` as the final (level-1) classifier.
# `rf` is expected to be defined in an earlier cell of this notebook.

# Full ensemble: nothing excluded, with an explicit row label.
stacked_clf = ml.train_stacked_models(
    stacking_models_imbalanced,
    train_feature_sets_imbalanced,
    style_train_imbalanced['target'],
    final_classifier=rf,
    exclude_models=[],
)
stacked_preds = ml.test_stacked_models(
    stacking_models_imbalanced,
    test_feature_sets_imbalanced,
    style_test_imbalanced['target'],
    stacked_clf,
    exclude_models=[],
    result_row_name="Algorithms: all, with RandomForestClassifier",
)
results_stacking_imbalanced_multi = pd.concat(
    [results_stacking_imbalanced_multi, stacked_preds['results']]
)

# Reduced ensembles: dt and nb are always excluded, optionally together with
# one of lr / rf / gb.
for excluded_models in (
    ('dt', 'nb'),
    ('dt', 'nb', 'lr'),
    ('dt', 'nb', 'rf'),
    ('dt', 'nb', 'gb'),
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=rf,
        exclude_models=list(excluded_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
    )
    results_stacking_imbalanced_multi = pd.concat(
        [results_stacking_imbalanced_multi, stacked_preds['results']]
    )

CPU times: user 13min 49s, sys: 2min 25s, total: 16min 15s
Wall time: 12min 44s


#### Gradient Boosting

In [71]:
%%time
# Multi-algorithm stacks with `gb` as the final (level-1) classifier.
# `gb` is expected to be defined in an earlier cell of this notebook.

# Full ensemble: nothing excluded, with an explicit row label.
stacked_clf = ml.train_stacked_models(
    stacking_models_imbalanced,
    train_feature_sets_imbalanced,
    style_train_imbalanced['target'],
    final_classifier=gb,
    exclude_models=[],
)
stacked_preds = ml.test_stacked_models(
    stacking_models_imbalanced,
    test_feature_sets_imbalanced,
    style_test_imbalanced['target'],
    stacked_clf,
    exclude_models=[],
    result_row_name="Algorithms: all, with GradientBoostingClassifier",
)
results_stacking_imbalanced_multi = pd.concat(
    [results_stacking_imbalanced_multi, stacked_preds['results']]
)

# Reduced ensembles: dt and nb are always excluded, optionally together with
# one of lr / rf / gb.
for excluded_models in (
    ('dt', 'nb'),
    ('dt', 'nb', 'lr'),
    ('dt', 'nb', 'rf'),
    ('dt', 'nb', 'gb'),
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=gb,
        exclude_models=list(excluded_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
    )
    results_stacking_imbalanced_multi = pd.concat(
        [results_stacking_imbalanced_multi, stacked_preds['results']]
    )

CPU times: user 13min 57s, sys: 2min 22s, total: 16min 20s
Wall time: 12min 54s


In [72]:
# Display the accumulated multi-algorithm results for the imbalanced dataset.
results_stacking_imbalanced_multi

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression",0.993784,0.983923,0.944444,0.96378,0.001481,0.055556,0.996701
"Algorithms: lr, rf, gb, with LogisticRegression",0.993243,0.974603,0.947531,0.960876,0.00237,0.052469,0.996843
"Algorithms: rf, gb, with LogisticRegression",0.989459,0.946708,0.932099,0.939347,0.005036,0.067901,0.996567
"Algorithms: lr, gb, with LogisticRegression",0.993514,0.980769,0.944444,0.962264,0.001777,0.055556,0.997376
"Algorithms: lr, rf, with LogisticRegression",0.991351,0.974026,0.925926,0.949367,0.00237,0.074074,0.995751
"Algorithms: all, with RandomForestClassifier",0.993243,0.974603,0.947531,0.960876,0.00237,0.052469,0.998764
"Algorithms: lr, rf, gb, with RandomForestClassifier",0.993243,0.980707,0.941358,0.96063,0.001777,0.058642,0.998818
"Algorithms: rf, gb, with RandomForestClassifier",0.99,0.94704,0.938272,0.942636,0.005036,0.061728,0.998558
"Algorithms: lr, gb, with RandomForestClassifier",0.992973,0.974522,0.944444,0.959248,0.00237,0.055556,0.998934
"Algorithms: lr, rf, with RandomForestClassifier",0.99027,0.964516,0.92284,0.943218,0.003258,0.07716,0.998321


Of course, these models performed better on average than stacking only the different feature sets. On the imbalanced dataset, Naive Bayes and Decision Tree had more impact on the prediction accuracy.

In [73]:
# Append the 13 multi-algorithm stacks with the highest F1 score to the
# running "best" table for the imbalanced dataset.
results_stacking_imbalanced_best = pd.concat([
    results_stacking_imbalanced_best,
    results_stacking_imbalanced_multi.sort_values(by='F1 Score', ascending=False).head(13),
])

In [74]:
# Keep every multi-algorithm result (not only the best ones) in the full imbalanced results table.
results_stacking_imbalanced_full = pd.concat([results_stacking_imbalanced_full, results_stacking_imbalanced_multi])

### Appending all features

In [75]:
# Empty accumulator; the appended-features cells below concatenate their results onto it.
results_stacking_imbalanced_append = pd.DataFrame()

#### Logistic Regression

In [76]:
%%time
# Stacks trained with append_features=True. No final_classifier is passed,
# so ml.train_stacked_models uses its default.

# Full ensemble: nothing excluded, with an explicit row label.
stacked_clf = ml.train_stacked_models(
    stacking_models_imbalanced,
    train_feature_sets_imbalanced,
    style_train_imbalanced['target'],
    exclude_models=[],
    append_features=True,
)
stacked_preds = ml.test_stacked_models(
    stacking_models_imbalanced,
    test_feature_sets_imbalanced,
    style_test_imbalanced['target'],
    stacked_clf,
    exclude_models=[],
    append_features=True,
    result_row_name="Algorithms: all, with LogisticRegression (with appended features)",
)
results_stacking_imbalanced_append = pd.concat(
    [results_stacking_imbalanced_append, stacked_preds['results']]
)

for excluded_models in (
    # Reduced ensembles: dt and nb are always excluded.
    ('dt', 'nb'),
    ('dt', 'nb', 'lr'),
    ('dt', 'nb', 'rf'),
    ('dt', 'nb', 'gb'),
    # Single level 0
    ('dt', 'nb', 'gb', 'rf'),  # lr only
    ('dt', 'nb', 'gb', 'lr'),  # rf only
    ('dt', 'nb', 'rf', 'lr'),  # gb only
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        exclude_models=list(excluded_models),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
        append_features=True,
    )
    results_stacking_imbalanced_append = pd.concat(
        [results_stacking_imbalanced_append, stacked_preds['results']]
    )

CPU times: user 18min 44s, sys: 4min 28s, total: 23min 12s
Wall time: 16min


#### Random Forest

In [77]:
%%time
# Stacks trained with append_features=True and `rf` as the final (level-1)
# classifier. `rf` is expected to be defined in an earlier cell.

# Full ensemble: nothing excluded, with an explicit row label.
stacked_clf = ml.train_stacked_models(
    stacking_models_imbalanced,
    train_feature_sets_imbalanced,
    style_train_imbalanced['target'],
    final_classifier=rf,
    exclude_models=[],
    append_features=True,
)
stacked_preds = ml.test_stacked_models(
    stacking_models_imbalanced,
    test_feature_sets_imbalanced,
    style_test_imbalanced['target'],
    stacked_clf,
    exclude_models=[],
    append_features=True,
    result_row_name="Algorithms: all, with RandomForestClassifier (with appended features)",
)
results_stacking_imbalanced_append = pd.concat(
    [results_stacking_imbalanced_append, stacked_preds['results']]
)

for excluded_models in (
    # Reduced ensembles: dt and nb are always excluded.
    ('dt', 'nb'),
    ('dt', 'nb', 'lr'),
    ('dt', 'nb', 'rf'),
    ('dt', 'nb', 'gb'),
    # Single level 0
    ('dt', 'nb', 'gb', 'rf'),  # lr only
    ('dt', 'nb', 'gb', 'lr'),  # rf only
    ('dt', 'nb', 'rf', 'lr'),  # gb only
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=rf,
        exclude_models=list(excluded_models),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
        append_features=True,
    )
    results_stacking_imbalanced_append = pd.concat(
        [results_stacking_imbalanced_append, stacked_preds['results']]
    )

CPU times: user 17min 19s, sys: 3min 16s, total: 20min 36s
Wall time: 15min 56s


#### Gradient Boosting

In [78]:
%%time
# Stacks trained with append_features=True and `gb` as the final (level-1)
# classifier. `gb` is expected to be defined in an earlier cell.

# Full ensemble: nothing excluded, with an explicit row label.
stacked_clf = ml.train_stacked_models(
    stacking_models_imbalanced,
    train_feature_sets_imbalanced,
    style_train_imbalanced['target'],
    final_classifier=gb,
    exclude_models=[],
    append_features=True,
)
stacked_preds = ml.test_stacked_models(
    stacking_models_imbalanced,
    test_feature_sets_imbalanced,
    style_test_imbalanced['target'],
    stacked_clf,
    exclude_models=[],
    append_features=True,
    result_row_name="Algorithms: all, with GradientBoostingClassifier (with appended features)",
)
results_stacking_imbalanced_append = pd.concat(
    [results_stacking_imbalanced_append, stacked_preds['results']]
)

for excluded_models in (
    # Reduced ensembles: dt and nb are always excluded.
    ('dt', 'nb'),
    ('dt', 'nb', 'lr'),
    ('dt', 'nb', 'rf'),
    ('dt', 'nb', 'gb'),
    # Single level 0
    ('dt', 'nb', 'gb', 'rf'),  # lr only
    ('dt', 'nb', 'gb', 'lr'),  # rf only
    ('dt', 'nb', 'rf', 'lr'),  # gb only
):
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=gb,
        exclude_models=list(excluded_models),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(excluded_models),
        append_features=True,
    )
    results_stacking_imbalanced_append = pd.concat(
        [results_stacking_imbalanced_append, stacked_preds['results']]
    )

CPU times: user 23min 29s, sys: 2min 51s, total: 26min 21s
Wall time: 22min 16s


In [79]:
# Display the metrics collected for the stacking runs with appended features (imbalanced dataset).
results_stacking_imbalanced_append

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression (with appended features)",0.992703,0.974441,0.941358,0.957614,0.00237,0.058642,0.998264
"Algorithms: lr, rf, gb, with LogisticRegression (with appended features)",0.993243,0.980707,0.941358,0.96063,0.001777,0.058642,0.994992
"Algorithms: rf, gb, with LogisticRegression (with appended features)",0.992162,0.974277,0.935185,0.954331,0.00237,0.064815,0.99365
"Algorithms: lr, gb, with LogisticRegression (with appended features)",0.992973,0.977564,0.941358,0.959119,0.002073,0.058642,0.994877
"Algorithms: lr, rf, with LogisticRegression (with appended features)",0.992973,0.980645,0.938272,0.958991,0.001777,0.061728,0.995784
"Algorithms: lr, with LogisticRegression (with appended features)",0.991622,0.971061,0.932099,0.951181,0.002666,0.067901,0.994488
"Algorithms: rf, with LogisticRegression (with appended features)",0.992703,0.980583,0.935185,0.957346,0.001777,0.064815,0.994433
"Algorithms: gb, with LogisticRegression (with appended features)",0.993514,0.983871,0.941358,0.962145,0.001481,0.058642,0.993473
"Algorithms: all, with RandomForestClassifier (with appended features)",0.991892,0.968153,0.938272,0.952978,0.002962,0.061728,0.99729
"Algorithms: lr, rf, gb, with RandomForestClassifier (with appended features)",0.992703,0.983713,0.932099,0.957211,0.001481,0.067901,0.997017


Adding the initial feature sets to the final classifier also mostly harms performance on the imbalanced dataset. The best-performing model with appended features performed only marginally better than the best model without them. The best-performing final classifier was Gradient Boosting. In addition, some models with a single level-0 classifier were among the top performers.

In [80]:
# Keep the 12 appended-features runs with the highest F1 score and add them to the "best" table.
top_append_runs = results_stacking_imbalanced_append.sort_values(by='F1 Score', ascending=False).head(12)
results_stacking_imbalanced_best = pd.concat([results_stacking_imbalanced_best, top_append_runs])

In [81]:
# Add every appended-features run to the table holding all imbalanced stacking results.
results_stacking_imbalanced_full = pd.concat(
    [results_stacking_imbalanced_full, results_stacking_imbalanced_append]
)

### Merged Classifiers

In [82]:
# One feature set per split, registered under the name 'merge' (the style+content features).
train_feature_sets_imbalanced_merged = [dict(name='merge', features=style_content_train_imbalanced)]
test_feature_sets_imbalanced_merged = [dict(name='merge', features=style_content_test_imbalanced)]

In [83]:
# Logistic Regression and Naive Bayes are bundled together with their scalers.
lr_merged_imbalanced = dict(model=lr_style_content_imbalanced, scaler=lr_style_content_imbalanced_scaler)
nb_merged_imbalanced = dict(model=nb_style_content_imbalanced, scaler=nb_style_content_imbalanced_scaler)

# Level-0 model descriptors; every model reads the 'merge' feature set.
merged_models_imbalanced = [
    {'name': model_name, 'features': 'merge', 'model': fitted_model}
    for model_name, fitted_model in (
        ('lr', lr_merged_imbalanced),
        ('dt', dt_style_content_imbalanced),
        ('rf', rf_style_content_imbalanced),
        ('gb', gb_style_content_imbalanced),
        ('nb', nb_merged_imbalanced),
    )
]

In [84]:
# Start from an empty frame; the merged-feature stacking cells below add their result rows to it.
results_stacking_imbalanced_merged = pd.DataFrame()

#### Logistic Regression

In [85]:
%%time
stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb'], append_features=False, result_row_name="Algorithms: lr, rf, gb merged, with LogisticRegression")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb', 'lr'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'lr'], append_features=False, result_row_name="Algorithms: rf, gb merged, with LogisticRegression")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb', 'rf'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'rf'], append_features=False, result_row_name="Algorithms: lr, gb merged, with LogisticRegression")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb', 'gb'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'gb'], append_features=False, result_row_name="Algorithms: rf, lr merged, with LogisticRegression")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

# Append features
stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb'], append_features=True, result_row_name="Algorithms: lr, rf, gb merged, with LogisticRegression (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb', 'lr'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'lr'], append_features=True, result_row_name="Algorithms: rf, gb merged, with LogisticRegression (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb', 'rf'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'rf'], append_features=True, result_row_name="Algorithms: lr, gb merged, with LogisticRegression (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], exclude_models=['dt', 'nb', 'gb'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'gb'], append_features=True, result_row_name="Algorithms: rf, lr merged, with LogisticRegression (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

CPU times: user 22min 45s, sys: 4min 52s, total: 27min 37s
Wall time: 19min 9s


#### Random Forest

In [86]:
%%time
stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb'], append_features=False, result_row_name="Algorithms: lr, rf, gb merged, with RandomForestClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb', 'lr'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'lr'], append_features=False, result_row_name="Algorithms: rf, gb merged, with RandomForestClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb', 'rf'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'rf'], append_features=False, result_row_name="Algorithms: lr, gb merged, with RandomForestClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb', 'gb'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'gb'], append_features=False, result_row_name="Algorithms: rf, lr merged, with RandomForestClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

# Append features
stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb'], append_features=True, result_row_name="Algorithms: lr, rf, gb merged, with RandomForestClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb', 'lr'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'lr'], append_features=True, result_row_name="Algorithms: rf, gb merged, with RandomForestClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb', 'rf'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'rf'], append_features=True, result_row_name="Algorithms: lr, gb merged, with RandomForestClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=rf, exclude_models=['dt', 'nb', 'gb'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'gb'], append_features=True, result_row_name="Algorithms: rf, lr merged, with RandomForestClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

CPU times: user 22min 29s, sys: 4min 53s, total: 27min 22s
Wall time: 19min 19s


#### Gradient Boosting

In [87]:
%%time
stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb'], append_features=False, result_row_name="Algorithms: lr, rf, gb merged, with GradientBoostingClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb', 'lr'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'lr'], append_features=False, result_row_name="Algorithms: rf, gb merged, with GradientBoostingClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb', 'rf'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'rf'], append_features=False, result_row_name="Algorithms: lr, gb merged, with GradientBoostingClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb', 'gb'], append_features=False)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'gb'], append_features=False, result_row_name="Algorithms: rf, lr merged, with GradientBoostingClassifier")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

# Append features
stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb'], append_features=True, result_row_name="Algorithms: lr, rf, gb merged, with GradientBoostingClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb', 'lr'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'lr'], append_features=True, result_row_name="Algorithms: rf, gb merged, with GradientBoostingClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb', 'rf'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'rf'], append_features=True, result_row_name="Algorithms: lr, gb merged, with GradientBoostingClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

stacked_clf = ml.train_stacked_models(merged_models_imbalanced, train_feature_sets_imbalanced_merged, style_train_imbalanced['target'], final_classifier=gb, exclude_models=['dt', 'nb', 'gb'], append_features=True)
stacked_preds = ml.test_stacked_models(merged_models_imbalanced, test_feature_sets_imbalanced_merged, style_test_imbalanced['target'], stacked_clf, exclude_models=['dt', 'nb', 'gb'], append_features=True, result_row_name="Algorithms: rf, lr merged, with GradientBoostingClassifier (with appended features)")
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, stacked_preds['results']])

CPU times: user 26min 51s, sys: 6min 27s, total: 33min 18s
Wall time: 22min 38s


In [88]:
# Display the metrics collected for the stacking runs on the merged feature set (imbalanced dataset).
results_stacking_imbalanced_merged

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, rf, gb merged, with LogisticRegression",0.992703,0.974441,0.941358,0.957614,0.00237,0.058642,0.998603
"Algorithms: rf, gb merged, with LogisticRegression",0.990811,0.964744,0.929012,0.946541,0.003258,0.070988,0.997282
"Algorithms: lr, gb merged, with LogisticRegression",0.992162,0.968254,0.941358,0.954617,0.002962,0.058642,0.999003
"Algorithms: rf, lr merged, with LogisticRegression",0.991892,0.977273,0.929012,0.952532,0.002073,0.070988,0.997752
"Algorithms: lr, rf, gb merged, with LogisticRegression (with appended features)",0.991081,0.964856,0.932099,0.948195,0.003258,0.067901,0.993531
"Algorithms: rf, gb merged, with LogisticRegression (with appended features)",0.992703,0.971429,0.944444,0.957746,0.002666,0.055556,0.992919
"Algorithms: lr, gb merged, with LogisticRegression (with appended features)",0.990541,0.95873,0.932099,0.945227,0.003851,0.067901,0.993459
"Algorithms: rf, lr merged, with LogisticRegression (with appended features)",0.989459,0.961165,0.916667,0.938389,0.003555,0.083333,0.993355
"Algorithms: lr, rf, gb merged, with RandomForestClassifier",0.992432,0.968354,0.944444,0.95625,0.002962,0.055556,0.998952
"Algorithms: rf, gb merged, with RandomForestClassifier",0.991081,0.950464,0.947531,0.948995,0.004739,0.052469,0.998049


These models did not perform as well as the previous ones, but in general they were better than the baseline trained on the merged features.

In [89]:
# Keep the 10 merged-feature runs with the highest F1 score and add them to the "best" table.
top_merged_runs = results_stacking_imbalanced_merged.sort_values(by='F1 Score', ascending=False).head(10)
results_stacking_imbalanced_best = pd.concat([results_stacking_imbalanced_best, top_merged_runs])

In [90]:
# Add every merged-feature run to the table holding all imbalanced stacking results.
results_stacking_imbalanced_full = pd.concat(
    [results_stacking_imbalanced_full, results_stacking_imbalanced_merged]
)

In [91]:
# Rank the collected best runs by F1 score (highest first) and show the top 13.
ranked_best_imbalanced = results_stacking_imbalanced_best.sort_values(by='F1 Score', ascending=False)
ranked_best_imbalanced.head(13)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, gb, with GradientBoostingClassifier (with appended features)",0.994054,0.977848,0.953704,0.965625,0.002073,0.046296,0.999021
"Algorithms: lr, gb, with GradientBoostingClassifier",0.993784,0.974763,0.953704,0.964119,0.00237,0.046296,0.998875
"Algorithms: all, with GradientBoostingClassifier (with appended features)",0.993784,0.980831,0.947531,0.963893,0.001777,0.052469,0.999175
"Algorithms: all, with LogisticRegression",0.993784,0.983923,0.944444,0.96378,0.001481,0.055556,0.996701
"Algorithms: lr, gb, with LogisticRegression",0.993514,0.980769,0.944444,0.962264,0.001777,0.055556,0.997376
"Algorithms: gb, with LogisticRegression (with appended features)",0.993514,0.983871,0.941358,0.962145,0.001481,0.058642,0.993473
"Algorithms: lr, with GradientBoostingClassifier (with appended features)",0.993514,0.983871,0.941358,0.962145,0.001481,0.058642,0.998843
"Algorithms: lr, rf, with GradientBoostingClassifier (with appended features)",0.993243,0.971609,0.950617,0.960998,0.002666,0.049383,0.998888
"Algorithms: lr, rf, gb, with LogisticRegression",0.993243,0.974603,0.947531,0.960876,0.00237,0.052469,0.996843
"Algorithms: all, with RandomForestClassifier",0.993243,0.974603,0.947531,0.960876,0.00237,0.052469,0.998764
