In [1]:
# Standard library and pandas; allow up to 250 columns when a frame is displayed.
import os
import pandas as pd
pd.options.display.max_columns = 250

# Helper modules (presumably local to this project): `ml` provides the training and
# evaluation helpers called below, `separate_features_target` is applied to every loaded frame.
import machine_learning as ml
from preprocessing import separate_features_target

# Ignore scikit-learn ConvergenceWarning messages for the rest of the session.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
# Folder holding the CSV feature files, resolved against the current working directory.
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# File names per feature family: position 0 is the balanced split,
# position 1 the imbalanced split.
train = {
    family: [f'{prefix}_train_balanced.csv', f'{prefix}_train_imbalanced.csv']
    for family, prefix in (('stylometric', 'style'), ('word2vec', 'word2vec'))
}
test = {
    family: [f'{prefix}_test_balanced.csv', f'{prefix}_test_imbalanced.csv']
    for family, prefix in (('stylometric', 'style'), ('word2vec', 'word2vec'))
}

## Import Data

Since Word2Vec features outperformed the TF-IDF features, only the Word2Vec features will be used as the content features when testing the combination with the stylometric features.

### Balanced Dataset

In [3]:
def _load_balanced_csv(file_name):
    """Read one CSV from `csv_path`, using its first column as the index."""
    return pd.read_csv(
        os.path.join(csv_path, file_name),
        index_col=0,
        dtype={'email_class': 'bool', 'email_id': 'int16'},
    )


# Position 0 of each file list is the balanced split.
style_train_balanced_complete = _load_balanced_csv(train['stylometric'][0])
style_test_balanced_complete = _load_balanced_csv(test['stylometric'][0])

word2vec_train_balanced_complete = _load_balanced_csv(train['word2vec'][0])
word2vec_test_balanced_complete = _load_balanced_csv(test['word2vec'][0])

In [4]:
# Split every balanced frame with separate_features_target; the results are
# later indexed with 'features' and 'target'. Calls run in the listed order.
(
    style_train_balanced,
    style_test_balanced,
    word2vec_train_balanced,
    word2vec_test_balanced,
) = map(
    separate_features_target,
    (
        style_train_balanced_complete,
        style_test_balanced_complete,
        word2vec_train_balanced_complete,
        word2vec_test_balanced_complete,
    ),
)

### Imbalanced Dataset

In [5]:
def _load_imbalanced_csv(file_name):
    """Read one CSV from `csv_path`, using its first column as the index."""
    return pd.read_csv(
        os.path.join(csv_path, file_name),
        index_col=0,
        dtype={'email_class': 'bool', 'email_id': 'int16'},
    )


# Position 1 of each file list is the imbalanced split.
style_train_imbalanced_complete = _load_imbalanced_csv(train['stylometric'][1])
style_test_imbalanced_complete = _load_imbalanced_csv(test['stylometric'][1])

word2vec_train_imbalanced_complete = _load_imbalanced_csv(train['word2vec'][1])
word2vec_test_imbalanced_complete = _load_imbalanced_csv(test['word2vec'][1])

In [6]:
# Split every imbalanced frame with separate_features_target; the results are
# later indexed with 'features' and 'target'. Calls run in the listed order.
(
    style_train_imbalanced,
    style_test_imbalanced,
    word2vec_train_imbalanced,
    word2vec_test_imbalanced,
) = map(
    separate_features_target,
    (
        style_train_imbalanced_complete,
        style_test_imbalanced_complete,
        word2vec_train_imbalanced_complete,
        word2vec_test_imbalanced_complete,
    ),
)

# Merging Feature Sets

The simplest way of combining the information of the two different feature sets is to merge them into one set and then perform the predictions based on this concatenated set.

## Balanced Dataset

In [7]:
# Column-wise merge: Word2Vec columns first, stylometric columns after them.
_train_parts_balanced = [word2vec_train_balanced['features'], style_train_balanced['features']]
_test_parts_balanced = [word2vec_test_balanced['features'], style_test_balanced['features']]

style_content_train_balanced = pd.concat(_train_parts_balanced, axis='columns')
style_content_test_balanced = pd.concat(_test_parts_balanced, axis='columns')

### Train

#### Logistic Regression

In [8]:
# Fit logistic regression on the merged balanced features. The helper's return
# value is indexed by 'model' and 'scaler'; both are kept for the evaluation cells.
_lr_fit_balanced = ml.train_logistic_regression(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)
lr_style_content_balanced = _lr_fit_balanced['model']
lr_style_content_balanced_scaler = _lr_fit_balanced['scaler']

Train accuracy: 0.9944320712694877


#### Decision Tree

In [9]:
# Decision tree on the merged balanced training features.
dt_style_content_balanced = ml.train_decision_tree(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9862657757980697


#### Random Forest

In [10]:
# Random forest on the merged balanced training features.
rf_style_content_balanced = ml.train_random_forest(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9844097995545658


#### Gradient Boosting

In [11]:
# Gradient boosting on the merged balanced training features.
gb_style_content_balanced = ml.train_gradient_boost(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9974016332590943


#### Naive Bayes

In [12]:
# Fit naive Bayes on the merged balanced features (with remove_negatives=True).
# The helper's return value is indexed by 'model' and 'scaler'; both are kept.
_nb_fit_balanced = ml.train_naive_bayes(
    style_content_train_balanced,
    style_train_balanced['target'],
    show_train_accuracy=1,
    remove_negatives=True,
)
nb_style_content_balanced = _nb_fit_balanced['model']
nb_style_content_balanced_scaler = _nb_fit_balanced['scaler']

Train accuracy: 0.9510022271714922


### Results

In [13]:
# Keep each display label next to its fitted model so the two lists cannot drift apart.
_labelled_models_balanced = (
    ('Logistic Regression', lr_style_content_balanced),
    ('Decision Tree', dt_style_content_balanced),
    ('Random Forest', rf_style_content_balanced),
    ('Gradient Boosting Tree', gb_style_content_balanced),
    ('Naive Bayes', nb_style_content_balanced),
)
models = [fitted for _, fitted in _labelled_models_balanced]
names = [label for label, _ in _labelled_models_balanced]

In [14]:
# Evaluate all five models on the merged balanced test features; the two scalers
# passed last are the ones obtained when fitting logistic regression and naive Bayes.
results_style_content_balanced = ml.multi_model_results(
    models,
    names,
    style_content_test_balanced,
    style_test_balanced['target'],
    lr_style_content_balanced_scaler,
    nb_style_content_balanced_scaler,
)

In [15]:
# Display the test-set results for the models trained on the merged balanced features.
results_style_content_balanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.965875,0.955182,0.979885,0.967376,0.04908,0.020115,0.9916
Decision Tree,0.951039,0.951289,0.954023,0.952654,0.052147,0.045977,0.95892
Random Forest,0.968843,0.957983,0.982759,0.970213,0.046012,0.017241,0.993936
Gradient Boosting Tree,0.967359,0.960452,0.977011,0.968661,0.042945,0.022989,0.995901
Naive Bayes,0.936202,0.957958,0.916667,0.936858,0.042945,0.083333,0.978871


## Imbalanced Dataset

In [16]:
# Column-wise merge: Word2Vec columns first, stylometric columns after them.
_train_parts_imbalanced = [word2vec_train_imbalanced['features'], style_train_imbalanced['features']]
_test_parts_imbalanced = [word2vec_test_imbalanced['features'], style_test_imbalanced['features']]

style_content_train_imbalanced = pd.concat(_train_parts_imbalanced, axis='columns')
style_content_test_imbalanced = pd.concat(_test_parts_imbalanced, axis='columns')

### Train

#### Logistic Regression

In [17]:
# Fit logistic regression on the merged imbalanced features. The helper's return
# value is indexed by 'model' and 'scaler'; both are kept for the evaluation cells.
_lr_fit_imbalanced = ml.train_logistic_regression(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)
lr_style_content_imbalanced = _lr_fit_imbalanced['model']
lr_style_content_imbalanced_scaler = _lr_fit_imbalanced['scaler']

Train accuracy: 0.9927692931477227


#### Decision Tree

In [18]:
# Decision tree on the merged imbalanced training features.
dt_style_content_imbalanced = ml.train_decision_tree(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9833085552101636


#### Random Forest

In [19]:
# Random forest on the merged imbalanced training features.
rf_style_content_imbalanced = ml.train_random_forest(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9846600892012434


#### Gradient Boosting

In [20]:
# Gradient boosting on the merged imbalanced training features.
gb_style_content_imbalanced = ml.train_gradient_boost(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9947965941343425


#### Naive Bayes

In [21]:
# Fit naive Bayes on the merged imbalanced features (with remove_negatives=True).
# The helper's return value is indexed by 'model' and 'scaler'; both are kept.
_nb_fit_imbalanced = ml.train_naive_bayes(
    style_content_train_imbalanced,
    style_train_imbalanced['target'],
    show_train_accuracy=1,
    remove_negatives=True,
)
nb_style_content_imbalanced = _nb_fit_imbalanced['model']
nb_style_content_imbalanced_scaler = _nb_fit_imbalanced['scaler']

Train accuracy: 0.9080281119070145


### Results

In [22]:
# Keep each display label next to its fitted model so the two lists cannot drift apart.
_labelled_models_imbalanced = (
    ('Logistic Regression', lr_style_content_imbalanced),
    ('Decision Tree', dt_style_content_imbalanced),
    ('Random Forest', rf_style_content_imbalanced),
    ('Gradient Boosting Tree', gb_style_content_imbalanced),
    ('Naive Bayes', nb_style_content_imbalanced),
)
models = [fitted for _, fitted in _labelled_models_imbalanced]
names = [label for label, _ in _labelled_models_imbalanced]

In [23]:
# Evaluate all five models on the merged imbalanced test features; the two scalers
# passed last are the ones obtained when fitting logistic regression and naive Bayes.
results_style_content_imbalanced = ml.multi_model_results(
    models,
    names,
    style_content_test_imbalanced,
    style_test_imbalanced['target'],
    lr_style_content_imbalanced_scaler,
    nb_style_content_imbalanced_scaler,
)

In [24]:
# Display the test-set results for the models trained on the merged imbalanced features.
results_style_content_imbalanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.988108,0.940252,0.92284,0.931464,0.005628,0.07716,0.993472
Decision Tree,0.977838,0.903333,0.83642,0.86859,0.00859,0.16358,0.97308
Random Forest,0.979459,0.955882,0.802469,0.872483,0.003555,0.197531,0.99333
Gradient Boosting Tree,0.988649,0.946203,0.92284,0.934375,0.005036,0.07716,0.997138
Naive Bayes,0.912703,1.0,0.003086,0.006154,0.0,0.996914,0.984712


Comparing these with the content-only baseline, it is obvious that there is at least a small improvement in the balanced dataset and a more significant improvement in the imbalanced dataset.<br>
It seems that the extra features are helpful in order to achieve better accuracy on the bigger dataset.

Another interesting observation is that the false negative rate of the two best performing algorithms is the same. This means that both of them failed to detect the same number of phishing emails (25).

In [25]:
# NOTE: despite the "lr_dt" in its name, this frame holds the predictions of
# Logistic Regression ('lr') and Gradient Boosting ('gb').
_test_with_ids_imbalanced = pd.concat(
    [style_test_imbalanced_complete[['email_id', 'email_class']], style_content_test_imbalanced],
    axis=1,
)
lr_dt_predictions = ml.results_by_id(
    [lr_style_content_imbalanced, gb_style_content_imbalanced],
    ['lr', 'gb'],
    _test_with_ids_imbalanced,
    style_test_imbalanced_complete['email_id'],
    lr_style_content_imbalanced_scaler,
)

# Rows whose true class is True while both models predicted False.
_missed_by_both = (
    (lr_dt_predictions['True Class'] == True)
    & ((lr_dt_predictions['lr'] == False) & (lr_dt_predictions['gb'] == False))
)
lr_dt_predictions[_missed_by_both].shape[0]

15

15 of them were misclassified by both algorithms.

# Stacking

In machine learning, stacking refers to the process of using different learners (each one working best at learning a different part of the problem), called level 0 models, as intermediate steps and then using their outputs to train another learner, called the level 1 model. Thus, the final model is sometimes able to outperform the individual ones.

In this specific case, the different initial classifiers will be trained on both of the feature sets, and thus the final classifier will essentially combine information from both of them.

#### Final Classifiers

Only the three best classifiers will be used as level 1 classifiers, namely Logistic Regression (which is implemented by default), Random Forest and Gradient Boosting.

In [26]:
# Estimators passed as `final_classifier` in the stacking cells below;
# both take their seed from ml.alg_random_state.
rf = ml.RandomForestClassifier(
    max_depth=5,
    n_estimators=20,
    random_state=ml.alg_random_state,
)
gb = ml.GradientBoostingClassifier(
    loss='log_loss',
    max_depth=3,
    learning_rate=0.1,
    random_state=ml.alg_random_state,
)

## Balanced Dataset

#### Train Initial Models

In [27]:
# One {'name', 'features'} entry per feature family, stylometric first.
train_feature_sets_balanced = [
    {'name': label, 'features': split['features']}
    for label, split in (('style', style_train_balanced), ('word2vec', word2vec_train_balanced))
]
test_feature_sets_balanced = [
    {'name': label, 'features': split['features']}
    for label, split in (('style', style_test_balanced), ('word2vec', word2vec_test_balanced))
]

In [28]:
# Train the initial models on the balanced feature sets listed above.
stacking_models_balanced = ml.train_models(
    train_feature_sets_balanced,
    style_train_balanced['target'],
)

### Single-algorithm

First, the stacking will be done only on the same algorithms with different feature sets, while also testing for different final_classifiers.

In [29]:
# Empty accumulator; the single-algorithm stacking cells concatenate their results onto it.
results_stacking_balanced_single = pd.DataFrame()

#### Logistic Regression

In [30]:
# Single-algorithm stacking with the default final classifier (no
# `final_classifier` is passed). Each run keeps exactly one level 0 algorithm
# and excludes the other four, in the order lr, dt, rf, gb, nb.
_level0_algorithms = ['lr', 'dt', 'rf', 'gb', 'nb']

_single_rows = []
for _kept in _level0_algorithms:
    _excluded = [algo for algo in _level0_algorithms if algo != _kept]
    # A fresh list goes to each call so the helpers never share one list object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        exclude_models=list(_excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(_excluded),
    )
    _single_rows.append(stacked_preds['results'])

# Concatenate once for the whole batch instead of re-copying the frame per run.
results_stacking_balanced_single = pd.concat([results_stacking_balanced_single, *_single_rows])

#### Random Forest

In [31]:
# Single-algorithm stacking with `rf` as the final classifier. Each run keeps
# exactly one level 0 algorithm and excludes the other four, in the order
# lr, dt, rf, gb, nb.
_level0_algorithms = ['lr', 'dt', 'rf', 'gb', 'nb']

_single_rows = []
for _kept in _level0_algorithms:
    _excluded = [algo for algo in _level0_algorithms if algo != _kept]
    # A fresh list goes to each call so the helpers never share one list object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=rf,
        exclude_models=list(_excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(_excluded),
    )
    _single_rows.append(stacked_preds['results'])

# Concatenate once for the whole batch instead of re-copying the frame per run.
results_stacking_balanced_single = pd.concat([results_stacking_balanced_single, *_single_rows])

#### Gradient Boosting

In [32]:
# Single-algorithm stacking with `gb` as the final classifier. Each run keeps
# exactly one level 0 algorithm and excludes the other four, in the order
# lr, dt, rf, gb, nb.
_level0_algorithms = ['lr', 'dt', 'rf', 'gb', 'nb']

_single_rows = []
for _kept in _level0_algorithms:
    _excluded = [algo for algo in _level0_algorithms if algo != _kept]
    # A fresh list goes to each call so the helpers never share one list object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=gb,
        exclude_models=list(_excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(_excluded),
    )
    _single_rows.append(stacked_preds['results'])

# Concatenate once for the whole batch instead of re-copying the frame per run.
results_stacking_balanced_single = pd.concat([results_stacking_balanced_single, *_single_rows])

In [33]:
# Display the accumulated single-algorithm stacking results.
results_stacking_balanced_single

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, with LogisticRegression",0.965875,0.957746,0.977011,0.967283,0.046012,0.022989,0.993557
"Algorithms: dt, with LogisticRegression",0.956973,0.954416,0.962644,0.958512,0.04908,0.037356,0.982252
"Algorithms: rf, with LogisticRegression",0.970326,0.960674,0.982759,0.971591,0.042945,0.017241,0.992102
"Algorithms: gb, with LogisticRegression",0.974777,0.963585,0.988506,0.975887,0.039877,0.011494,0.995751
"Algorithms: nb, with LogisticRegression",0.940653,0.950292,0.933908,0.942029,0.052147,0.066092,0.980026
"Algorithms: lr, with RandomForestClassifier",0.968843,0.960563,0.979885,0.970128,0.042945,0.020115,0.995099
"Algorithms: dt, with RandomForestClassifier",0.962908,0.952381,0.977011,0.964539,0.052147,0.022989,0.985328
"Algorithms: rf, with RandomForestClassifier",0.968843,0.960563,0.979885,0.970128,0.042945,0.020115,0.99245
"Algorithms: gb, with RandomForestClassifier",0.976261,0.963687,0.991379,0.977337,0.039877,0.008621,0.995707
"Algorithms: nb, with RandomForestClassifier",0.940653,0.952941,0.931034,0.94186,0.04908,0.068966,0.985654


It is apparent that the results are better than the baseline models, and in some cases (when Random Forest or Gradient Boosting is used in at least one of the steps) even better than predicting with the merged feature sets.

The 6 best performing models will be kept to compare with other stacking configurations and the complete single-algorithm results dataset will be archived.

In [34]:
# Archive every single-algorithm result as an independent copy.
results_stacking_balanced_full = results_stacking_balanced_single.copy()

In [35]:
# Rank the single-algorithm runs by F1 score (highest first) and keep the top six.
_single_ranked_balanced = results_stacking_balanced_single.sort_values(
    by=['F1 Score'],
    ascending=[False],
)
results_stacking_balanced_best = _single_ranked_balanced.head(6)

### Multi-algorithm

Of course, it is possible to also use the outputs of more than one classifier, on both feature sets.

In [36]:
# Empty accumulator; the multi-algorithm stacking cells concatenate their results onto it.
results_stacking_balanced_multi = pd.DataFrame()

#### Logistic Regression

In [37]:
# Multi-algorithm stacking with the default final classifier (no
# `final_classifier` is passed). Each entry is (models to exclude, explicit
# result row name or None); only the "all algorithms" run sets a row name.
_multi_runs = [
    ([], "Algorithms: all, with LogisticRegression"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

_multi_rows = []
for _excluded, _row_name in _multi_runs:
    # A fresh list goes to each call so the helpers never share one list object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        exclude_models=list(_excluded),
    )
    # Leave `result_row_name` out entirely when no explicit name is wanted,
    # so ml.test_stacked_models applies whatever default it defines.
    _label_kwargs = {} if _row_name is None else {'result_row_name': _row_name}
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(_excluded),
        **_label_kwargs,
    )
    _multi_rows.append(stacked_preds['results'])

# Concatenate once for the whole batch instead of re-copying the frame per run.
results_stacking_balanced_multi = pd.concat([results_stacking_balanced_multi, *_multi_rows])

#### Random Forest

In [38]:
# Multi-algorithm stacking with `rf` as the final classifier. Each entry is
# (models to exclude, explicit result row name or None); only the
# "all algorithms" run sets a row name.
_multi_runs = [
    ([], "Algorithms: all, with RandomForestClassifier"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

_multi_rows = []
for _excluded, _row_name in _multi_runs:
    # A fresh list goes to each call so the helpers never share one list object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=rf,
        exclude_models=list(_excluded),
    )
    # Leave `result_row_name` out entirely when no explicit name is wanted,
    # so ml.test_stacked_models applies whatever default it defines.
    _label_kwargs = {} if _row_name is None else {'result_row_name': _row_name}
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(_excluded),
        **_label_kwargs,
    )
    _multi_rows.append(stacked_preds['results'])

# Concatenate once for the whole batch instead of re-copying the frame per run.
results_stacking_balanced_multi = pd.concat([results_stacking_balanced_multi, *_multi_rows])

#### Gradient Boosting

In [39]:
# Multi-algorithm stacking with `gb` as the final classifier. Each entry is
# (models to exclude, explicit result row name or None); only the
# "all algorithms" run sets a row name.
_multi_runs = [
    ([], "Algorithms: all, with GradientBoostingClassifier"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

_multi_rows = []
for _excluded, _row_name in _multi_runs:
    # A fresh list goes to each call so the helpers never share one list object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=gb,
        exclude_models=list(_excluded),
    )
    # Leave `result_row_name` out entirely when no explicit name is wanted,
    # so ml.test_stacked_models applies whatever default it defines.
    _label_kwargs = {} if _row_name is None else {'result_row_name': _row_name}
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(_excluded),
        **_label_kwargs,
    )
    _multi_rows.append(stacked_preds['results'])

# Concatenate once for the whole batch instead of re-copying the frame per run.
results_stacking_balanced_multi = pd.concat([results_stacking_balanced_multi, *_multi_rows])

In [40]:
# Display the accumulated multi-algorithm stacking results.
results_stacking_balanced_multi

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression",0.976261,0.966292,0.988506,0.977273,0.03681,0.011494,0.995954
"Algorithms: rf, lr, gb, with LogisticRegression",0.976261,0.966292,0.988506,0.977273,0.03681,0.011494,0.995989
"Algorithms: rf, gb, with LogisticRegression",0.974777,0.963585,0.988506,0.975887,0.039877,0.011494,0.99487
"Algorithms: lr, gb, with LogisticRegression",0.976261,0.968927,0.985632,0.977208,0.033742,0.014368,0.99643
"Algorithms: rf, lr, with LogisticRegression",0.974777,0.963585,0.988506,0.975887,0.039877,0.011494,0.995037
"Algorithms: all, with RandomForestClassifier",0.974777,0.963585,0.988506,0.975887,0.039877,0.011494,0.995209
"Algorithms: rf, lr, gb, with RandomForestClassifier",0.977745,0.966387,0.991379,0.978723,0.03681,0.008621,0.99613
"Algorithms: rf, gb, with RandomForestClassifier",0.974777,0.963585,0.988506,0.975887,0.039877,0.011494,0.993565
"Algorithms: lr, gb, with RandomForestClassifier",0.976261,0.963687,0.991379,0.977337,0.039877,0.008621,0.996598
"Algorithms: rf, lr, with RandomForestClassifier",0.973294,0.966102,0.982759,0.974359,0.03681,0.017241,0.995632


As expected, using more than one classifier consistently improves the classification accuracy. The best level 1 classifier seems to be Gradient Boosting, and Logistic Regression gives better results when used as a level 0 classifier. However, all combinations performed quite well.

On the other hand, Naive Bayes and Decision Tree classifiers do not affect the result at all or even reduce the accuracy when used with the Random Forest Classifier, so from now on they will be excluded in order to reduce the execution time.

The top 8 models will be added to the best model results dataset.

In [41]:
# Add the eight multi-algorithm runs with the highest F1 score to the best-model table.
# Note: re-running this cell appends the same rows again.
_top_multi_balanced = results_stacking_balanced_multi.sort_values(
    by=['F1 Score'],
    ascending=[False],
).head(8)
results_stacking_balanced_best = pd.concat([results_stacking_balanced_best, _top_multi_balanced])

In [42]:
# Archive all multi-algorithm results after the rows already stored.
# Note: re-running this cell appends the same rows again.
_archive_parts_balanced = [results_stacking_balanced_full, results_stacking_balanced_multi]
results_stacking_balanced_full = pd.concat(_archive_parts_balanced)

### Appending all features

Another variation of stacking consists of appending the predictions to the other feature sets and then training the final classifier with all the features.<br>
Since using both feature sets has been proven to improve accuracy on the baselines, the predictions will be appended to the merged feature set.

In [43]:
# Empty accumulator; the appended-features stacking cells concatenate their results onto it.
results_stacking_balanced_append = pd.DataFrame()

#### Logistic Regression

In [44]:
# Stacking with append_features=True and the default final classifier (no
# `final_classifier` is passed). Each entry is (models to exclude, explicit
# result row name or None); only the "all algorithms" run sets a row name.
_append_runs = [
    ([], "Algorithms: all, with LogisticRegression (with appended features)"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

_append_rows = []
for _excluded, _row_name in _append_runs:
    # A fresh list goes to each call so the helpers never share one list object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        exclude_models=list(_excluded),
        append_features=True,
    )
    # Leave `result_row_name` out entirely when no explicit name is wanted,
    # so ml.test_stacked_models applies whatever default it defines.
    _label_kwargs = {} if _row_name is None else {'result_row_name': _row_name}
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(_excluded),
        append_features=True,
        **_label_kwargs,
    )
    _append_rows.append(stacked_preds['results'])

# Concatenate once for the whole batch instead of re-copying the frame per run.
results_stacking_balanced_append = pd.concat([results_stacking_balanced_append, *_append_rows])

#### Random Forest

In [45]:
# Appended-feature stacking with `rf` as the final classifier.
# Each entry pairs the level-0 models to exclude with the extra keyword arguments
# for the test call; only the "all models" run names its result row explicitly.
rf_append_runs = (
    ((), {'result_row_name': "Algorithms: all, with RandomForestClassifier (with appended features)"}),
    (('dt', 'nb'), {}),
    (('dt', 'nb', 'lr'), {}),
    (('dt', 'nb', 'rf'), {}),
    (('dt', 'nb', 'gb'), {}),
)
for dropped_models, test_options in rf_append_runs:
    # A fresh list is handed to each call, exactly as the literal lists were before.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=rf,
        exclude_models=list(dropped_models),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(dropped_models),
        append_features=True,
        **test_options,
    )
    results_stacking_balanced_append = pd.concat([results_stacking_balanced_append, stacked_preds['results']])

#### Gradient Boosting

In [46]:
# Appended-feature stacking with `gb` as the final classifier.
# Each entry pairs the level-0 models to exclude with the extra keyword arguments
# for the test call; only the "all models" run names its result row explicitly.
gb_append_runs = (
    ((), {'result_row_name': "Algorithms: all, with GradientBoostingClassifier (with appended features)"}),
    (('dt', 'nb'), {}),
    (('dt', 'nb', 'lr'), {}),
    (('dt', 'nb', 'rf'), {}),
    (('dt', 'nb', 'gb'), {}),
)
for dropped_models, test_options in gb_append_runs:
    # A fresh list is handed to each call, exactly as the literal lists were before.
    stacked_clf = ml.train_stacked_models(
        stacking_models_balanced,
        train_feature_sets_balanced,
        style_train_balanced['target'],
        final_classifier=gb,
        exclude_models=list(dropped_models),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_balanced,
        test_feature_sets_balanced,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(dropped_models),
        append_features=True,
        **test_options,
    )
    results_stacking_balanced_append = pd.concat([results_stacking_balanced_append, stacked_preds['results']])

In [47]:
# Display the rows accumulated by the stacking runs above that used append_features=True.
results_stacking_balanced_append

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression (with appended features)",0.959941,0.94958,0.974138,0.961702,0.055215,0.025862,0.976024
"Algorithms: rf, lr, gb, with LogisticRegression (with appended features)",0.958457,0.949438,0.971264,0.960227,0.055215,0.028736,0.975125
"Algorithms: rf, gb, with LogisticRegression (with appended features)",0.956973,0.951841,0.965517,0.958631,0.052147,0.034483,0.974006
"Algorithms: lr, gb, with LogisticRegression (with appended features)",0.951039,0.948718,0.956897,0.95279,0.055215,0.043103,0.97873
"Algorithms: rf, lr, with LogisticRegression (with appended features)",0.958457,0.946927,0.974138,0.96034,0.058282,0.025862,0.971864
"Algorithms: all, with RandomForestClassifier (with appended features)",0.970326,0.960674,0.982759,0.971591,0.042945,0.017241,0.995857
"Algorithms: rf, lr, gb, with RandomForestClassifier (with appended features)",0.974777,0.966197,0.985632,0.975818,0.03681,0.014368,0.995884
"Algorithms: rf, gb, with RandomForestClassifier (with appended features)",0.968843,0.963173,0.977011,0.970043,0.039877,0.022989,0.994575
"Algorithms: lr, gb, with RandomForestClassifier (with appended features)",0.968843,0.963173,0.977011,0.970043,0.039877,0.022989,0.994548
"Algorithms: rf, lr, with RandomForestClassifier (with appended features)",0.973294,0.966102,0.982759,0.974359,0.03681,0.017241,0.995249


Adding the initial feature sets to the final classifier seems to mostly harm performance on the balanced dataset. This is most likely due to overfitting, since the level 1 classifier becomes extremely specialized at recognizing the emails provided in the training set and fails to generalize for the test set.

However, when using Gradient Boosting as the final classifier, it manages in some cases to outperform the model without the appended features. This is no surprise, since Boosting methods in general are somewhat more resistant to overfitting.

The top 6 of these models will be added to the best result dataset, for comparison.

In [48]:
# Rank the appended-feature runs by F1 Score (descending) and add the top six to the
# running table of best balanced-dataset results.
# NOTE: re-running this cell appends the same rows again, since it extends the table in place.
results_stacking_balanced_best = pd.concat([results_stacking_balanced_best, results_stacking_balanced_append.sort_values(by=['F1 Score'], ascending = [False]).head(6)])

In [49]:
# Add every appended-feature run (not just the top ones) to the full balanced-dataset results table.
# NOTE: re-running this cell appends the same rows again, since it extends the table in place.
results_stacking_balanced_full = pd.concat([results_stacking_balanced_full, results_stacking_balanced_append])

### Merged Classifiers

Finally, for the sake of completeness, try stacking the level 0 classifiers that were trained with the merged dataset.

In [50]:
# The merged experiment uses a single feature set, labelled 'merge', for both the
# training and the test side.
train_feature_sets_balanced_merged = [
    dict(name='merge', features=style_content_train_balanced),
]
test_feature_sets_balanced_merged = [
    dict(name='merge', features=style_content_test_balanced),
]

In [51]:
# lr and nb are bundled with their scalers; the other models are registered as-is.
lr_merged_balanced = dict(model=lr_style_content_balanced, scaler=lr_style_content_balanced_scaler)
nb_merged_balanced = dict(model=nb_style_content_balanced, scaler=nb_style_content_balanced_scaler)

# One descriptor per level-0 model, all tied to the 'merge' feature set.
merged_models_balanced = [
    dict(name=algorithm, features='merge', model=fitted)
    for algorithm, fitted in (
        ('lr', lr_merged_balanced),
        ('dt', dt_style_content_balanced),
        ('rf', rf_style_content_balanced),
        ('gb', gb_style_content_balanced),
        ('nb', nb_merged_balanced),
    )
]

In [52]:
# Start from an empty frame; each merged-model stacking run below concatenates its result row onto it.
results_stacking_balanced_merged = pd.DataFrame()

#### Logistic Regression

In [53]:
# Merged-model stacking with the default final classifier (no final_classifier passed).
# Each entry: (level-0 models to exclude, append_features flag, result row label).
merged_default_runs = (
    (('dt', 'nb'), False, "Algorithms: lr, rf, gb merged, with LogisticRegression"),
    (('dt', 'nb', 'lr'), False, "Algorithms: rf, gb merged, with LogisticRegression"),
    (('dt', 'nb', 'rf'), False, "Algorithms: lr, gb merged, with LogisticRegression"),
    (('dt', 'nb', 'gb'), False, "Algorithms: rf, lr merged, with LogisticRegression"),
    # Append features
    (('dt', 'nb'), True, "Algorithms: lr, rf, gb merged, with LogisticRegression (with appended features)"),
    (('dt', 'nb', 'lr'), True, "Algorithms: rf, gb merged, with LogisticRegression (with appended features)"),
    (('dt', 'nb', 'rf'), True, "Algorithms: lr, gb merged, with LogisticRegression (with appended features)"),
    (('dt', 'nb', 'gb'), True, "Algorithms: rf, lr merged, with LogisticRegression (with appended features)"),
)
for dropped_models, with_features, row_label in merged_default_runs:
    # A fresh list is handed to each call, exactly as the literal lists were before.
    stacked_clf = ml.train_stacked_models(
        merged_models_balanced,
        train_feature_sets_balanced_merged,
        style_train_balanced['target'],
        exclude_models=list(dropped_models),
        append_features=with_features,
    )
    stacked_preds = ml.test_stacked_models(
        merged_models_balanced,
        test_feature_sets_balanced_merged,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(dropped_models),
        append_features=with_features,
        result_row_name=row_label,
    )
    results_stacking_balanced_merged = pd.concat([results_stacking_balanced_merged, stacked_preds['results']])

#### Random Forest

In [54]:
# Merged-model stacking with `rf` as the final classifier.
# Each entry: (level-0 models to exclude, append_features flag, result row label).
merged_rf_runs = (
    (('dt', 'nb'), False, "Algorithms: lr, rf, gb merged, with RandomForestClassifier"),
    (('dt', 'nb', 'lr'), False, "Algorithms: rf, gb merged, with RandomForestClassifier"),
    (('dt', 'nb', 'rf'), False, "Algorithms: lr, gb merged, with RandomForestClassifier"),
    (('dt', 'nb', 'gb'), False, "Algorithms: rf, lr merged, with RandomForestClassifier"),
    # Append features
    (('dt', 'nb'), True, "Algorithms: lr, rf, gb merged, with RandomForestClassifier (with appended features)"),
    (('dt', 'nb', 'lr'), True, "Algorithms: rf, gb merged, with RandomForestClassifier (with appended features)"),
    (('dt', 'nb', 'rf'), True, "Algorithms: lr, gb merged, with RandomForestClassifier (with appended features)"),
    (('dt', 'nb', 'gb'), True, "Algorithms: rf, lr merged, with RandomForestClassifier (with appended features)"),
)
for dropped_models, with_features, row_label in merged_rf_runs:
    # A fresh list is handed to each call, exactly as the literal lists were before.
    stacked_clf = ml.train_stacked_models(
        merged_models_balanced,
        train_feature_sets_balanced_merged,
        style_train_balanced['target'],
        final_classifier=rf,
        exclude_models=list(dropped_models),
        append_features=with_features,
    )
    stacked_preds = ml.test_stacked_models(
        merged_models_balanced,
        test_feature_sets_balanced_merged,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(dropped_models),
        append_features=with_features,
        result_row_name=row_label,
    )
    results_stacking_balanced_merged = pd.concat([results_stacking_balanced_merged, stacked_preds['results']])

#### Gradient Boosting

In [55]:
# Merged-model stacking with `gb` as the final classifier.
# Each entry: (level-0 models to exclude, append_features flag, result row label).
merged_gb_runs = (
    (('dt', 'nb'), False, "Algorithms: lr, rf, gb merged, with GradientBoostingClassifier"),
    (('dt', 'nb', 'lr'), False, "Algorithms: rf, gb merged, with GradientBoostingClassifier"),
    (('dt', 'nb', 'rf'), False, "Algorithms: lr, gb merged, with GradientBoostingClassifier"),
    (('dt', 'nb', 'gb'), False, "Algorithms: rf, lr merged, with GradientBoostingClassifier"),
    # Append features
    (('dt', 'nb'), True, "Algorithms: lr, rf, gb merged, with GradientBoostingClassifier (with appended features)"),
    (('dt', 'nb', 'lr'), True, "Algorithms: rf, gb merged, with GradientBoostingClassifier (with appended features)"),
    (('dt', 'nb', 'rf'), True, "Algorithms: lr, gb merged, with GradientBoostingClassifier (with appended features)"),
    (('dt', 'nb', 'gb'), True, "Algorithms: rf, lr merged, with GradientBoostingClassifier (with appended features)"),
)
for dropped_models, with_features, row_label in merged_gb_runs:
    # A fresh list is handed to each call, exactly as the literal lists were before.
    stacked_clf = ml.train_stacked_models(
        merged_models_balanced,
        train_feature_sets_balanced_merged,
        style_train_balanced['target'],
        final_classifier=gb,
        exclude_models=list(dropped_models),
        append_features=with_features,
    )
    stacked_preds = ml.test_stacked_models(
        merged_models_balanced,
        test_feature_sets_balanced_merged,
        style_test_balanced['target'],
        stacked_clf,
        exclude_models=list(dropped_models),
        append_features=with_features,
        result_row_name=row_label,
    )
    results_stacking_balanced_merged = pd.concat([results_stacking_balanced_merged, stacked_preds['results']])

In [56]:
# Display the rows accumulated by the merged-model stacking runs above.
results_stacking_balanced_merged

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, rf, gb merged, with LogisticRegression",0.974777,0.966197,0.985632,0.975818,0.03681,0.014368,0.996747
"Algorithms: rf, gb merged, with LogisticRegression",0.97181,0.960784,0.985632,0.97305,0.042945,0.014368,0.995355
"Algorithms: lr, gb merged, with LogisticRegression",0.973294,0.966102,0.982759,0.974359,0.03681,0.017241,0.997144
"Algorithms: rf, lr merged, with LogisticRegression",0.97181,0.96338,0.982759,0.972973,0.039877,0.017241,0.996421
"Algorithms: lr, rf, gb merged, with LogisticRegression (with appended features)",0.95549,0.946629,0.968391,0.957386,0.058282,0.031609,0.975848
"Algorithms: rf, gb merged, with LogisticRegression (with appended features)",0.95549,0.946629,0.968391,0.957386,0.058282,0.031609,0.974076
"Algorithms: lr, gb merged, with LogisticRegression (with appended features)",0.958457,0.951977,0.968391,0.960114,0.052147,0.031609,0.973186
"Algorithms: rf, lr merged, with LogisticRegression (with appended features)",0.951039,0.943662,0.962644,0.953058,0.06135,0.037356,0.973891
"Algorithms: lr, rf, gb merged, with RandomForestClassifier",0.965875,0.947658,0.988506,0.967651,0.058282,0.011494,0.995394
"Algorithms: rf, gb merged, with RandomForestClassifier",0.965875,0.96034,0.974138,0.96719,0.042945,0.025862,0.996245


In general, adding the initial features to the level 1 classifier gives better results, but even the best performance was worse than the previous best.

This is likely because the level 0 classifiers were more specialized compared to training on both feature sets separately.

The top 10 results will be added to the dataset for comparison.

In [57]:
# Rank the merged-model runs by F1 Score (descending) and add the top ten to the
# running table of best balanced-dataset results.
# NOTE: re-running this cell appends the same rows again, since it extends the table in place.
results_stacking_balanced_best = pd.concat([results_stacking_balanced_best, results_stacking_balanced_merged.sort_values(by=['F1 Score'], ascending = [False]).head(10)])

In [58]:
# Add every merged-model run to the full balanced-dataset results table.
# NOTE: re-running this cell appends the same rows again, since it extends the table in place.
results_stacking_balanced_full = pd.concat([results_stacking_balanced_full, results_stacking_balanced_merged])

In [59]:
# Show the six entries of the best-results table with the highest F1 Score.
results_stacking_balanced_best.sort_values(by=['F1 Score'], ascending = [False]).head(6)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, gb, with GradientBoostingClassifier (with appended features)",0.980712,0.971831,0.991379,0.981508,0.030675,0.008621,0.996783
"Algorithms: rf, lr, gb, with GradientBoostingClassifier (with appended features)",0.980712,0.971831,0.991379,0.981508,0.030675,0.008621,0.996598
"Algorithms: all, with GradientBoostingClassifier (with appended features)",0.979228,0.969101,0.991379,0.980114,0.033742,0.008621,0.996201
"Algorithms: lr, gb, with GradientBoostingClassifier",0.979228,0.969101,0.991379,0.980114,0.033742,0.008621,0.996307
"Algorithms: lr, gb merged, with GradientBoostingClassifier (with appended features)",0.979228,0.971751,0.988506,0.980057,0.030675,0.011494,0.996668
"Algorithms: rf, lr, with GradientBoostingClassifier (with appended features)",0.979228,0.971751,0.988506,0.980057,0.030675,0.011494,0.996342


## Imbalanced Dataset

#### Train Initial Models

In [60]:
# Two named feature sets ('style' and 'word2vec') for the imbalanced data,
# built the same way for the training and the test split.
train_feature_sets_imbalanced = [
    dict(name='style', features=style_train_imbalanced['features']),
    dict(name='word2vec', features=word2vec_train_imbalanced['features']),
]
test_feature_sets_imbalanced = [
    dict(name='style', features=style_test_imbalanced['features']),
    dict(name='word2vec', features=word2vec_test_imbalanced['features']),
]

In [61]:
# Train the initial (level-0) models on the imbalanced feature sets.
# The stylometric frame's target is used as the label vector for every feature set —
# presumably the word2vec frame has the same rows in the same order; TODO confirm.
stacking_models_imbalanced = ml.train_models(train_feature_sets_imbalanced, style_train_imbalanced['target'])

### Single-algorithm

In [62]:
# Start from an empty frame; each single-algorithm stacking run below concatenates its result row onto it.
results_stacking_imbalanced_single = pd.DataFrame()

#### Logistic Regression

In [63]:
# One stacking run per algorithm: keep a single algorithm and exclude the other four.
# No final_classifier is passed, so train_stacked_models uses its own default.
single_algorithm_pool = ('lr', 'dt', 'rf', 'gb', 'nb')
for kept_algorithm in single_algorithm_pool:
    # Exclusions keep the pool's order, e.g. keeping 'lr' excludes ['dt', 'rf', 'gb', 'nb'].
    dropped_models = [name for name in single_algorithm_pool if name != kept_algorithm]
    # A fresh list is handed to each call, exactly as the literal lists were before.
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        exclude_models=list(dropped_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(dropped_models),
    )
    results_stacking_imbalanced_single = pd.concat([results_stacking_imbalanced_single, stacked_preds['results']])

#### Random Forest

In [64]:
# One stacking run per algorithm with `rf` as the final classifier:
# keep a single algorithm and exclude the other four.
single_algorithm_pool = ('lr', 'dt', 'rf', 'gb', 'nb')
for kept_algorithm in single_algorithm_pool:
    # Exclusions keep the pool's order, e.g. keeping 'lr' excludes ['dt', 'rf', 'gb', 'nb'].
    dropped_models = [name for name in single_algorithm_pool if name != kept_algorithm]
    # A fresh list is handed to each call, exactly as the literal lists were before.
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=rf,
        exclude_models=list(dropped_models),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(dropped_models),
    )
    results_stacking_imbalanced_single = pd.concat([results_stacking_imbalanced_single, stacked_preds['results']])

#### Gradient Boosting

In [65]:
# Level-1 Gradient Boosting stacked on ONE level-0 algorithm at a time.
# Each entry excludes four of the five base algorithms, leaving a single one
# (lr, dt, rf, gb, nb - in that order) in the stack.
single_gb_exclusions = [
    ['dt', 'rf', 'gb', 'nb'],
    ['lr', 'rf', 'gb', 'nb'],
    ['lr', 'dt', 'gb', 'nb'],
    ['lr', 'dt', 'rf', 'nb'],
    ['lr', 'dt', 'rf', 'gb'],
]

single_gb_rows = []
for single_gb_excluded in single_gb_exclusions:
    # Each call gets its own list copy so the helpers never share one object.
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=gb,
        exclude_models=list(single_gb_excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(single_gb_excluded),
    )
    single_gb_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_single = pd.concat([results_stacking_imbalanced_single, *single_gb_rows])

In [66]:
# Display the metrics accumulated for the single-algorithm stacking runs.
results_stacking_imbalanced_single

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, with LogisticRegression",0.986757,0.931034,0.916667,0.923795,0.006517,0.083333,0.992668
"Algorithms: dt, with LogisticRegression",0.981351,0.915309,0.867284,0.89065,0.007701,0.132716,0.983157
"Algorithms: rf, with LogisticRegression",0.982703,0.933333,0.864198,0.897436,0.005924,0.135802,0.993216
"Algorithms: gb, with LogisticRegression",0.98973,0.944099,0.938272,0.941176,0.005332,0.061728,0.995922
"Algorithms: nb, with LogisticRegression",0.968649,0.879562,0.743827,0.80602,0.009775,0.256173,0.984317
"Algorithms: lr, with RandomForestClassifier",0.987027,0.928571,0.92284,0.925697,0.006813,0.07716,0.993107
"Algorithms: dt, with RandomForestClassifier",0.98027,0.928328,0.839506,0.881686,0.00622,0.160494,0.985262
"Algorithms: rf, with RandomForestClassifier",0.982703,0.936242,0.861111,0.897106,0.005628,0.138889,0.994269
"Algorithms: gb, with RandomForestClassifier",0.989459,0.93578,0.944444,0.940092,0.00622,0.055556,0.997452
"Algorithms: nb, with RandomForestClassifier",0.974054,0.875,0.820988,0.847134,0.011256,0.179012,0.986173


The results were somewhat consistent with those of the balanced dataset. Stacking the Gradient Boosting classifiers from the two different feature sets achieved better performance than the merged dataset.

In [67]:
# Seed the table of all stacking results with an independent copy of the
# single-algorithm results.
results_stacking_imbalanced_full = results_stacking_imbalanced_single.copy(deep=True)

In [68]:
# Keep the six single-algorithm stacks with the highest F1 score.
single_ranked_by_f1 = results_stacking_imbalanced_single.sort_values(by='F1 Score', ascending=False)
results_stacking_imbalanced_best = single_ranked_by_f1.head(6)

### Multi-algorithm

In [69]:
# Empty accumulator; the multi-algorithm cells below concatenate their result rows onto it.
results_stacking_imbalanced_multi = pd.DataFrame()

#### Logistic Regression

In [70]:
# Multi-algorithm stacking with the helper's default final classifier
# (reported as LogisticRegression in the result rows).
# Each run is (algorithms to exclude, explicit row name or None); a row name is
# only supplied for the run that excludes nothing, the rest use the helper's
# own naming.
multi_lr_runs = [
    ([], "Algorithms: all, with LogisticRegression"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

multi_lr_rows = []
for multi_lr_excluded, multi_lr_row_name in multi_lr_runs:
    # Only forward result_row_name when a label was given, so the helper's
    # default is left untouched for the other runs.
    multi_lr_test_kwargs = {} if multi_lr_row_name is None else {'result_row_name': multi_lr_row_name}
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        exclude_models=list(multi_lr_excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(multi_lr_excluded),
        **multi_lr_test_kwargs,
    )
    multi_lr_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_multi = pd.concat([results_stacking_imbalanced_multi, *multi_lr_rows])

#### Random Forest

In [71]:
# Multi-algorithm stacking with `rf` as the final (level-1) classifier.
# Each run is (algorithms to exclude, explicit row name or None); a row name is
# only supplied for the run that excludes nothing.
multi_rf_runs = [
    ([], "Algorithms: all, with RandomForestClassifier"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

multi_rf_rows = []
for multi_rf_excluded, multi_rf_row_name in multi_rf_runs:
    # Only forward result_row_name when a label was given.
    multi_rf_test_kwargs = {} if multi_rf_row_name is None else {'result_row_name': multi_rf_row_name}
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=rf,
        exclude_models=list(multi_rf_excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(multi_rf_excluded),
        **multi_rf_test_kwargs,
    )
    multi_rf_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_multi = pd.concat([results_stacking_imbalanced_multi, *multi_rf_rows])

#### Gradient Boosting

In [72]:
# Multi-algorithm stacking with `gb` as the final (level-1) classifier.
# Each run is (algorithms to exclude, explicit row name or None); a row name is
# only supplied for the run that excludes nothing.
multi_gb_runs = [
    ([], "Algorithms: all, with GradientBoostingClassifier"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

multi_gb_rows = []
for multi_gb_excluded, multi_gb_row_name in multi_gb_runs:
    # Only forward result_row_name when a label was given.
    multi_gb_test_kwargs = {} if multi_gb_row_name is None else {'result_row_name': multi_gb_row_name}
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=gb,
        exclude_models=list(multi_gb_excluded),
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(multi_gb_excluded),
        **multi_gb_test_kwargs,
    )
    multi_gb_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_multi = pd.concat([results_stacking_imbalanced_multi, *multi_gb_rows])

In [73]:
# Display the metrics accumulated for the multi-algorithm stacking runs.
results_stacking_imbalanced_multi

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression",0.989459,0.943925,0.935185,0.939535,0.005332,0.064815,0.995641
"Algorithms: rf, lr, gb, with LogisticRegression",0.99,0.94704,0.938272,0.942636,0.005036,0.061728,0.995882
"Algorithms: rf, gb, with LogisticRegression",0.989189,0.940994,0.935185,0.93808,0.005628,0.064815,0.995323
"Algorithms: lr, gb, with LogisticRegression",0.99027,0.947205,0.941358,0.944272,0.005036,0.058642,0.996256
"Algorithms: rf, lr, with LogisticRegression",0.987838,0.937304,0.92284,0.930016,0.005924,0.07716,0.995168
"Algorithms: all, with RandomForestClassifier",0.991351,0.953416,0.947531,0.950464,0.004443,0.052469,0.997648
"Algorithms: rf, lr, gb, with RandomForestClassifier",0.99027,0.95,0.938272,0.944099,0.004739,0.061728,0.994959
"Algorithms: rf, gb, with RandomForestClassifier",0.99,0.952681,0.932099,0.942278,0.004443,0.067901,0.99726
"Algorithms: lr, gb, with RandomForestClassifier",0.991081,0.953271,0.944444,0.948837,0.004443,0.055556,0.997355
"Algorithms: rf, lr, with RandomForestClassifier",0.988378,0.946032,0.919753,0.932707,0.005036,0.080247,0.994868


As expected, these models performed better on average than stacking a single algorithm across the different feature sets. On the imbalanced dataset, Naive Bayes and Decision Tree did have some impact on the prediction accuracy. The best results were achieved when all algorithms were used as level 0 classifiers and Gradient Boosting or Random Forest was used on level 1.

In [74]:
# Add the twelve multi-algorithm stacks with the highest F1 score to the
# running "best" table.
multi_top_by_f1 = results_stacking_imbalanced_multi.sort_values(by='F1 Score', ascending=False).head(12)
results_stacking_imbalanced_best = pd.concat([results_stacking_imbalanced_best, multi_top_by_f1])

In [75]:
# Fold every multi-algorithm row into the table of all stacking results.
# Note: re-running this cell appends the same rows again.
results_stacking_imbalanced_full = pd.concat([results_stacking_imbalanced_full, results_stacking_imbalanced_multi])

### Appending all features

In [76]:
# Empty accumulator; the appended-features cells below concatenate their result rows onto it.
results_stacking_imbalanced_append = pd.DataFrame()

#### Logistic Regression

In [77]:
# Multi-algorithm stacking with append_features=True and the helper's default
# final classifier (reported as LogisticRegression in the result rows).
# Each run is (algorithms to exclude, explicit row name or None); a row name is
# only supplied for the run that excludes nothing.
append_lr_runs = [
    ([], "Algorithms: all, with LogisticRegression (with appended features)"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

append_lr_rows = []
for append_lr_excluded, append_lr_row_name in append_lr_runs:
    # Only forward result_row_name when a label was given.
    append_lr_test_kwargs = {} if append_lr_row_name is None else {'result_row_name': append_lr_row_name}
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        exclude_models=list(append_lr_excluded),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(append_lr_excluded),
        append_features=True,
        **append_lr_test_kwargs,
    )
    append_lr_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_append = pd.concat([results_stacking_imbalanced_append, *append_lr_rows])

#### Random Forest

In [78]:
# Multi-algorithm stacking with append_features=True and `rf` as the final
# (level-1) classifier.
# Each run is (algorithms to exclude, explicit row name or None); a row name is
# only supplied for the run that excludes nothing.
append_rf_runs = [
    ([], "Algorithms: all, with RandomForestClassifier (with appended features)"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

append_rf_rows = []
for append_rf_excluded, append_rf_row_name in append_rf_runs:
    # Only forward result_row_name when a label was given.
    append_rf_test_kwargs = {} if append_rf_row_name is None else {'result_row_name': append_rf_row_name}
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=rf,
        exclude_models=list(append_rf_excluded),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(append_rf_excluded),
        append_features=True,
        **append_rf_test_kwargs,
    )
    append_rf_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_append = pd.concat([results_stacking_imbalanced_append, *append_rf_rows])

#### Gradient Boosting

In [79]:
# Multi-algorithm stacking with append_features=True and `gb` as the final
# (level-1) classifier.
# Each run is (algorithms to exclude, explicit row name or None); a row name is
# only supplied for the run that excludes nothing.
append_gb_runs = [
    ([], "Algorithms: all, with GradientBoostingClassifier (with appended features)"),
    (['dt', 'nb'], None),
    (['dt', 'nb', 'lr'], None),
    (['dt', 'nb', 'rf'], None),
    (['dt', 'nb', 'gb'], None),
]

append_gb_rows = []
for append_gb_excluded, append_gb_row_name in append_gb_runs:
    # Only forward result_row_name when a label was given.
    append_gb_test_kwargs = {} if append_gb_row_name is None else {'result_row_name': append_gb_row_name}
    stacked_clf = ml.train_stacked_models(
        stacking_models_imbalanced,
        train_feature_sets_imbalanced,
        style_train_imbalanced['target'],
        final_classifier=gb,
        exclude_models=list(append_gb_excluded),
        append_features=True,
    )
    stacked_preds = ml.test_stacked_models(
        stacking_models_imbalanced,
        test_feature_sets_imbalanced,
        style_test_imbalanced['target'],
        stacked_clf,
        exclude_models=list(append_gb_excluded),
        append_features=True,
        **append_gb_test_kwargs,
    )
    append_gb_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_append = pd.concat([results_stacking_imbalanced_append, *append_gb_rows])

In [80]:
# Display the metrics accumulated for the appended-features stacking runs.
results_stacking_imbalanced_append

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: all, with LogisticRegression (with appended features)",0.975405,0.892256,0.817901,0.853462,0.009479,0.182099,0.986839
"Algorithms: rf, lr, gb, with LogisticRegression (with appended features)",0.975135,0.871795,0.839506,0.855346,0.011848,0.160494,0.977626
"Algorithms: rf, gb, with LogisticRegression (with appended features)",0.975405,0.865204,0.851852,0.858476,0.012737,0.148148,0.985631
"Algorithms: lr, gb, with LogisticRegression (with appended features)",0.977568,0.88746,0.851852,0.869291,0.010367,0.148148,0.986768
"Algorithms: rf, lr, with LogisticRegression (with appended features)",0.972703,0.863192,0.817901,0.839937,0.012441,0.182099,0.978714
"Algorithms: all, with RandomForestClassifier (with appended features)",0.99027,0.964516,0.92284,0.943218,0.003258,0.07716,0.997298
"Algorithms: rf, lr, gb, with RandomForestClassifier (with appended features)",0.990811,0.958861,0.935185,0.946875,0.003851,0.064815,0.997661
"Algorithms: rf, gb, with RandomForestClassifier (with appended features)",0.986486,0.956667,0.885802,0.919872,0.003851,0.114198,0.995491
"Algorithms: lr, gb, with RandomForestClassifier (with appended features)",0.99,0.964401,0.919753,0.941548,0.003258,0.080247,0.997247
"Algorithms: rf, lr, with RandomForestClassifier (with appended features)",0.99,0.961415,0.92284,0.941732,0.003555,0.07716,0.997043


Adding the initial feature sets to the final classifier also mostly harmed performance on the imbalanced dataset. The best-performing model only barely performed better than its counterpart without the appended features.

In [81]:
# Add the six appended-features stacks with the highest F1 score to the
# running "best" table.
append_top_by_f1 = results_stacking_imbalanced_append.sort_values(by='F1 Score', ascending=False).head(6)
results_stacking_imbalanced_best = pd.concat([results_stacking_imbalanced_best, append_top_by_f1])

In [82]:
# Fold every appended-features row into the table of all stacking results.
# Note: re-running this cell appends the same rows again.
results_stacking_imbalanced_full = pd.concat([results_stacking_imbalanced_full, results_stacking_imbalanced_append])

### Merged Classifiers

In [83]:
# A single feature set named 'merge' for each split, wrapping the combined
# style+content frames so they fit the list-of-feature-sets shape the stacking
# helpers are called with.
train_feature_sets_imbalanced_merged = [
    dict(name='merge', features=style_content_train_imbalanced),
]
test_feature_sets_imbalanced_merged = [
    dict(name='merge', features=style_content_test_imbalanced),
]

In [84]:
# lr and nb are bundled together with their scalers; dt, rf and gb are
# registered as-is.
lr_merged_imbalanced = dict(model=lr_style_content_imbalanced, scaler=lr_style_content_imbalanced_scaler)
nb_merged_imbalanced = dict(model=nb_style_content_imbalanced, scaler=nb_style_content_imbalanced_scaler)

# One entry per level-0 algorithm, all tied to the 'merge' feature set.
merged_models_imbalanced = [
    {'name': algorithm, 'features': 'merge', 'model': fitted}
    for algorithm, fitted in (
        ('lr', lr_merged_imbalanced),
        ('dt', dt_style_content_imbalanced),
        ('rf', rf_style_content_imbalanced),
        ('gb', gb_style_content_imbalanced),
        ('nb', nb_merged_imbalanced),
    )
]

In [85]:
# Empty accumulator; the merged-classifier cells below concatenate their result rows onto it.
results_stacking_imbalanced_merged = pd.DataFrame()

#### Logistic Regression

In [86]:
# Stacking of the merged-feature models with the helper's default final
# classifier (reported as LogisticRegression in the row names).
# Each run is (algorithms to exclude, label of the algorithms that remain).
merged_lr_runs = [
    (['dt', 'nb'], "lr, rf, gb"),
    (['dt', 'nb', 'lr'], "rf, gb"),
    (['dt', 'nb', 'rf'], "lr, gb"),
    (['dt', 'nb', 'gb'], "rf, lr"),
]

merged_lr_rows = []
# First pass without the original features, second pass with them appended.
for merged_lr_append, merged_lr_suffix in ((False, ""), (True, " (with appended features)")):
    for merged_lr_excluded, merged_lr_kept in merged_lr_runs:
        stacked_clf = ml.train_stacked_models(
            merged_models_imbalanced,
            train_feature_sets_imbalanced_merged,
            style_train_imbalanced['target'],
            exclude_models=list(merged_lr_excluded),
            append_features=merged_lr_append,
        )
        stacked_preds = ml.test_stacked_models(
            merged_models_imbalanced,
            test_feature_sets_imbalanced_merged,
            style_test_imbalanced['target'],
            stacked_clf,
            exclude_models=list(merged_lr_excluded),
            append_features=merged_lr_append,
            result_row_name=f"Algorithms: {merged_lr_kept} merged, with LogisticRegression{merged_lr_suffix}",
        )
        merged_lr_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, *merged_lr_rows])

#### Random Forest

In [87]:
# Stacking of the merged-feature models with `rf` as the final (level-1)
# classifier.
# Each run is (algorithms to exclude, label of the algorithms that remain).
merged_rf_runs = [
    (['dt', 'nb'], "lr, rf, gb"),
    (['dt', 'nb', 'lr'], "rf, gb"),
    (['dt', 'nb', 'rf'], "lr, gb"),
    (['dt', 'nb', 'gb'], "rf, lr"),
]

merged_rf_rows = []
# First pass without the original features, second pass with them appended.
for merged_rf_append, merged_rf_suffix in ((False, ""), (True, " (with appended features)")):
    for merged_rf_excluded, merged_rf_kept in merged_rf_runs:
        stacked_clf = ml.train_stacked_models(
            merged_models_imbalanced,
            train_feature_sets_imbalanced_merged,
            style_train_imbalanced['target'],
            final_classifier=rf,
            exclude_models=list(merged_rf_excluded),
            append_features=merged_rf_append,
        )
        stacked_preds = ml.test_stacked_models(
            merged_models_imbalanced,
            test_feature_sets_imbalanced_merged,
            style_test_imbalanced['target'],
            stacked_clf,
            exclude_models=list(merged_rf_excluded),
            append_features=merged_rf_append,
            result_row_name=f"Algorithms: {merged_rf_kept} merged, with RandomForestClassifier{merged_rf_suffix}",
        )
        merged_rf_rows.append(stacked_preds['results'])

# Append all rows at once instead of re-concatenating after every run.
results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, *merged_rf_rows])

#### Gradient Boosting

In [88]:
# Stack the merged-feature base models with `gb` as the final classifier.
# Every run excludes 'dt' and 'nb'; the optional third exclusion selects which
# of lr / rf / gb remain in the stack. The four combinations are run first with
# append_features=False and then again with append_features=True, in the same
# order as the result rows are expected to appear.
gb_stack_variants = [
    # (additional models to exclude, algorithms named in the result row)
    ((), "lr, rf, gb"),
    (('lr',), "rf, gb"),
    (('rf',), "lr, gb"),
    (('gb',), "rf, lr"),
]

# Collect the per-run result frames and concatenate once at the end instead of
# regrowing the results table after every run.
gb_stack_results = []
for with_features in (False, True):
    row_suffix = " (with appended features)" if with_features else ""
    for extra_excluded, algorithms in gb_stack_variants:
        excluded = ('dt', 'nb') + extra_excluded
        # Train and test must see the same exclusions and append_features flag;
        # each call gets its own list object, as in a hand-written call.
        stacked_clf = ml.train_stacked_models(
            merged_models_imbalanced,
            train_feature_sets_imbalanced_merged,
            style_train_imbalanced['target'],
            final_classifier=gb,
            exclude_models=list(excluded),
            append_features=with_features,
        )
        stacked_preds = ml.test_stacked_models(
            merged_models_imbalanced,
            test_feature_sets_imbalanced_merged,
            style_test_imbalanced['target'],
            stacked_clf,
            exclude_models=list(excluded),
            append_features=with_features,
            result_row_name=f"Algorithms: {algorithms} merged, with GradientBoostingClassifier{row_suffix}",
        )
        gb_stack_results.append(stacked_preds['results'])

results_stacking_imbalanced_merged = pd.concat([results_stacking_imbalanced_merged, *gb_stack_results])

In [89]:
results_stacking_imbalanced_merged

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: lr, rf, gb merged, with LogisticRegression",0.99,0.955556,0.929012,0.942097,0.004147,0.070988,0.998106
"Algorithms: rf, gb merged, with LogisticRegression",0.987568,0.931677,0.925926,0.928793,0.006517,0.074074,0.996373
"Algorithms: lr, gb merged, with LogisticRegression",0.989459,0.949527,0.929012,0.939158,0.004739,0.070988,0.998186
"Algorithms: rf, lr merged, with LogisticRegression",0.990811,0.964744,0.929012,0.946541,0.003258,0.070988,0.997569
"Algorithms: lr, rf, gb merged, with LogisticRegression (with appended features)",0.977297,0.892157,0.842593,0.866667,0.009775,0.157407,0.987852
"Algorithms: rf, gb merged, with LogisticRegression (with appended features)",0.976486,0.881029,0.845679,0.862992,0.01096,0.154321,0.985168
"Algorithms: lr, gb merged, with LogisticRegression (with appended features)",0.976757,0.891447,0.83642,0.863057,0.009775,0.16358,0.984063
"Algorithms: rf, lr merged, with LogisticRegression (with appended features)",0.977027,0.881789,0.851852,0.866562,0.01096,0.148148,0.985733
"Algorithms: lr, rf, gb merged, with RandomForestClassifier",0.988378,0.932308,0.935185,0.933744,0.006517,0.064815,0.997964
"Algorithms: rf, gb merged, with RandomForestClassifier",0.986486,0.92284,0.92284,0.92284,0.007405,0.07716,0.996386


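This approach did not perform as consistently as it did on the balanced dataset, but one model (rf and lr merged, stacked with GradientBoostingClassifier and appended features) managed to outperform everything else.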

In [90]:
# Add the ten merged-feature stacking runs with the highest F1 score to the
# running table of best results for the imbalanced dataset.
results_stacking_imbalanced_best = pd.concat(
    [
        results_stacking_imbalanced_best,
        results_stacking_imbalanced_merged
        .sort_values(by='F1 Score', ascending=False)
        .head(10),
    ]
)

In [91]:
# Add every merged-feature stacking result to the full imbalanced-dataset table.
results_stacking_imbalanced_full = pd.concat(
    [results_stacking_imbalanced_full, results_stacking_imbalanced_merged]
)

In [92]:
# Display the six best stacking results for the imbalanced dataset, ranked by F1 score.
(
    results_stacking_imbalanced_best
    .sort_values(by='F1 Score', ascending=False)
    .head(6)
)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
"Algorithms: rf, lr merged, with GradientBoostingClassifier (with appended features)",0.991622,0.956386,0.947531,0.951938,0.004147,0.052469,0.997899
"Algorithms: lr, gb, with GradientBoostingClassifier (with appended features)",0.991351,0.950617,0.950617,0.950617,0.004739,0.049383,0.998202
"Algorithms: all, with GradientBoostingClassifier (with appended features)",0.991351,0.953416,0.947531,0.950464,0.004443,0.052469,0.998227
"Algorithms: rf, lr, gb, with GradientBoostingClassifier (with appended features)",0.991351,0.953416,0.947531,0.950464,0.004443,0.052469,0.998245
"Algorithms: all, with RandomForestClassifier",0.991351,0.953416,0.947531,0.950464,0.004443,0.052469,0.997648
"Algorithms: all, with GradientBoostingClassifier",0.991081,0.944954,0.953704,0.949309,0.005332,0.046296,0.994784
