In [3]:
### IMPORTS ###
import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_validate

## Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from xgboost import plot_importance

## Evaluation
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [40]:
### FUNCTIONS ###

## Pickle model probabilities
def saveProba(clf, X, name):
    """Pickle the positive-class probabilities that `clf` predicts for `X`.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``.
    X : feature matrix, passed unchanged to ``clf.predict_proba``.
    name : str
        File name (e.g. ``"et_train.pkl"``) created under ``dump/proba/``.

    Notes
    -----
    The *second* ``predict_proba`` column is stored. scikit-learn orders the
    columns by ``clf.classes_``, so for 0/1 labels column 1 is P(y = 1) --
    the quantity that thresholding with ``> 0.5 -> 1`` expects. Column 0
    would be P(y = 0) and yields inverted labels after thresholding.
    Pickles written from column 0 must be regenerated (or flipped with
    ``1 - p``) before they are combined with files written by this version.
    """
    yproba = clf.predict_proba(X)
    path = "dump/proba/" + name
    # Create the output folder on demand so a fresh checkout does not fail in open().
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pkl.dump(yproba[:, 1], f)
    # Report success only once the file has actually been written.
    print("Saved proba to:", path)

## Saves predicted labels to csv file
def saveResult(y_pred, name):
    """Write predicted labels to ``dump/submissions/<name>`` as a two-column CSV.

    `y_pred` must expose ``.size`` (e.g. a NumPy array or pandas Series).
    The file has columns ``id`` (1..n, in order) and ``prediction``; the
    DataFrame index is not written.
    """
    row_ids = np.arange(1, y_pred.size + 1)
    submission = pd.DataFrame({"id": row_ids, "prediction": y_pred})
    out_path = "dump/submissions/" + name
    print("Saved submission file to:", out_path)
    submission.to_csv(out_path, index=False)

## Converts probabilities in df to labels
def getPredictions(X):
    """Threshold a DataFrame of probabilities into 0.0/1.0 labels.

    Entries strictly greater than 0.5 become 1.0, everything else 0.0.
    Column names are taken from `X`; the result is built from a plain array,
    so it carries a fresh default index rather than `X`'s index.
    """
    labels = np.where(X > 0.5, 1.0, 0.0)
    return pd.DataFrame(labels, columns=list(X))

In [46]:
# Import data
# Locations of the pickled train/validation split and the test features
TRAIN_RPATH = "dump/data/train_vld1.pkl"
TEST_RPATH = "dump/data/test1.pkl"

# The training pickle must unpack into exactly four objects, in this order
with open(TRAIN_RPATH, "rb") as train_fh:
    train_vld_split = pkl.load(train_fh)
X_train, X_vld, y_train, y_vld = train_vld_split

with open(TEST_RPATH, "rb") as test_fh:
    X_test = pkl.load(test_fh)

In [43]:
## Stacking: Model prediction probabilities
## Load the pickled base-model probabilities for both splits

def _unpickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as fh:
        return pkl.load(fh)

## Training data
et_proba_train = _unpickle("dump/proba/et_train.pkl")
xgb_proba_train = _unpickle("dump/proba/xgb_train.pkl")
knn_proba_train = _unpickle("dump/proba/knn_train.pkl")

# One column per base model, in ET / XGB / KNN order
model_proba_train = dict(
    ET=et_proba_train,
    XGB=xgb_proba_train,
    KNN=knn_proba_train,
)
proba_df_train = pd.DataFrame(model_proba_train)

## Validation data
et_proba_vld = _unpickle("dump/proba/et_vld.pkl")
xgb_proba_vld = _unpickle("dump/proba/xgb_vld.pkl")
knn_proba_vld = _unpickle("dump/proba/knn_vld.pkl")

model_proba_vld = dict(
    ET=et_proba_vld,
    XGB=xgb_proba_vld,
    KNN=knn_proba_vld,
)
proba_df_vld = pd.DataFrame(model_proba_vld)

In [44]:
## Convert probabilities to labels
# Threshold each split's probability frame into 0/1 label frames
pred_df_train, pred_df_vld = (
    getPredictions(frame) for frame in (proba_df_train, proba_df_vld)
)

In [50]:
## Stacking: Majority voting
# Fraction of base models that voted 1 for each validation row
voting_proba = pred_df_vld.mean(axis=1)
# A strict majority (> 0.5) becomes 1, anything else 0
voting_pred = np.where(voting_proba <= 0.5, 0, 1).tolist()

voting_report = classification_report(y_vld, voting_pred)
voting_cm = confusion_matrix(y_vld, voting_pred)
print(voting_report)
print(voting_cm)

## Why does it perform poorly?
# NOTE(review): an accuracy far below 0.5 on a binary task is what inverted
# labels look like. Confirm the loaded probability pickles hold P(class 1)
# and not P(class 0) before thresholding them.
# Show the per-model votes next to the vote share (rows 1-19)
voting_proba_df = pd.concat([pred_df_vld, voting_proba], axis=1)
print(voting_proba_df.iloc[1:20, :])

              precision    recall  f1-score   support

           0       0.22      0.06      0.09      4291
           1       0.10      0.35      0.16      1368

    accuracy                           0.13      5659
   macro avg       0.16      0.20      0.13      5659
weighted avg       0.19      0.13      0.11      5659

[[ 251 4040]
 [ 895  473]]
     ET  XGB  KNN         0
1   1.0  1.0  1.0  1.000000
2   0.0  0.0  0.0  0.000000
3   1.0  1.0  1.0  1.000000
4   1.0  0.0  1.0  0.666667
5   1.0  1.0  1.0  1.000000
6   1.0  1.0  1.0  1.000000
7   0.0  0.0  0.0  0.000000
8   1.0  1.0  1.0  1.000000
9   1.0  1.0  1.0  1.000000
10  1.0  1.0  1.0  1.000000
11  1.0  1.0  1.0  1.000000
12  1.0  1.0  1.0  1.000000
13  1.0  1.0  1.0  1.000000
14  0.0  0.0  0.0  0.000000
15  1.0  1.0  1.0  1.000000
16  1.0  1.0  1.0  1.000000
17  1.0  1.0  1.0  1.000000
18  1.0  1.0  1.0  1.000000
19  1.0  1.0  1.0  1.000000


In [161]:
## Stacking: Soft averaging
# Mean of the base-model probabilities for each validation row
avg_proba_vld = proba_df_vld.mean(axis=1).rename("avg_proba")

# Threshold the series computed just above. The earlier version iterated over
# `avg_proba`, a name no visible cell defines, so on a fresh kernel it raised
# NameError (or silently used a stale variable). The `> 0.5 -> 1` rule matches
# getPredictions, and the vectorised form keeps the row index for the concat below.
avg_pred_vld = (avg_proba_vld > 0.5).astype(int).rename("avg_pred")

print(classification_report(y_vld, avg_pred_vld))
print(confusion_matrix(y_vld, avg_pred_vld))

## Why are the results so poor?
# NOTE(review): an accuracy far below 0.5 on a binary task is what inverted
# labels look like. Confirm the loaded probability pickles hold P(class 1)
# and not P(class 0) before thresholding them.
# Named columns ("avg_proba", "avg_pred") replace the anonymous 0 / 1 headers.
avg_proba_df = pd.concat([proba_df_vld, avg_proba_vld, avg_pred_vld], axis=1)
print(avg_proba_df.iloc[1:20, :])

              precision    recall  f1-score   support

           0       0.23      0.06      0.10      4291
           1       0.11      0.35      0.16      1368

    accuracy                           0.13      5659
   macro avg       0.17      0.21      0.13      5659
weighted avg       0.20      0.13      0.11      5659

[[ 265 4026]
 [ 886  482]]
          ET       XGB  KNN         0  1
1   0.729519  0.844431  0.8  0.791317  1
2   0.201713  0.118797  0.0  0.106836  0
3   0.964321  0.995205  1.0  0.986509  1
4   0.510816  0.472781  0.6  0.527866  1
5   0.982296  0.996829  1.0  0.993042  1
6   0.986826  0.993241  1.0  0.993355  1
7   0.237310  0.128716  0.2  0.188675  0
8   0.953067  0.912019  0.8  0.888362  1
9   0.732120  0.573693  0.6  0.635271  1
10  0.979065  0.989133  1.0  0.989399  1
11  0.521623  0.785004  1.0  0.768876  1
12  0.904420  0.988995  1.0  0.964472  1
13  0.972584  0.988822  1.0  0.987135  1
14  0.441857  0.007465  0.0  0.149774  0
15  0.691750  0.816730  1.0  0.

In [162]:
## Stacking: Decision tree
# Shallow tree used as meta-learner on the base-model probability columns
tree_params = dict(max_depth=2, min_samples_leaf=10)
tree = DecisionTreeClassifier(**tree_params)
tree.fit(proba_df_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [163]:
## Stacking: Decision tree
# Evaluate the stacked tree on the training and validation sets
# (classification report + confusion matrix, because of the class imbalance)
tree_pred_train = tree.predict(proba_df_train)
tree_pred_vld = tree.predict(proba_df_vld)

# Training split first, then validation split
for y_true, y_hat in ((y_train, tree_pred_train), (y_vld, tree_pred_vld)):
    print(classification_report(y_true, y_hat))
    print(confusion_matrix(y_true, y_hat))

              precision    recall  f1-score   support

           0       0.91      0.94      0.93     12717
           1       0.80      0.74      0.77      4257

    accuracy                           0.89     16974
   macro avg       0.86      0.84      0.85     16974
weighted avg       0.89      0.89      0.89     16974

[[11932   785]
 [ 1120  3137]]
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      4291
           1       0.68      0.65      0.66      1368

    accuracy                           0.84      5659
   macro avg       0.79      0.78      0.78      5659
weighted avg       0.84      0.84      0.84      5659

[[3878  413]
 [ 482  886]]


In [170]:
## Stacked: MLP
# Small multi-layer perceptron used as meta-learner on the probability columns
mlp_params = dict(
    hidden_layer_sizes=(3, 5, 5, 5, 3),
    max_iter=100,
    alpha=1e-4,
    solver='adam',
    verbose=10,
    random_state=0,  # fixed seed for repeatable training runs
    learning_rate_init=0.01,
)
mlp = MLPClassifier(**mlp_params)
mlp.fit(proba_df_train, y_train)

Iteration 1, loss = 0.46745831
Iteration 2, loss = 0.31808861
Iteration 3, loss = 0.27843819
Iteration 4, loss = 0.26099776
Iteration 5, loss = 0.24872636
Iteration 6, loss = 0.24257144
Iteration 7, loss = 0.23768028
Iteration 8, loss = 0.23365271
Iteration 9, loss = 0.23675523
Iteration 10, loss = 0.23024300
Iteration 11, loss = 0.22689122
Iteration 12, loss = 0.22635926
Iteration 13, loss = 0.22526589
Iteration 14, loss = 0.22574884
Iteration 15, loss = 0.22372949
Iteration 16, loss = 0.22317104
Iteration 17, loss = 0.22266964
Iteration 18, loss = 0.22083900
Iteration 19, loss = 0.22223246
Iteration 20, loss = 0.22113174
Iteration 21, loss = 0.21990961
Iteration 22, loss = 0.21867005
Iteration 23, loss = 0.21791368
Iteration 24, loss = 0.21817857
Iteration 25, loss = 0.21900352
Iteration 26, loss = 0.21659337
Iteration 27, loss = 0.21590761
Iteration 28, loss = 0.21652977
Iteration 29, loss = 0.21857920
Iteration 30, loss = 0.21545123
Iteration 31, loss = 0.21647542
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(3, 5, 5, 5, 3), learning_rate='constant',
              learning_rate_init=0.01, max_fun=15000, max_iter=100,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=0, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=10,
              warm_start=False)

In [171]:
## Stacking: MLP
# Evaluate the stacked MLP on the training and validation sets
# (classification report + confusion matrix, because of the class imbalance)
mlp_pred_train = mlp.predict(proba_df_train)
mlp_pred_vld = mlp.predict(proba_df_vld)

# Training split first, then validation split
for y_true, y_hat in ((y_train, mlp_pred_train), (y_vld, mlp_pred_vld)):
    print(classification_report(y_true, y_hat))
    print(confusion_matrix(y_true, y_hat))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93     12717
           1       0.81      0.76      0.79      4257

    accuracy                           0.90     16974
   macro avg       0.87      0.85      0.86     16974
weighted avg       0.89      0.90      0.89     16974

[[11957   760]
 [ 1011  3246]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      4291
           1       0.72      0.66      0.69      1368

    accuracy                           0.86      5659
   macro avg       0.81      0.79      0.80      5659
weighted avg       0.85      0.86      0.85      5659

[[3937  354]
 [ 460  908]]


### Stacking
#### Base models
- Different preprocessing steps for different models
- Z-score normalisation for logistic regression and neural networks
- Tree-based models need no normalisation: they are invariant to monotonic feature transformations
- k-NN: Normalisation based on distance measure
- GBMs require parameter tuning! (If learning rate is low it requires more trees)
- Use PCA or t-SNE for preprocessing! -> For k-NN?

#### Evaluation of base models
- To evaluate the correlation between the predictions of two models:
    1. Hamming distance
    2. Matthews correlation coefficient

#### Final predictive layer
- Majority voting: Wisdom of the crowd
- Logistic regression: Weighting of the models?
- Decision tree: If this model predicts 1 it is 100% correct, else depend on the prediction from another model
- Neural network: Learn patterns?

### Questions
- How to use linear regression to blend predictions in a classification task?
- Feature importance sorted by Gini index?
- Evaluation: Use of micro vs macro scores?
- How to deal with class imbalance?

### To-do
1. EDA: PCA and t-SNE

### Thoughts
- Models find it easy to differentiate one subset of negative samples