In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

#### Importing csv file

In [3]:
fname = "../input/creditcard.csv"
df = pd.read_csv(fname)

**Converting dataframe to array and splitting into training and test datasets**

In [4]:
features = np.array(df.columns[:-1])
label = np.array(df.columns[-1])
data = df.values
X = data[: , :-1]
y = data[: , -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [5]:
kf = KFold(n_splits = 5)

**ENSEMBLE START**

In [None]:
# function to display scores for classifier clf, with training data x_tr, y_tr and test data
# x_te, y_te
def scores(clf, x_tr, y_tr, x_te, y_te):
    clf.fit(x_tr, y_tr)
    pred = clf.predict(x_te)
    print('Precision:',precision_score(y_te, pred))
    print('Recall:',recall_score(y_te, pred))
    print('F1:',f1_score(y_te, pred))
    print('Confusion Matrix (tn, fp, fn, tp):',confusion_matrix(y_te, pred).ravel())

In [8]:
# function to return metafeatures for classifier clf, data x, labels y with kf k-fold 
def get_metafeatures(clf, x, y, kf):
    meta_feat = np.zeros((len(x),))
    for train_index, test_index in kf.split(x, y):
        X_tr, X_te = x[train_index], x[test_index]
        y_tr, y_te = y[train_index], y[test_index]
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_te)
        meta_feat[test_index] = y_pred
    return meta_feat.reshape(-1, 1)

In [9]:
# best KNN classifier after hyperparameter tuning based on F1 score
b_knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
pred_train_knn = get_metafeatures(b_knn, X_train, y_train, kf)
pred_test_knn = get_metafeatures(b_knn, X_test, y_test, kf)
scores(b_knn, X_train, y_train, X_test, y_test)

Precision: 0.4838709677419355
Recall: 0.20134228187919462
F1: 0.28436018957345977
Confusion Matrix (tn, fp, fn, tp): [93806    32   119    30]


**KNN Scores**
* Precision: 0.4838709677419355
* Recall: 0.20134228187919462
* F1: 0.28436018957345977
* Confusion Matrix (tn, fp, fn, tp): [93806    32   119    30]

In [10]:
# best RF classifier after hyperparameter tuning based on F1 score
b_rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=42, verbose=0,
            warm_start=False) 
pred_train_rf = get_metafeatures(b_rf, X_train, y_train, kf) 
pred_test_rf = get_metafeatures(b_rf, X_test, y_test, kf)
scores(b_rf, X_train, y_train, X_test, y_test)


Precision: 0.9426229508196722
Recall: 0.7718120805369127
F1: 0.8487084870848709
Confusion Matrix (tn, fp, fn, tp): [93831     7    34   115]


**RF Scores**
* Precision: 0.9426229508196722
* Recall: 0.7718120805369127
* F1: 0.8487084870848709
* Confusion Matrix (tn, fp, fn, tp): [93831     7    34   115]

In [11]:
# best XGB classifier after hyperparameter tuning based on F1 score
b_xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=210,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
pred_train_xgb = get_metafeatures(b_xgb, X_train, y_train, kf)
pred_test_xgb = get_metafeatures(b_xgb, X_test, y_test, kf)
scores(b_xgb, X_train, y_train, X_test, y_test)

Precision: 0.953125
Recall: 0.8187919463087249
F1: 0.8808664259927798
Confusion Matrix (tn, fp, fn, tp): [93832     6    27   122]


**XGB Scores**
* Precision: 0.953125
* Recall: 0.8187919463087249
* F1: 0.8808664259927798
* Confusion Matrix (tn, fp, fn, tp): [93832     6    27   122]

In [12]:
# best LR classifier after hyperparameter tuning based on F1 score
b_lr = LogisticRegression(C=0.11, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=78, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
pred_train_lr = get_metafeatures(b_lr, X_train, y_train, kf)
pred_test_lr = get_metafeatures(b_lr, X_test, y_test, kf)
scores(b_lr, X_train, y_train, X_test, y_test)


Precision: 0.8761904761904762
Recall: 0.6174496644295302
F1: 0.7244094488188977
Confusion Matrix (tn, fp, fn, tp): [93825    13    57    92]


**LR Scores**
* Precision: 0.8761904761904762
* Recall: 0.6174496644295302
* F1: 0.7244094488188977
* Confusion Matrix (tn, fp, fn, tp): [93825    13    57    92]

**Appending predictions from base classes to original set of features**

In [13]:
X_train = np.hstack((X_train, pred_train_knn))
X_test = np.hstack((X_test, pred_test_knn))
print(np.shape(X_train), np.shape(X_test))

(190820, 31) (93987, 31)


In [14]:
X_train = np.hstack((X_train, pred_train_rf))
X_test = np.hstack((X_test, pred_test_rf))
print(np.shape(X_train), np.shape(X_test))

(190820, 32) (93987, 32)


In [15]:
X_train = np.hstack((X_train, pred_train_xgb))
X_test = np.hstack((X_test, pred_test_xgb))
print(np.shape(X_train), np.shape(X_test))

(190820, 33) (93987, 33)


In [16]:
X_train = np.hstack((X_train, pred_train_lr))
X_test = np.hstack((X_test, pred_test_lr))
print(np.shape(X_train), np.shape(X_test))

(190820, 34) (93987, 34)


**Running Ensemble XGB model with stacked features**

In [None]:
ens_xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=210,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
scores(ens_xgb, X_train, y_train, X_test, y_test)

**Stacked Ensemble XGB Scores**
* Precision: 0.9461538461538461
* Recall: 0.825503355704698
* F1: 0.881720430107527
* Confusion Matrix (tn, fp, fn, tp): [93831     7    26   123]
