In [69]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sqlite3
from collections import Counter
from tqdm.autonotebook import tqdm
import warnings

# processing and pre-processing imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import ConvergenceWarning

# ml model selection imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# ml models
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

import machumachine

# metric computation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn import metrics

# ensemble models
from mlxtend.classifier import StackingClassifier
from mlxtend.classifier import StackingCVClassifier

In [70]:
X = pd.read_pickle('results/2/X.pkl').values
y = pd.read_pickle('results/2/target.pkl').values

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [72]:
xgboost = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0,
               learning_rate=0.1, max_delta_step=0, max_depth=3, max_features=3,
               min_child_weight=1, min_samples_leaf=3, n_estimators=100, n_jobs=1, nthread=None,
               objective='multi:softprob', random_state=0, reg_alpha=0,
               reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
               subsample=1, verbosity=1)
lr = LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False)
gbc = GradientBoostingClassifier(criterion='friedman_mse', init=None,
                            learning_rate=0.5, loss='deviance', max_depth=3,
                            max_features=5, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=5, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=400,
                            n_iter_no_change=None, presort='auto',
                            random_state=None, subsample=1.0, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False)
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=6, max_features=5, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=600,
                        n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                        warm_start=False)
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                      weights='uniform')
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                        max_features=3, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best')

models = [xgboost, lr, gbc, rf, knn, dt]

In [87]:
# meta classifier
mc = xgb.XGBClassifier() 
    
# stacking classifier
sclf = StackingCVClassifier(classifiers=models[:6],
                            shuffle=False,
                            use_probas=True,
                            cv=5,
                            meta_classifier=mc)


classifier_array = models
classifier_array.append(sclf)
labels = [clf.__class__.__name__ for clf in classifier_array]

In [88]:
clf_dict = dict(zip(labels, classifier_array))

In [89]:
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    for name in clf_dict.keys():
        print(name)
        ml_model = clf_dict[name]

        ml_model.fit(X_train, y_train)

        clf_dict[name] = ml_model

XGBClassifier
LogisticRegression
GradientBoostingClassifier
RandomForestClassifier
KNeighborsClassifier
DecisionTreeClassifier
StackingCVClassifier


In [90]:
results = pd.DataFrame()
for key in clf_dict.keys():
    # Make prediction on test set
    y_pred = clf_dict[key].predict(X_test)
    
    # Save results in pandas dataframe object
    results[f"{key}"] = y_pred

# Add the test set to the results object
results["Target"] = y_test

In [91]:
accuracy_score(results['Target'], results['StackingCVClassifier'])

0.7073170731707317

In [92]:
for i in results.columns[:-1]:
    print(i, accuracy_score(results['Target'], results[i]))

XGBClassifier 0.7235772357723578
LogisticRegression 0.6504065040650406
GradientBoostingClassifier 0.6422764227642277
RandomForestClassifier 0.573170731707317
KNeighborsClassifier 0.5447154471544715
DecisionTreeClassifier 0.46747967479674796
StackingCVClassifier 0.7073170731707317


In [93]:
def meta_class(row):
    return Counter(np.array(row)).most_common(1)[0][0]

In [94]:
accuracy_score(y_test, results.iloc[:,:-1].apply(meta_class, axis=1))

0.6991869918699187

In [95]:
from itertools import combinations

In [96]:
combi_models = [[i[0],i[1],i[2]] for i in combinations(models,3)]

## Iterate Stacking for Multiple Model-Pairs

In [97]:
all_results = []
for i in tqdm(combi_models):
    sclf = StackingCVClassifier(classifiers=i,
                            shuffle=False,
                            use_probas=False,
                            cv=5,
                            meta_classifier=xgb.XGBClassifier())


    classifier_array = i
    classifier_array.append(sclf)
    labels = [clf.__class__.__name__ for clf in classifier_array]
    
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        for name in clf_dict.keys():
            ml_model = clf_dict[name]

            ml_model.fit(X_train, y_train)

            clf_dict[name] = ml_model
            
    results = pd.DataFrame()
    for key in clf_dict.keys():
        # Make prediction on test set
        y_pred = clf_dict[key].predict(X_test)

        # Save results in pandas dataframe object
        results[f"{key}"] = y_pred

    # Add the test set to the results object
    results["Target"] = y_test
    
    all_results.append(results)

HBox(children=(IntProgress(value=0, max=56), HTML(value='')))




In [98]:
accuracies = [accuracy_score(y_test, i['StackingCVClassifier']) for i in all_results]
np.array(accuracies).max()

0.7317073170731707

In [101]:
new_acc = []
for i in all_results:
    if len(i.columns) > 4:
        new_acc.append(accuracy_score(y_test, i.iloc[:,:-2].apply(meta_class, axis=1)))
    else:
        pass

In [102]:
np.array(new_acc).max()

0.7398373983739838

In [103]:
np.argmax(np.array(new_acc))

18

In [109]:
print(classification_report(y_test, all_results[18].apply(meta_class, axis=1)))

              precision    recall  f1-score   support

           0       0.82      0.73      0.77       104
           1       0.72      0.84      0.77       114
           2       0.84      0.57      0.68        28

    accuracy                           0.76       246
   macro avg       0.79      0.71      0.74       246
weighted avg       0.77      0.76      0.76       246



In [110]:
confusion_matrix(y_test, all_results[18].apply(meta_class, axis=1))

array([[76, 28,  0],
       [15, 96,  3],
       [ 2, 10, 16]])