In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
# make a prediction with a stacking ensemble
# compare ensemble to each baseline classifier
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.utils import column_or_1d


# get the dataset
def get_dataset():

    #==============================================
    # LOAD DATASET                                |
    #==============================================

    df = pd.read_csv ('/Users/imarcal/Library/CloudStorage/OneDrive-Pessoal/Documentos/Unesp/Doutorado/stacking/Models creation/Single Project Training/Single-model/Commons_cli_ok/dataset.csv')
    df.head()

    #==============================================
    # SPLIT DATASET                               |
    #==============================================
    X = df.iloc[:, 0:6]
    y = df.iloc[:, 6:7]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    #==============================================
    # TREAT IMBALANCE                             |
    #==============================================
    #SMOTE strategy
    from imblearn.over_sampling import SMOTE

    # Resample the minority class. You can change the strategy to 'auto' if you are not sure.
    sm = SMOTE(sampling_strategy='minority', random_state=7)

    # Fit the model to generate the data. - Oversampling the training data after split
    oversampled_trainX, oversampled_trainY = sm.fit_resample(X_train, y_train)
    print(len(oversampled_trainX))
    print(len(oversampled_trainY))

    return oversampled_trainX, oversampled_trainY, X_test, y_test


# get a stacking ensemble of models
def get_model():
    cart_clf = DecisionTreeClassifier(max_depth=5, min_samples_split=10, criterion='entropy')
    return cart_clf

# evaluate a give model using cross-validation
def evaluate_model(model, X, y, scoring):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=20, random_state=1)
    scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=20, error_score='raise', return_estimator=True)
    return scores


In [3]:
import numpy as np
from sklearn.metrics import make_scorer, precision_score, recall_score, average_precision_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate

model = get_model()



oversampled_trainX, oversampled_trainY, X_test, y_test = get_dataset()
y_train_test_1d = np.array(oversampled_trainY).ravel()

scoring = {'accuracy': 'accuracy',
           'precision': make_scorer(precision_score, average='weighted', zero_division=1),
           'recall': make_scorer(recall_score, average='weighted'),
           'f1': make_scorer(f1_score, average='weighted'),
           'roc_auc': make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr'),
           'log_loss': 'neg_log_loss'
           }

score_results = evaluate_model(model, oversampled_trainX, y_train_test_1d, scoring)
score_results

2076
2076


{'fit_time': array([0.00866604, 0.09833717, 0.00263596, 0.00869989, 0.00252891,
        0.00235176, 0.00239015, 0.00710702, 0.0078218 , 0.01134706,
        0.00262499, 0.00253296, 0.00442505, 0.031075  , 0.00242591,
        0.00517106, 0.02910113, 0.00251198, 0.00367379, 0.00313306,
        0.06984091, 0.00736475, 0.00468612, 0.00261569, 0.00235391,
        0.06222892, 0.00639582, 0.00195599, 0.0048461 , 0.00293922,
        0.00295496, 0.07548618, 0.00438309, 0.00642991, 0.020365  ,
        0.00809121, 0.00880098, 0.00210118, 0.00551915, 0.00463796,
        0.00371885, 0.0051899 , 0.00208282, 0.00201702, 0.00549603,
        0.00472903, 0.01945972, 0.00279808, 0.00200009, 0.00432801,
        0.03781581, 0.00445509, 0.00362802, 0.00296903, 0.00414705,
        0.01070881, 0.00222206, 0.00655794, 0.0021131 , 0.00575686,
        0.00481725, 0.00826907, 0.00416827, 0.00316882, 0.00211   ,
        0.00588703, 0.00354195, 0.00209904, 0.002918  , 0.00242591,
        0.00224185, 0.00337195, 0.00

In [4]:
print('precision', score_results['test_precision'].mean())
print('recall', score_results['test_recall'].mean())
print('accuracy', score_results['test_accuracy'].mean())
print('F1', score_results['test_f1'].mean())
print('roc auc', score_results['test_roc_auc'].mean())
print('log loss', score_results['test_log_loss'].mean())

precision 0.9763251819164444
recall 0.9755062012263099
accuracy 0.9755062012263099
F1 0.9754929115151318
roc auc 0.9871047634392773
log loss -0.1921048422135521
