In [15]:
import pandas as pd
from imblearn.over_sampling import SMOTE
# make a prediction with a stacking ensemble
# compare ensemble to each baseline classifier
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [16]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.utils import column_or_1d

# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('lr', LogisticRegression(max_iter=1000)))
    level0.append(('lda', LinearDiscriminantAnalysis()))
    level0.append(('bayes', GaussianNB()))
    level0.append(('svm', SVC()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('rf', RandomForestClassifier()))


    # define meta learner model
    level1 = DecisionTreeClassifier()
    # define the stacking ensemble | cv = cross-validation
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

# evaluate a give model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores


In [21]:
from sklearn import preprocessing
import np as np
from sklearn.model_selection import train_test_split, StratifiedKFold
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.utils import column_or_1d


# get the dataset
def get_stratified_dataset():

    #==============================================
    # LOAD DATASET                                |
    #==============================================

    df = pd.read_csv ('/Users/imarcal/Library/CloudStorage/OneDrive-Pessoal/Documentos/Unesp/Doutorado/stacking/Models creation/Single Project Training/Single-model/Commons_cli_ok/dataset.csv')
    df.head()

    #==============================================
    # SPLIT DATASET                               |
    #==============================================
    X = df.iloc[:, 0:6]
    y = df.iloc[:, 6:7]

    # Feature Scaling for input features.
    scaler = preprocessing.MinMaxScaler()
    x_scaled = scaler.fit_transform(X)

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

    for train_index, test_index in kfold.split(X, y):
        # select rows and columns
        x_train, x_test = x_scaled[train_index], x_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # summarize train and test composition
        train_0, train_1 = y_train['buggy'].value_counts()[0], y_train['buggy'].value_counts()[1]
        test_0, test_1 = y_test['buggy'].value_counts()[0], y_test['buggy'].value_counts()[1]
        print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

    return x_train, x_test, y_train, y_test

In [22]:
from sklearn.metrics import make_scorer, precision_score, recall_score, average_precision_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate

model = get_stacking()

X_train_test, X_gtest, y_train_test, y_gtest = get_stratified_dataset()
y_train_test_1d = np.array(y_train_test).ravel()

scoring = {'accuracy': 'accuracy',
           'precision': make_scorer(precision_score, average='weighted', zero_division=1),
           'recall': make_scorer(recall_score, average='weighted'),
           'f1': make_scorer(f1_score, average='weighted'),
           'roc_auc': make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr'),
           'log_loss': 'neg_log_loss'
           }

cv_results = cross_validate(model, X_train_test, y_train_test_1d, cv=10, scoring=scoring, return_estimator=True)
cv_results

>Train: 0=1334, 1=59, Test: 0=149, 1=6
>Train: 0=1334, 1=59, Test: 0=149, 1=6
>Train: 0=1334, 1=59, Test: 0=149, 1=6
>Train: 0=1335, 1=58, Test: 0=148, 1=7
>Train: 0=1335, 1=58, Test: 0=148, 1=7
>Train: 0=1335, 1=58, Test: 0=148, 1=7
>Train: 0=1335, 1=58, Test: 0=148, 1=7
>Train: 0=1335, 1=58, Test: 0=148, 1=7
>Train: 0=1335, 1=59, Test: 0=148, 1=6
>Train: 0=1335, 1=59, Test: 0=148, 1=6


{'fit_time': array([0.65830803, 0.445889  , 0.43300295, 0.43548703, 0.42700124,
        0.42948294, 0.43109417, 0.43076181, 0.48580813, 0.42917585]),
 'score_time': array([0.01229906, 0.00936198, 0.00932479, 0.00910997, 0.00932193,
        0.00922394, 0.00907278, 0.00917006, 0.00908303, 0.00910211]),
 'estimator': [StackingClassifier(cv=5,
                     estimators=[('lr', LogisticRegression(max_iter=1000)),
                                 ('lda', LinearDiscriminantAnalysis()),
                                 ('bayes', GaussianNB()), ('svm', SVC()),
                                 ('knn', KNeighborsClassifier()),
                                 ('rf', RandomForestClassifier())],
                     final_estimator=DecisionTreeClassifier()),
  StackingClassifier(cv=5,
                     estimators=[('lr', LogisticRegression(max_iter=1000)),
                                 ('lda', LinearDiscriminantAnalysis()),
                                 ('bayes', GaussianNB()), ('svm

In [23]:
print('precision', cv_results['test_precision'].mean())
print('recall', cv_results['test_recall'].mean())
print('accuracy', cv_results['test_accuracy'].mean())
print('F1', cv_results['test_f1'].mean())
print('roc auc', cv_results['test_roc_auc'].mean())
print('log loss', cv_results['test_log_loss'].mean())

precision 0.9603723870564342
recall 0.9497995889003082
accuracy 0.9497995889003082
F1 0.9535051194648304
roc auc 0.7396594097183257
log loss -1.8094062176684762


In [24]:
gtest_score = []
for i in range(len(cv_results['estimator'])):
    gtest_score.append(cv_results['estimator'][i].score(X_gtest, y_gtest))

sum(gtest_score) / len(gtest_score)

0.9610389610389609

In [25]:
import numpy as np

result_proba = cv_results['estimator'][0].predict_proba(X_gtest.iloc[:1])
for i in range(1, len(cv_results['estimator'])):
    result_proba = np.add(result_proba, cv_results['estimator'][i].predict_proba(X_gtest))

result_proba = result_proba/10
print(np.array2string(result_proba, formatter={'float_kind':lambda x: "%.3f" % x}))

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'