In [40]:

# make a prediction with a stacking ensemble
# compare ensemble to each baseline classifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [42]:
import np as np
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# get the dataset
def get_dataset():

    #==============================================
    # LOAD DATASET                                |
    #==============================================

    global X_train, X_test, y_train, y_test
    df = pd.read_csv ('C:/Users/inmar/OneDrive/Documentos/Unesp/Doutorado/stacking/Models creation/Single Project Training/Single-model/Commons_compress/dataset.csv')
    df.head()

    #==============================================
    # SPLIT DATASET                               |
    #==============================================
    X = df.iloc[:, 0:6]
    y = df.iloc[:, 6:7]

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    for train_index, test_index in kfold.split(X, y):
        # select rows and columns
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # summarize train and test composition
        train_0, train_1 = y_train['buggy'].value_counts()[0], y_train['buggy'].value_counts()[1]
        test_0, test_1 = y_test['buggy'].value_counts()[0], y_test['buggy'].value_counts()[1]
        print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

    return X_train, X_test, y_train, y_test

In [43]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = get_dataset()
y_train_test_1d = np.array(y_train).ravel()

scoring = {'accuracy': 'accuracy',
           'precision': make_scorer(precision_score, average='weighted', zero_division=1),
           'recall': make_scorer(recall_score, average='weighted'),
           'f1': make_scorer(f1_score, average='weighted'),
           'roc_auc': make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr'),
           'log_loss': 'neg_log_loss',
           }

level0 = list()
level0.append(('lr', LogisticRegression(max_iter=1000, random_state=42)))
level0.append(('lda', LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')))
level0.append(('rf', RandomForestClassifier(n_estimators=100, random_state=42)))
level0.append(('bayes', GaussianNB()))
level0.append(('svm', SVC(probability=True, random_state=42)))
level0.append(('cart', DecisionTreeClassifier(max_depth=5, min_samples_split=10, criterion='entropy')))
level0.append(('knn', KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='kd_tree')))

# Create the voting classifier
vc = VotingClassifier(estimators=level0, voting='soft')

cv_results = cross_validate(vc, X_train, y_train_test_1d, cv=10, scoring=scoring, return_estimator=True)
cv_results

>Train: 0=6691, 1=404, Test: 0=744, 1=45
>Train: 0=6691, 1=404, Test: 0=744, 1=45
>Train: 0=6691, 1=404, Test: 0=744, 1=45
>Train: 0=6691, 1=404, Test: 0=744, 1=45
>Train: 0=6691, 1=405, Test: 0=744, 1=44
>Train: 0=6692, 1=404, Test: 0=743, 1=45
>Train: 0=6692, 1=404, Test: 0=743, 1=45
>Train: 0=6692, 1=404, Test: 0=743, 1=45
>Train: 0=6692, 1=404, Test: 0=743, 1=45
>Train: 0=6692, 1=404, Test: 0=743, 1=45


{'fit_time': array([1.66606927, 1.4778831 , 1.67619228, 1.7235837 , 1.57557726,
        1.64377546, 1.59041214, 1.8066659 , 1.60743666, 1.51054311]),
 'score_time': array([0.11853552, 0.12755203, 0.13951874, 0.13403726, 0.14853072,
        0.1452961 , 0.15061879, 0.1320169 , 0.11559129, 0.12365437]),
 'estimator': [VotingClassifier(estimators=[('lr',
                                LogisticRegression(max_iter=1000,
                                                   random_state=42)),
                               ('lda',
                                LinearDiscriminantAnalysis(shrinkage='auto',
                                                           solver='lsqr')),
                               ('rf', RandomForestClassifier(random_state=42)),
                               ('bayes', GaussianNB()),
                               ('svm', SVC(probability=True, random_state=42)),
                               ('cart',
                                DecisionTreeClassifier(criterio

In [44]:
print('precision', cv_results['test_precision'].mean())
print('recall', cv_results['test_recall'].mean())
print('accuracy', cv_results['test_accuracy'].mean())
print('F1', cv_results['test_f1'].mean())
print('roc auc', cv_results['test_roc_auc'].mean())
print('log loss', cv_results['test_log_loss'].mean())

precision 0.9463086373057704
recall 0.9430668070482131
accuracy 0.9430668070482131
F1 0.9154344218767234
roc auc 0.7217548457537316
log loss -0.2203156898818409


In [49]:
import numpy as np

result_proba = cv_results['estimator'][0].predict_proba(X_test.iloc[:1])
for i in range(1, len(cv_results['estimator'])):
    result_proba = np.add(result_proba, cv_results['estimator'][i].predict_proba(X_test))

result_proba = result_proba/10
print(np.array2string(result_proba, formatter={'float_kind':lambda x: "%.3f" % x}))

Unnamed: 0,number_of_authors,age,number_unique_changes,size,lines_added,lines_deleted
19,1,0,1,120,119,0
26,2,124257,2,69,14,59
34,1,137253,2,425,14,59
42,1,18829,3,473,8,1
53,2,526177,3,117,1,1
...,...,...,...,...,...,...
7817,6,938425,8,57,0,1
7835,20,11036,213,1581,0,1
7841,8,119496,27,102,0,1
7861,5,253330,11,48,0,1


In [47]:
# Assuming y_test contains the actual labels of the test set
y_pred = (result_proba[:, 1] > 0.5).astype(int)
y_pred

# auc_score = roc_auc_score(y_test, result_proba[:, 1])
# accuracy = (y_test == y_pred).mean()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [51]:
import joblib

joblib.dump(cv_results, 'cv_results.pkl')

['cv_results.pkl']