In [1]:
import pandas as pd
import numpy as np
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import StratifiedKFold
import prune
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import glob

In [2]:
# Directory that holds the .dat files; change this to match your local layout.
path = 'data'
# Every .dat file found directly under `path`.
allFiles = glob.glob("{}/*.dat".format(path))

# Base Ensembles

In [3]:
# File names of the datasets evaluated below, in the order used for every
# per-dataset table and array in this notebook.
Dataset = [
    'haberman.dat',
    'glass-0-1-2-3_vs_4-5-6.dat',
    'ecoli2.dat',
    'vehicle1.dat',
    'page-blocks0.dat',
    'glass0.dat',
    'glass1.dat',
    'ecoli3.dat',
    'wisconsin.dat',
]

In [4]:
# Baseline (unpruned) ensembles: accumulate AUC and G-Mean over every dataset
# and every stratified CV fold, then divide by the number of evaluations.
SEED = 42  # fixed seed so resampling and ensemble training are reproducible on re-run
BASELINE_NAMES = ['SMOTE-Bagging', 'Under-Bagging', 'SMOTE-Boost', 'RUS-Boost', 'EasyEnsemble']


def _load_dataset(file_path):
    """Return ``(X, y)`` for one comma-separated ``.dat`` file.

    Lines starting with '@' are skipped. The last column is taken as the
    class label and integer-encoded with ``LabelEncoder``; every other
    column is a feature.
    """
    # Features: numeric parse of the whole file, last (label) column dropped.
    numeric = np.genfromtxt(fname=file_path, comments='@', delimiter=',', autostrip=True)
    X = numeric[:, :-1]
    # Labels: read as text. The original passed both delimiter=',' and
    # delim_whitespace=True, which current pandas rejects; splitting on commas
    # and stripping blanks yields the same label tokens.
    table = pd.read_csv(file_path, comment='@', header=None, sep=',', skipinitialspace=True)
    labels = table.iloc[:, -1].astype(str).str.strip().values
    y = LabelEncoder().fit_transform(labels)
    return X, y


def _evaluate(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, predict once on the evaluation fold, return ``(AUC, G-Mean)``."""
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    # NOTE(review): AUC is computed from hard label predictions, exactly as in
    # the original cell; switching to predict_proba would change the reported
    # numbers - confirm what the stored pruning results used before changing it.
    return roc_auc_score(y_eval, y_pred), geometric_mean_score(y_eval, y_pred)


base_ensembles_AUC = np.zeros(len(BASELINE_NAMES))
base_ensembles_GM = np.zeros(len(BASELINE_NAMES))
# Defined once, outside the loop, so it also exists for the final averaging.
skf = StratifiedKFold(n_splits=5)

for fileName in Dataset:
    X, y = _load_dataset(path + '/' + fileName)

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Resampled copies of the training fold only; the test fold is untouched.
        X_sm, y_sm = SMOTE(random_state=SEED).fit_resample(X_train, y_train)
        X_rus, y_rus = RandomUnderSampler(random_state=SEED).fit_resample(X_train, y_train)

        # One (model, training data) pair per baseline, in BASELINE_NAMES order.
        candidates = [
            (BaggingClassifier(n_estimators=40, random_state=SEED), X_sm, y_sm),
            (BaggingClassifier(n_estimators=40, random_state=SEED), X_rus, y_rus),
            (AdaBoostClassifier(n_estimators=10, random_state=SEED), X_sm, y_sm),
            (RUSBoostClassifier(n_estimators=10, random_state=SEED), X_train, y_train),
            (EasyEnsembleClassifier(n_estimators=10, random_state=SEED), X_train, y_train),
        ]
        for k, (model, X_fit, y_fit) in enumerate(candidates):
            fold_auc, fold_gm = _evaluate(model, X_fit, y_fit, X_test, y_test)
            base_ensembles_AUC[k] += fold_auc
            base_ensembles_GM[k] += fold_gm

# Turn the accumulated sums into means over all (dataset, fold) evaluations.
n_evaluations = skf.n_splits * len(Dataset)
base_ensembles_AUC /= n_evaluations
base_ensembles_GM /= n_evaluations

# Building the data frames

In [5]:
# Load the stored results table: no header row, lines starting with '@' are
# treated as comments, and blank lines are skipped.
results = pd.read_csv(
    'RESULTS1.txt',
    header=None,
    comment='@',
    skip_blank_lines=True,
)

In [6]:
# Plain NumPy view of the whole results table (all rows, all columns).
scores = results.values

In [7]:
# Build, for every dataset and every generator/pruning combination:
#   * nump_aux / nump_aux_gm : numeric means of score column 0 (AUC) and 1 (G-Mean)
#   * matrix_AUC / matrix_GM : the same cells formatted as "mean±std" strings

# One value per entry of `Dataset` (presumably the imbalance ratio - confirm).
IR = np.array([2.78, 3.2, 5.46, 2.9, 8.79, 2.06, 1.82, 8.6, 1.86])
generator = ['SMOTE-Bagging', 'Under-Bagging', 'SMOTE-Boost', 'RUS-Boost', 'EasyEnsemble']
pruning = ['BB-Imb', 'MDM-Imb', 'Complementarity', 'Kappa', 'RE-GM']
# Each (dataset, generator, pruning) combination occupies this many
# consecutive rows of `scores`.
N_RUNS = 5

# Column labels, generator-major: all prunings of the first generator, then the next.
nameCol = [gen + '_' + prun for gen in generator for prun in pruning]
n_combos = len(nameCol)

# Fail loudly instead of silently averaging empty slices into NaN.
expected_rows = len(Dataset) * n_combos * N_RUNS
if len(scores) < expected_rows:
    raise ValueError(
        "scores has %d rows but %d are required (%d datasets x %d combinations x %d runs)"
        % (len(scores), expected_rows, len(Dataset), n_combos, N_RUNS)
    )

nump_aux = np.zeros((len(Dataset), n_combos))
nump_aux_gm = np.zeros((len(Dataset), n_combos))

# Formatted cells are collected as plain lists and turned into frames once,
# rather than writing strings one by one into float-typed frames (which also
# shared the same zero buffer in the original).
auc_cells = []
gm_cells = []
for dataset in range(len(Dataset)):
    auc_row = []
    gm_row = []
    for column_index in range(n_combos):
        # Same offset as 125*dataset + 25*i + 5*j with column_index = 5*i + j.
        a = (dataset * n_combos + column_index) * N_RUNS
        runs = scores[a:a + N_RUNS, :]
        run_mean = runs.mean(axis=0)
        run_std = runs.std(axis=0)

        nump_aux[dataset, column_index] = run_mean[0]
        nump_aux_gm[dataset, column_index] = run_mean[1]
        auc_row.append("%.4f" % run_mean[0] + '±' + "%.4f" % run_std[0])
        gm_row.append("%.4f" % run_mean[1] + '±' + "%.4f" % run_std[1])
    auc_cells.append(auc_row)
    gm_cells.append(gm_row)

matrix_AUC = pd.DataFrame(auc_cells, index=Dataset, columns=nameCol)
matrix_GM = pd.DataFrame(gm_cells, index=Dataset, columns=nameCol)

# Full table with every possible combination

## AUC

In [8]:
# Prepend the IR column as column 0. The guard keeps the cell safe to re-run:
# DataFrame.insert raises ValueError when the column already exists.
if 'IR' not in matrix_AUC.columns:
    matrix_AUC.insert(0, 'IR', IR)
matrix_AUC

Unnamed: 0,IR,SMOTE-Bagging_BB-Imb,SMOTE-Bagging_MDM-Imb,SMOTE-Bagging_Complementarity,SMOTE-Bagging_Kappa,SMOTE-Bagging_RE-GM,Under-Bagging_BB-Imb,Under-Bagging_MDM-Imb,Under-Bagging_Complementarity,Under-Bagging_Kappa,...,RUS-Boost_BB-Imb,RUS-Boost_MDM-Imb,RUS-Boost_Complementarity,RUS-Boost_Kappa,RUS-Boost_RE-GM,EasyEnsemble_BB-Imb,EasyEnsemble_MDM-Imb,EasyEnsemble_Complementarity,EasyEnsemble_Kappa,EasyEnsemble_RE-GM
haberman.dat,2.78,0.5792±0.0098,0.5612±0.0046,0.5795±0.0160,0.5734±0.0120,0.5734±0.0133,0.5958±0.0111,0.5918±0.0123,0.5953±0.0067,0.5868±0.0233,...,0.5742±0.0135,0.5790±0.0038,0.5807±0.0109,0.5837±0.0108,0.5747±0.0108,0.5780±0.0052,0.5772±0.0180,0.5802±0.0085,0.5854±0.0086,0.5904±0.0063
glass-0-1-2-3_vs_4-5-6.dat,3.2,0.9258±0.0110,0.9184±0.0091,0.9236±0.0038,0.9216±0.0038,0.9213±0.0077,0.8911±0.0087,0.8915±0.0145,0.8999±0.0128,0.8947±0.0081,...,0.8818±0.0099,0.8814±0.0114,0.8774±0.0102,0.8801±0.0069,0.8774±0.0109,0.9085±0.0076,0.8966±0.0065,0.9059±0.0122,0.9015±0.0091,0.9082±0.0080
ecoli2.dat,5.46,0.8824±0.0092,0.8887±0.0077,0.8789±0.0096,0.8876±0.0105,0.8880±0.0111,0.8020±0.0107,0.8011±0.0120,0.7953±0.0058,0.8061±0.0103,...,0.7785±0.0117,0.7736±0.0044,0.7755±0.0179,0.7687±0.0120,0.7819±0.0159,0.8411±0.0082,0.8386±0.0115,0.8328±0.0063,0.8417±0.0139,0.8428±0.0105
vehicle1.dat,2.9,0.7187±0.0076,0.7176±0.0050,0.7138±0.0045,0.7181±0.0098,0.7202±0.0082,0.7166±0.0018,0.7178±0.0045,0.7183±0.0051,0.7156±0.0053,...,0.6838±0.0043,0.6818±0.0049,0.6841±0.0065,0.6851±0.0045,0.6883±0.0082,0.7191±0.0035,0.7188±0.0071,0.7179±0.0088,0.7184±0.0065,0.7188±0.0062
page-blocks0.dat,8.79,0.9008±0.0022,0.9004±0.0027,0.9029±0.0013,0.9009±0.0028,0.9001±0.0016,0.8337±0.0039,0.8341±0.0015,0.8334±0.0026,0.8291±0.0038,...,0.8180±0.0020,0.8190±0.0030,0.8184±0.0032,0.8162±0.0023,0.8155±0.0026,0.8142±0.0018,0.8151±0.0016,0.8128±0.0007,0.8138±0.0008,0.8142±0.0019
glass0.dat,2.06,0.8391±0.0059,0.8227±0.0124,0.8302±0.0213,0.8346±0.0136,0.8333±0.0073,0.8087±0.0051,0.8137±0.0162,0.8063±0.0121,0.8044±0.0064,...,0.7698±0.0088,0.7681±0.0104,0.7650±0.0177,0.7747±0.0112,0.7665±0.0079,0.8041±0.0110,0.8031±0.0060,0.8105±0.0116,0.7989±0.0217,0.7975±0.0089
glass1.dat,1.82,0.7852±0.0137,0.7905±0.0106,0.7903±0.0053,0.7809±0.0092,0.7898±0.0176,0.7472±0.0128,0.7460±0.0061,0.7579±0.0070,0.7665±0.0195,...,0.7230±0.0079,0.7209±0.0200,0.7199±0.0049,0.7247±0.0158,0.7182±0.0107,0.7447±0.0101,0.7554±0.0110,0.7520±0.0066,0.7396±0.0128,0.7485±0.0117
ecoli3.dat,8.6,0.7322±0.0129,0.7407±0.0203,0.7538±0.0176,0.7555±0.0111,0.7473±0.0230,0.7032±0.0074,0.7119±0.0089,0.7020±0.0081,0.7096±0.0106,...,0.6755±0.0050,0.6755±0.0074,0.6714±0.0128,0.6718±0.0072,0.6699±0.0043,0.6951±0.0051,0.7016±0.0065,0.6970±0.0016,0.6962±0.0054,0.6973±0.0097
wisconsin.dat,1.86,0.9575±0.0027,0.9561±0.0018,0.9577±0.0039,0.9566±0.0022,0.9570±0.0025,0.9597±0.0012,0.9600±0.0020,0.9565±0.0028,0.9576±0.0026,...,0.9513±0.0011,0.9537±0.0009,0.9539±0.0022,0.9548±0.0015,0.9559±0.0018,0.9612±0.0017,0.9560±0.0026,0.9504±0.0152,0.9549±0.0032,0.9551±0.0017


## G Mean

In [9]:
# Prepend the IR column as column 0. The guard keeps the cell safe to re-run:
# DataFrame.insert raises ValueError when the column already exists.
if 'IR' not in matrix_GM.columns:
    matrix_GM.insert(0, 'IR', IR)
matrix_GM

Unnamed: 0,IR,SMOTE-Bagging_BB-Imb,SMOTE-Bagging_MDM-Imb,SMOTE-Bagging_Complementarity,SMOTE-Bagging_Kappa,SMOTE-Bagging_RE-GM,Under-Bagging_BB-Imb,Under-Bagging_MDM-Imb,Under-Bagging_Complementarity,Under-Bagging_Kappa,...,RUS-Boost_BB-Imb,RUS-Boost_MDM-Imb,RUS-Boost_Complementarity,RUS-Boost_Kappa,RUS-Boost_RE-GM,EasyEnsemble_BB-Imb,EasyEnsemble_MDM-Imb,EasyEnsemble_Complementarity,EasyEnsemble_Kappa,EasyEnsemble_RE-GM
haberman.dat,2.78,0.5407±0.0118,0.5157±0.0054,0.5412±0.0202,0.5333±0.0142,0.5304±0.0161,0.5530±0.0128,0.5476±0.0141,0.5519±0.0080,0.5412±0.0270,...,0.5273±0.0167,0.5341±0.0040,0.5359±0.0128,0.5391±0.0127,0.5289±0.0124,0.5321±0.0058,0.5294±0.0208,0.5341±0.0100,0.5407±0.0098,0.5467±0.0078
glass-0-1-2-3_vs_4-5-6.dat,3.2,0.9240±0.0113,0.9160±0.0091,0.9214±0.0040,0.9190±0.0037,0.9186±0.0079,0.8849±0.0086,0.8848±0.0158,0.8940±0.0140,0.8893±0.0088,...,0.8747±0.0104,0.8742±0.0123,0.8692±0.0111,0.8732±0.0077,0.8692±0.0115,0.9029±0.0071,0.8889±0.0071,0.8999±0.0134,0.8952±0.0093,0.9024±0.0085
ecoli2.dat,5.46,0.8761±0.0104,0.8833±0.0093,0.8726±0.0105,0.8824±0.0113,0.8824±0.0119,0.7803±0.0130,0.7784±0.0152,0.7723±0.0067,0.7849±0.0126,...,0.7502±0.0148,0.7442±0.0061,0.7474±0.0221,0.7379±0.0153,0.7545±0.0193,0.8276±0.0095,0.8248±0.0133,0.8181±0.0079,0.8280±0.0153,0.8294±0.0108
vehicle1.dat,2.9,0.7025±0.0082,0.7014±0.0054,0.6978±0.0049,0.7026±0.0111,0.7048±0.0089,0.6885±0.0023,0.6899±0.0046,0.6904±0.0053,0.6872±0.0062,...,0.6534±0.0050,0.6514±0.0060,0.6541±0.0069,0.6549±0.0053,0.6582±0.0098,0.6930±0.0041,0.6926±0.0082,0.6916±0.0095,0.6930±0.0066,0.6921±0.0068
page-blocks0.dat,8.79,0.8964±0.0024,0.8958±0.0030,0.8986±0.0015,0.8965±0.0030,0.8956±0.0018,0.8175±0.0048,0.8180±0.0018,0.8171±0.0031,0.8119±0.0046,...,0.7986±0.0025,0.8001±0.0036,0.7992±0.0040,0.7965±0.0029,0.7956±0.0032,0.7932±0.0022,0.7944±0.0020,0.7916±0.0008,0.7927±0.0009,0.7933±0.0023
glass0.dat,2.06,0.8348±0.0059,0.8173±0.0130,0.8250±0.0224,0.8293±0.0136,0.8283±0.0076,0.7997±0.0056,0.8054±0.0179,0.7976±0.0134,0.7953±0.0068,...,0.7576±0.0088,0.7555±0.0116,0.7530±0.0182,0.7629±0.0124,0.7549±0.0087,0.7937±0.0125,0.7930±0.0051,0.8000±0.0130,0.7889±0.0228,0.7890±0.0092
glass1.dat,1.82,0.7824±0.0141,0.7871±0.0104,0.7871±0.0056,0.7776±0.0091,0.7866±0.0181,0.7396±0.0126,0.7386±0.0061,0.7513±0.0074,0.7598±0.0200,...,0.7123±0.0086,0.7104±0.0211,0.7088±0.0052,0.7144±0.0170,0.7082±0.0111,0.7340±0.0102,0.7458±0.0108,0.7434±0.0072,0.7299±0.0142,0.7386±0.0123
ecoli3.dat,8.6,0.6917±0.0180,0.6998±0.0302,0.7182±0.0201,0.7222±0.0144,0.7110±0.0291,0.6409±0.0087,0.6523±0.0133,0.6390±0.0106,0.6492±0.0140,...,0.6016±0.0075,0.5999±0.0113,0.5938±0.0201,0.5951±0.0095,0.5927±0.0082,0.6292±0.0076,0.6376±0.0102,0.6325±0.0029,0.6299±0.0076,0.6318±0.0152
wisconsin.dat,1.86,0.9572±0.0027,0.9558±0.0018,0.9575±0.0039,0.9563±0.0022,0.9567±0.0025,0.9592±0.0012,0.9595±0.0019,0.9560±0.0027,0.9572±0.0026,...,0.9507±0.0011,0.9532±0.0009,0.9535±0.0022,0.9544±0.0015,0.9554±0.0017,0.9606±0.0018,0.9548±0.0030,0.9543±0.0019,0.9552±0.0033,0.9526±0.0118


# Selecting the best methods by mean score

In [23]:
# Mean score of each baseline ensemble next to the mean (over datasets) of
# each of its pruned variants.
index = ['Baseline', 'BB-Imb', 'MDM-Imb', 'Complementarity', 'Kappa', 'RE-GM']
n_generators = len(generator)
n_pruned = len(index) - 1  # every label except 'Baseline'

# Two-level row labels (generator, method). The flat index used before repeated
# the same six labels for every generator, so rows could not be told apart.
index_ = pd.MultiIndex.from_product([generator, index], names=['Ensemble', 'Method'])

# Per generator: one row of pruned means, with the baseline score inserted as
# column 0; flattening row by row matches the order of `index_`.
auc_pruned = nump_aux.mean(axis=0).reshape(n_generators, n_pruned)
gm_pruned = nump_aux_gm.mean(axis=0).reshape(n_generators, n_pruned)

comparison = pd.DataFrame(index=index_)
comparison['AUC'] = np.insert(auc_pruned, 0, base_ensembles_AUC, axis=1).ravel()
comparison['G-Mean'] = np.insert(gm_pruned, 0, base_ensembles_GM, axis=1).ravel()
comparison

Unnamed: 0,AUC,G-Mean
Baseline,0.754498,0.746089
BB-Imb,0.813445,0.800625
MDM-Imb,0.810707,0.796911
Complementarity,0.814525,0.802163
Kappa,0.814341,0.802128
RE-GM,0.814481,0.801586
Baseline,0.760649,0.756547
BB-Imb,0.784221,0.762613
MDM-Imb,0.785321,0.763822
Complementarity,0.784996,0.763279


In [25]:
# For each block of 5 consecutive columns (one generator), report which column
# has the highest mean over datasets. The extra +1 shifts the position by one,
# matching tables where IR has been inserted as column 0.
for i in np.arange(5):
    start = 5 * i
    stop = start + 5
    auc_means = nump_aux[:, start:stop].mean(axis=0)
    gm_means = nump_aux_gm[:, start:stop].mean(axis=0)
    best_auc = np.argmax(auc_means) + start + 1
    best_gm = np.argmax(gm_means) + start + 1
    print(i)
    print('Best AUC:', best_auc, '-', 'Best G-Mean:', best_gm)

0
Best AUC: 3 - Best G-Mean: 3
1
Best AUC: 10 - Best G-Mean: 10
2
Best AUC: 14 - Best G-Mean: 14
3
Best AUC: 19 - Best G-Mean: 19
4
Best AUC: 25 - Best G-Mean: 25


## Baseline ensembles

In [11]:
# Show the averaged baseline scores: AUC first, then G-Mean.
for baseline_scores in (base_ensembles_AUC, base_ensembles_GM):
    print(baseline_scores)

[0.75449804 0.76064879 0.74098859 0.72273448 0.77542117]
[0.74608945 0.75654653 0.73547134 0.70465117 0.77187658]


## AUC

In [26]:
# For each generator, keep the pruning column with the highest mean AUC over
# datasets. The columns are computed and selected by name instead of using
# positions copied by hand from an earlier cell's printed output, so the table
# stays correct when the results change.
n_prun = len(pruning)
best_auc_cols = [
    nameCol[n_prun * g + int(np.argmax(nump_aux[:, n_prun * g:n_prun * (g + 1)].mean(axis=0)))]
    for g in range(len(generator))
]
matrix_AUC.loc[:, ['IR'] + best_auc_cols]

Unnamed: 0,IR,SMOTE-Bagging_Complementarity,Under-Bagging_RE-GM,SMOTE-Boost_Kappa,RUS-Boost_Kappa,EasyEnsemble_RE-GM
haberman.dat,2.78,0.5795±0.0160,0.5813±0.0058,0.6035±0.0199,0.5837±0.0108,0.5904±0.0063
glass-0-1-2-3_vs_4-5-6.dat,3.2,0.9236±0.0038,0.9005±0.0051,0.9255±0.0063,0.8801±0.0069,0.9082±0.0080
ecoli2.dat,5.46,0.8789±0.0096,0.8068±0.0081,0.8671±0.0093,0.7687±0.0120,0.8428±0.0105
vehicle1.dat,2.9,0.7138±0.0045,0.7178±0.0019,0.6976±0.0019,0.6851±0.0045,0.7188±0.0062
page-blocks0.dat,8.79,0.9029±0.0013,0.8305±0.0040,0.8381±0.0011,0.8162±0.0023,0.8142±0.0019
glass0.dat,2.06,0.8302±0.0213,0.8087±0.0090,0.7760±0.0138,0.7747±0.0112,0.7975±0.0089
glass1.dat,1.82,0.7903±0.0053,0.7584±0.0092,0.7430±0.0200,0.7247±0.0158,0.7485±0.0117
ecoli3.dat,8.6,0.7538±0.0176,0.7088±0.0037,0.7321±0.0208,0.6718±0.0072,0.6973±0.0097
wisconsin.dat,1.86,0.9577±0.0039,0.9595±0.0026,0.9551±0.0031,0.9548±0.0015,0.9551±0.0017


## G Mean

In [27]:
# For each generator, keep the pruning column with the highest mean G-Mean over
# datasets. The columns are computed and selected by name instead of using
# positions copied by hand from an earlier cell's printed output.
# NOTE(review): the hard-coded positions matched both the best-AUC and the
# best-G-Mean columns; best-G-Mean is assumed to be the intent here - confirm.
n_prun = len(pruning)
best_gm_cols = [
    nameCol[n_prun * g + int(np.argmax(nump_aux_gm[:, n_prun * g:n_prun * (g + 1)].mean(axis=0)))]
    for g in range(len(generator))
]
matrix_GM.loc[:, ['IR'] + best_gm_cols]

Unnamed: 0,IR,SMOTE-Bagging_Complementarity,Under-Bagging_RE-GM,SMOTE-Boost_Kappa,RUS-Boost_Kappa,EasyEnsemble_RE-GM
haberman.dat,2.78,0.5412±0.0202,0.5351±0.0073,0.5666±0.0252,0.5391±0.0127,0.5467±0.0078
glass-0-1-2-3_vs_4-5-6.dat,3.2,0.9214±0.0040,0.8951±0.0055,0.9228±0.0065,0.8732±0.0077,0.9024±0.0085
ecoli2.dat,5.46,0.8726±0.0105,0.7858±0.0099,0.8593±0.0096,0.7379±0.0153,0.8294±0.0108
vehicle1.dat,2.9,0.6978±0.0049,0.6899±0.0021,0.6760±0.0020,0.6549±0.0053,0.6921±0.0068
page-blocks0.dat,8.79,0.8986±0.0015,0.8136±0.0050,0.8239±0.0013,0.7965±0.0029,0.7933±0.0023
glass0.dat,2.06,0.8250±0.0224,0.7989±0.0094,0.7677±0.0142,0.7629±0.0124,0.7890±0.0092
glass1.dat,1.82,0.7871±0.0056,0.7515±0.0104,0.7371±0.0206,0.7144±0.0170,0.7386±0.0123
ecoli3.dat,8.6,0.7182±0.0201,0.6484±0.0064,0.6894±0.0259,0.5951±0.0095,0.6318±0.0152
wisconsin.dat,1.86,0.9575±0.0039,0.9591±0.0026,0.9549±0.0031,0.9544±0.0015,0.9526±0.0118
