In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from matplotlib.pylab import rcParams
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import cohen_kappa_score
from itertools import combinations
from sklearn.metrics import roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import f1_score

import kdn
import pruning
import diversity

  from numpy.core.umath_tests import inner1d


### Pre processing

In [2]:
data = pd.read_csv('../cm1.csv')

In [3]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
len(X)

498

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Analysis

In [5]:
cenario = ['Conjunto Original', 'Instâncias Difíceis (kDN > 0.4)', 'Instâncias Fáceis (kDN <= 0.4)']

skf = StratifiedKFold(n_splits=10, shuffle=True)

metrics_bag = np.zeros(4)
metrics_best = np.zeros((3, 4))
metrics_red = np.zeros((3, 4))

divers_best = np.zeros((3,2))
divers_red = np.zeros((3,2))

for train_index, test_index in skf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
            
    sm = SMOTE()
    X_train, y_train = sm.fit_sample(X_train, y_train)

    #---------------------------------------BAGGING ORIGINAL---------------------------------------#
    bg = BaggingClassifier(Perceptron(max_iter = 150, tol = 0.001),
                         n_estimators = 100)
    bg.fit(X_train, y_train)
    AUX = bg.estimators_[:]
    metrics_bag += np.array([bg.score(X_test, y_test),
                            roc_auc_score(y_test, bg.predict_proba(X_test)[:,1]),
                            geometric_mean_score(y_test, bg.predict(X_test)),
                            f1_score(y_test, bg.predict(X_test))])
    #---------------------------------------------------------------------------------------------#
    
    easy = []
    hard = []
    vis = kdn.kDN(X_train, y_train)
    for i, k in enumerate(vis):
        if k < 0.4:
            easy.append(i)
        else: 
            hard.append(i)
            
    #--------------------------------------BAGGING PODADO-----------------------------------------#
    all_ = list(range(len(y_train)))
    valid = [all_, hard, easy]
    for i, val in enumerate(valid):
        print(i)
        bg.estimators_ = AUX[:]
        
        prune1 = pruning.best_first(bg, X_train[val], y_train[val])
        prune2 = pruning.reduce_error(bg, X_train[val], y_train[val])
        
        #--------------------------------------BEST FIRST-----------------------------------------#
        divers_best[i, :] += np.array([diversity.disagreement(prune1, X_train, y_train),
                                      diversity.kappa(prune1, X_train, y_train)])
        bg.estimators_ = prune1       
        metrics_best[i, :] += np.array([bg.score(X_test, y_test),
                            roc_auc_score(y_test, bg.predict_proba(X_test)[:,1]),
                            geometric_mean_score(y_test, bg.predict(X_test)),
                            f1_score(y_test, bg.predict(X_test))])       
                
        #-------------------------------------REDUCE ERROR----------------------------------------#
        divers_red[i, :] += np.array([diversity.disagreement(prune2, X_train, y_train),
                                      diversity.kappa(prune2, X_train, y_train)])
        bg.estimators_ = prune2
        metrics_red[i,:] += np.array([bg.score(X_test, y_test),
                            roc_auc_score(y_test, bg.predict_proba(X_test)[:,1]),
                            geometric_mean_score(y_test, bg.predict(X_test)),
                            f1_score(y_test, bg.predict(X_test))])
    
    #---------------------------------------------------------------------------------------------#
    
metrics_bag = metrics_bag / skf.n_splits

metrics_best = metrics_best / skf.n_splits
metrics_red = metrics_red / skf.n_splits

divers_best = divers_best / skf.n_splits
divers_red = divers_red / skf.n_splits

print('Bagging:')
print('Metricas:', metrics_bag, '\n \n \n')
print('Best first:')
print('Metricas:', metrics_best)
print('Diversidade:', divers_best, '\n \n \n')
print('Reduce Error:')
print('Metricas:', metrics_red)
print('Diversidade:', divers_red, '\n \n \n')

0
*best 2
*red 6
1
*best 2
*red 2


  log_a = np.log(a)


2
*best 4
*red 3
0
*best 6
*red 4
1
*best 2
*red 2
2
*best 2
*red 5
0
*best 9
*red 11
1
*best 2
*red 2


  log_a = np.log(a)


2
*best 5
*red 3
0
*best 8
*red 4
1
*best 2
*red 2


  log_a = np.log(a)


2
*best 3
*red 6
0
*best 8
*red 3
1
*best 2
*red 2
2
*best 10
*red 4
0
*best 5
*red 4
1
*best 2
*red 2
2
*best 3
*red 4
0
*best 3
*red 4
1
*best 2
*red 2
2
*best 5
*red 3
0
*best 6
*red 7
1
*best 4
*red 2
2
*best 6
*red 3
0
*best 3
*red 4
1
*best 2
*red 2


  log_a = np.log(a)
  'precision', 'predicted', average, warn_for)


2
*best 2
*red 6
0
*best 4
*red 3
1
*best 2
*red 2


  log_a = np.log(a)


2
*best 2
*red 3
Bagging:
Metricas: [0.79541667 0.79940152 0.72043489 0.38333542] 
 
 

Best first:
Metricas: [[0.78925    0.76047601 0.68863618 0.35843412]
 [0.87141667 0.69313889 0.4098133  0.26049062]
 [0.74316667 0.7539697  0.71953816 0.35886758]]
Diversidade: [[0.09452802 0.81033373]
 [0.21650257 0.46760666]
 [0.09619347 0.80065363]] 
 
 

Reduce Error:
Metricas: [[0.80708333 0.79609343 0.73696314 0.4069846 ]
 [0.87525    0.61750379 0.28439517 0.1862987 ]
 [0.74308333 0.77520455 0.71400593 0.34914921]]
Diversidade: [[ 0.2385997   0.52832017]
 [ 0.54378224 -0.08429818]
 [ 0.17928457  0.63735807]] 
 
 

