In [1]:
import numpy as np
import scipy.stats
import pandas as pd
from scipy.stats import friedmanchisquare
from scikit_posthocs import posthoc_nemenyi_friedman



In [9]:
def confidence_interval(data, confidence=0.95, decp = 3):
    # xbar - média amostral
    # decp - aprocimação decimal
    if data.ndim == 1:
        n = len(data)
        xbar = round(data.mean(),decp)
        
        # Desvio padrão amostral
        se = data.std() / n**(0.5)
        h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
        h = round(h,decp)
        return  [xbar,(xbar-h, xbar+h)]
    else:
        ic = {}
        n = data.shape[1]   
        names = list(data.columns)
        for j in range(n):
            x = data[names[j]]
            xbar = x.mean()
            se = x.std() / n**(0.5)
            h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
            
            ic[names[j]] = [round(xbar,3),( round(xbar-h,decp) , round(xbar+h,decp) )]
        return ic

In [10]:
er = pd.read_csv('resultados/error_classification.csv')
prc = pd.read_csv('resultados/precision.csv')
rcl = pd.read_csv('resultados/recall.csv')
fms = pd.read_csv('resultados/fmeasure.csv')

## Estimação pontual e intervalar

In [11]:
print('Confidence Interval - Error rate')
confidence_interval(er)

Confidence Interval - Error rate


{'knn': [0.428, (0.391, 0.465)],
 'parzen': [0.415, (0.378, 0.452)],
 'bayesian': [0.438, (0.405, 0.472)],
 'logistic': [0.458, (0.432, 0.484)],
 'ensemble': [0.418, (0.382, 0.453)]}

In [12]:
print('Confidence Interval - Recall')
confidence_interval(rcl)


Confidence Interval - Recall


{'knn': [0.559, (0.507, 0.61)],
 'parzen': [0.556, (0.508, 0.604)],
 'bayesian': [0.544, (0.476, 0.611)],
 'logistic': [0.311, (0.277, 0.346)],
 'ensemble': [0.555, (0.506, 0.603)]}

In [13]:
print('Confidence Interval - Fmeasure')
confidence_interval(fms)


Confidence Interval - Fmeasure


{'knn': [0.554, (0.496, 0.612)],
 'parzen': [0.566, (0.521, 0.611)],
 'bayesian': [0.512, (0.447, 0.576)],
 'logistic': [0.328, (0.29, 0.365)],
 'ensemble': [0.56, (0.511, 0.609)]}

## Testes de hipóteses (Friedman e posthot)

In [14]:
# Error rate
knn_er = er['knn'].to_numpy()
parzen_er = er['parzen'].to_numpy()
bayesian_er = er['bayesian'].to_numpy()
logistic_er = er['logistic'].to_numpy()
ensemble_er = er['ensemble'].to_numpy()

print('Friedman test - error rate')
friedmanchisquare(knn_er, parzen_er, bayesian_er,logistic_er,ensemble_er)

Friedman test - error rate


FriedmanchisquareResult(statistic=111.05188199389623, pvalue=4.3414821862931007e-23)

In [15]:
posthoc_nemenyi_friedman(er)


Unnamed: 0,knn,parzen,bayesian,logistic,ensemble
knn,1.0,0.001577,0.579796,0.001,0.060732
parzen,0.001577,1.0,0.001,0.001,0.777213
bayesian,0.579796,0.001,1.0,0.001,0.001
logistic,0.001,0.001,0.001,1.0,0.001
ensemble,0.060732,0.777213,0.001,0.001,1.0


In [16]:
# precision
knn_prc = prc['knn'].to_numpy()
parzen_prc = prc['parzen'].to_numpy()
bayesian_prc = prc['bayesian'].to_numpy()
logistic_prc = prc['logistic'].to_numpy()
ensemble_prc = prc['ensemble'].to_numpy()
 
print('Friedman test - precision')
friedmanchisquare(knn_prc, parzen_prc, bayesian_prc,logistic_prc,ensemble_prc)


Friedman test - precision


FriedmanchisquareResult(statistic=169.808, pvalue=1.1499387136442612e-35)

In [17]:
posthoc_nemenyi_friedman(prc)


Unnamed: 0,knn,parzen,bayesian,logistic,ensemble
knn,1.0,0.001,0.013574,0.001,0.001
parzen,0.001,1.0,0.001,0.001,0.9
bayesian,0.013574,0.001,1.0,0.011024,0.001
logistic,0.001,0.001,0.011024,1.0,0.001
ensemble,0.001,0.9,0.001,0.001,1.0


In [18]:
# recall
knn_rcl = rcl['knn'].to_numpy()
parzen_rcl = rcl['parzen'].to_numpy()
bayesian_rcl = rcl['bayesian'].to_numpy()
logistic_rcl = rcl['logistic'].to_numpy()
ensemble_rcl = rcl['ensemble'].to_numpy()
 
print('Friedman test - recall ')
friedmanchisquare(knn_rcl, parzen_rcl, bayesian_rcl,logistic_rcl,ensemble_rcl)


Friedman test - recall 


FriedmanchisquareResult(statistic=102.68799999999999, pvalue=2.6330010266812288e-21)

In [20]:
posthoc_nemenyi_friedman(rcl)



Unnamed: 0,knn,parzen,bayesian,logistic,ensemble
knn,1.0,0.9,0.543903,0.001,0.687481
parzen,0.9,1.0,0.9,0.001,0.9
bayesian,0.543903,0.9,1.0,0.001,0.9
logistic,0.001,0.001,0.001,1.0,0.001
ensemble,0.687481,0.9,0.9,0.001,1.0


In [21]:
# fmeasure
knn_fms = fms['knn'].to_numpy()
parzen_fms = fms['parzen'].to_numpy()
bayesian_fms = fms['bayesian'].to_numpy()
logistic_fms = fms['logistic'].to_numpy()
ensemble_fms = fms['ensemble'].to_numpy()
 
print('Friedman test - F-measure')
friedmanchisquare(knn_fms, parzen_fms, bayesian_fms,logistic_fms,ensemble_fms)


Friedman test - F-measure


FriedmanchisquareResult(statistic=137.40800000000013, pvalue=1.0127205351373547e-28)

In [22]:
posthoc_nemenyi_friedman(fms)


Unnamed: 0,knn,parzen,bayesian,logistic,ensemble
knn,1.0,0.469991,0.001,0.001,0.9
parzen,0.469991,1.0,0.001,0.001,0.831053
bayesian,0.001,0.001,1.0,0.001,0.001
logistic,0.001,0.001,0.001,1.0,0.001
ensemble,0.9,0.831053,0.001,0.001,1.0


In [26]:
best_hyper = pd.read_csv('resultados/best_hyper.csv')
best_hyper

Unnamed: 0,k,h
0,3.0,0.0565
1,9.0,0.0565
2,11.0,0.0565
3,9.0,0.0565
4,13.0,0.0565
5,13.0,0.0565
6,3.0,0.0565
7,9.0,0.0565
8,5.0,0.0565
9,7.0,0.0565
