In [24]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

from utils import do_cv_knn

In [3]:
df = pd.read_csv('heart.csv')
df

Unnamed: 0,AGE_50,MD_50,SBP_50,DBP_50,HT_50,WT_50,CHOL_50,SES,CL_STATUS,MD_62,SBP_62,DBP_62,CHOL_62,WT_62,IHD_DX,DEATH
0,42,1,110,65,64,147,291,2,8,4,120,78,271,146,2,1
1,53,1,130,72,69,167,278,1,6,2,122,68,250,165,9,1
2,53,2,120,90,70,222,342,4,8,1,132,90,304,223,2,1
3,48,4,120,80,72,229,239,4,8,2,118,68,209,227,3,1
4,53,3,118,74,66,134,243,3,8,5,118,56,261,138,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,50,1,115,80,66,148,300,2,8,1,115,65,273,152,0,0
196,23,1,110,70,69,137,120,3,8,2,112,76,198,153,0,0
197,20,3,130,80,66,150,210,5,0,1,130,85,274,158,0,0
198,46,3,140,84,66,138,130,4,6,2,148,88,160,157,0,0


In [4]:
df.isnull().sum().sum()

0

In [5]:
y = df['DEATH'].values.ravel()
X = df.drop('DEATH', axis=1)

In [10]:
accs_knn = do_cv_knn(X.values, y, 10, range(1, 20, 2))

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]

In [14]:
def calcular_estatisticas(resultados):
    return np.mean(resultados), np.std(resultados), np.min(resultados), np.max(resultados)

def imprimir_estatisticas(resultados):
    media, desvio, mini, maxi = calcular_estatisticas(resultados)
    print("Resultados: %.2f +- %.2f, min: %.2f, max: %.2f" % (media, desvio, mini, maxi))

In [15]:
imprimir_estatisticas(accs_knn)

Resultados: 0.69 +- 0.06, min: 0.60, max: 0.80


In [21]:
from sklearn.svm import SVC
import itertools

In [26]:
def selecionar_melhor_svm(Cs, gammas, X_treino, X_val, y_treino, y_val):
    
    def treinar_svm(C, gamma, X_treino, X_val, y_treino, y_val):
        svm = SVC(C=C, gamma=gamma)
        svm.fit(X_treino, y_treino)
        pred = svm.predict(X_val)
        return accuracy_score(y_val, pred)
        
    combinacoes_parametros = list(itertools.product(Cs, gammas))
        
    acuracias_val = Parallel(n_jobs=4)(delayed(treinar_svm)
                                       (c, g, X_treino, X_val, y_treino, y_val) for c, g in combinacoes_parametros)       
        
    melhor_val = max(acuracias_val)
    melhor_comb = combinacoes_parametros[np.argmax(acuracias_val)]   
    melhor_c = melhor_comb[0]
    melhor_gamma = melhor_comb[1]
    svm = SVC(C=melhor_c, gamma=melhor_gamma)
    svm.fit(np.vstack((X_treino, X_val)), [*y_treino, *y_val])

    return svm, melhor_comb, melhor_val
    
def do_cv_svm(X, y, cv_splits, Cs=[1], gammas=['scale']):

    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1)

    acuracias = []
    
    pgb = tqdm(total=cv_splits, desc='Folds avaliados')
    
    for treino_idx, teste_idx in skf.split(X, y):

        X_treino = X[treino_idx]
        y_treino = y[treino_idx]

        X_teste = X[teste_idx]
        y_teste = y[teste_idx]

        X_treino, X_val, y_treino, y_val = train_test_split(X_treino, y_treino, stratify=y_treino, test_size=0.2, random_state=1)

        ss = StandardScaler()
        ss.fit(X_treino)
        X_treino = ss.transform(X_treino)
        X_teste = ss.transform(X_teste)
        X_val = ss.transform(X_val)

        svm, _, _ = selecionar_melhor_svm(Cs, gammas, X_treino, X_val, y_treino, y_val)
        pred = svm.predict(X_teste)

        acuracias.append(accuracy_score(y_teste, pred))
        
        #print(classification_report(y_teste, pred, output_dict=True))
        
        pgb.update(1)
        
    pgb.close()
    
    return acuracias

In [33]:
accs_svm = do_cv_svm(X.values, y, 10, Cs=[1, 10, 100, 1000], gammas=['scale', 'auto', 2e-2, 2e-3, 2e-4])

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]

In [34]:
imprimir_estatisticas(accs_knn)

Resultados: 0.69 +- 0.06, min: 0.60, max: 0.80


In [35]:
imprimir_estatisticas(accs_svm)

Resultados: 0.71 +- 0.05, min: 0.65, max: 0.80


In [36]:
from scipy.stats import ttest_ind_from_stats

In [37]:
media_knn, std_knn, _, _ = calcular_estatisticas(accs_knn)
media_svm, std_svm, _, _ = calcular_estatisticas(accs_svm)
ttest_ind_from_stats(media_knn, std_knn, len(accs_knn), media_svm, std_svm, len(accs_svm))

Ttest_indResult(statistic=-0.4099600308453987, pvalue=0.6866732489128373)

In [43]:
ttest_ind_from_stats(12, 3, 20, 10,4, 20)

Ttest_indResult(statistic=1.7888543819998317, pvalue=0.08161241660950144)

In [19]:
import itertools
Cs = [1, 10, 100, 1000]
gammas = ['auto', 'scale', 2e-1, 2e-2, 2e-3]

for c, k in itertools.product(Cs, gammas):
    print (c, k)


1 auto
1 scale
1 0.2
1 0.02
1 0.002
10 auto
10 scale
10 0.2
10 0.02
10 0.002
100 auto
100 scale
100 0.2
100 0.02
100 0.002
1000 auto
1000 scale
1000 0.2
1000 0.02
1000 0.002
