In [103]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.metrics import make_scorer, recall_score, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from scipy.stats import ttest_rel

sys.path.append(os.path.abspath('..'))
from processamento import df

In [104]:
# Definir as doenças respiratórias
respiratory_conditions = ['Asthma', 'Chronic obstructive pulmonary disease']

# Criar o novo atributo RespDisease: 1 se for doença respiratória, 0 caso contrário
df['RespDisease'] = df['Outcome'].apply(lambda x: 1 if x in respiratory_conditions else 0)

# Obter dataframe apenas com valores númericos
df_num = df.select_dtypes(include=[np.number])

# Contagem específica
asthma_count = df['Outcome'].value_counts().get('Asthma', 0)
copd_count = df['Outcome'].value_counts().get('Chronic obstructive pulmonary disease', 0)

print(f"Asthma: {asthma_count}")
print(f"Chronic obstructive pulmonary disease: {copd_count}")

# Confirmar com RespDisease
print(f"Doenças respiratórias: {(df['RespDisease'] == 1).sum()}")


Asthma: 10254
Chronic obstructive pulmonary disease: 8485
Doenças respiratórias: 18739


# **4.3.2 Modelos de previsão de doenças respiratórias**

In [None]:
# Define especificidade
def specificity_score(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

specificity_scorer = make_scorer(specificity_score)

# Define métricas
scoring = {
    'Accuracy': 'accuracy',
    'Sensitivity': 'recall',
    'Specificity': specificity_scorer,
    'F1': 'f1'
}

X = df_num.drop(columns=['RespDisease'])
y = df_num['RespDisease']

#### a. Modelo árvore de decisão

In [None]:
# Hiperparâmetros do árvore de decisão
dt_model_params = {
    'max_depth': [None, 5, 10, 20], 
    'min_samples_split': [2, 5, 10] 
}

# Otimizar hiperparâmetros do DecisionTreeClassifier
grid = GridSearchCV(
    DecisionTreeClassifier(),
    dt_model_params,
    cv=5,                                # 5-fold cross-validation
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

# Ajustar (treinar) o modelo com os dados
grid.fit(X, y)

# Obter os melhores parâmetros e a melhor pontuação
dt_best_params = grid.best_params_
dt_best_score = -grid.best_score_.round(2)
print(f"Best Decision Tree Parameters: {dt_best_params}")
print(f"Best Score: {dt_best_score}")

# Treinar o modelo com os melhores parâmetros
dt_model = DecisionTreeClassifier(**dt_best_params)

# Avaliar desempenho do modelo 
scores = cross_validate(dt_model, X, y, cv=5, scoring=scoring, n_jobs=-1)

# Calcular as métricas de avaliação
dt_metrics = {}
for metric in scoring.keys():
    dt_metrics[metric] = (
        np.mean(scores[f'test_{metric}']), 
        np.std(scores[f'test_{metric}'])
    )

# Exibir as métricas de avaliação do modelo
print("Decision Tree Evaluation:")
for metric, (mean, std) in dt_metrics.items():
    print(f"{metric}: Mean = {mean:.3f}, Std = {std:.3f}")

# Guarda o modelo 
decision_tree = dt_model

Best Decision Tree Parameters: {'max_depth': 10, 'min_samples_split': 10}
Best Score: 0.22
Decision Tree Evaluation:
Accuracy: Mean = 0.782, Std = 0.024
Sensitivity: Mean = 0.511, Std = 0.049
Specificity: Mean = 0.949, Std = 0.035
F1: Mean = 0.640, Std = 0.042


#### b. Modelo rede neuronal

In [None]:
nn_model = make_pipeline(StandardScaler(), MLPClassifier(max_iter=4000))

# Hiperparâmetros do modelo de rede neural
nn_model_params = {
    'mlpclassifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'mlpclassifier__learning_rate_init': [0.001, 0.01]
}

# Otimizar hiperparâmetros do MLPClassifier
grid = GridSearchCV(nn_model, nn_model_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Ajustar (treinar) o modelo com os dados
grid.fit(X, y)

# Obter os melhores parâmetros e a melhor pontuação
nn_best_params = grid.best_params_
nn_best_score = -grid.best_score_.round(2)
print(f"Best Neural Network Parameters: {nn_best_params}")
print(f"Best Score: {dt_best_score}")

# Treinar o modelo com os melhores parâmetros
nn_model.set_params(**nn_best_params)

# Avaliar desempenho do modelo 
scores = cross_validate(nn_model, X, y, cv=5, scoring=scoring, n_jobs=-1)

# Calcular as métricas de avaliação
nn_metrics = {}
for metric in scoring.keys():
    nn_metrics[metric] = (np.mean(scores[f'test_{metric}']), np.std(scores[f'test_{metric}']))

# Exibir as métricas de avaliação do modelo
print("Neural Network Evaluation:")
for metric, (mean, std) in nn_metrics.items():
    print(f"{metric}: Mean = {mean:.3f}, Std = {std:.3f}")

neural_network = nn_model

Best Neural Network Parameters: {'mlpclassifier__hidden_layer_sizes': (50, 50), 'mlpclassifier__learning_rate_init': 0.01}
Best Score: 0.22
Neural Network Evaluation:
Accuracy: Mean = 0.757, Std = 0.018
Sensitivity: Mean = 0.456, Std = 0.098
Specificity: Mean = 0.942, Std = 0.038
F1: Mean = 0.581, Std = 0.068


#### c. Modelo SVM

In [None]:
svm_model = make_pipeline(StandardScaler(), SVC())

# Hiperparâmetros do modelo SVM
svm_model_params = {
    'svc__kernel': ['linear', 'rbf'],
}

# Otimizar hiperparâmetros do SVM
grid = GridSearchCV(svm_model, svm_model_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Ajustar (treinar) o modelo com os dados
grid.fit(X, y)

# Obter os melhores parâmetros e a melhor pontuação
svm_best_params = grid.best_params_
svm_best_score = -grid.best_score_.round(2)
print(f"Best SVM Parameters: {svm_best_params}")
print(f"Best Score: {dt_best_score}")

# Treinar o modelo com os melhores parâmetros
svm_model.set_params(**svm_best_params)

# Avaliar desempenho do modelo
scores = cross_validate(svm_model, X, y, cv=5, scoring=scoring, n_jobs=-1)

# Calcular as métricas de avaliação
svm_metrics = {}
for metric in scoring.keys():
    svm_metrics[metric] = (np.mean(scores[f'test_{metric}']), np.std(scores[f'test_{metric}']))

# Exibir as métricas de avaliação do modelo
print("SVM Evaluation:")
for metric, (mean, std) in svm_metrics.items():
    print(f"{metric}: Mean = {mean:.3f}, Std = {std:.3f}")

svm = svm_model

Best SVM Parameters: {'svc__kernel': 'rbf'}
Best Score: 0.22
SVM Evaluation:
Accuracy: Mean = 0.691, Std = 0.015
Sensitivity: Mean = 0.189, Std = 0.040
Specificity: Mean = 1.000, Std = 0.000
F1: Mean = 0.316, Std = 0.059


#### d. Modelo K-vizinhos-mais-próximos

In [None]:
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Hiperparâmetros do modelo KNN
knn_model_params = {
    'kneighborsclassifier__n_neighbors': [3, 5, 10],
}

# Otimizar hiperparâmetros do KNeighborsClassifier
grid = GridSearchCV(knn_model, knn_model_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Ajustar (treinar) o modelo com os dados
grid.fit(X, y)

# Obter os melhores parâmetros e a melhor pontuação
knn_best_params = grid.best_params_
knn_best_score = -grid.best_score_.round(2)
print(f"Best KNN Parameters: {knn_best_params}")
print(f"Best Score: {dt_best_score}")

# Treinar o modelo com os melhores parâmetros
knn_model.set_params(**knn_best_params)

# Avaliar desempenho do modelo
scores = cross_validate(knn_model, X, y, cv=5, scoring=scoring, n_jobs=-1)

# Calcular as métricas de avaliação
knn_metrics = {}
for metric in scoring.keys():
    knn_metrics[metric] = (np.mean(scores[f'test_{metric}']), np.std(scores[f'test_{metric}']))

# Exibir as métricas de avaliação do modelo
print("KNN Evaluation:")
for metric, (mean, std) in knn_metrics.items():
    print(f"{metric}: Mean = {mean:.3f}, Std = {std:.3f}")

knn = knn_model

Best KNN Parameters: {'kneighborsclassifier__n_neighbors': 10}
Best Score: 0.22
KNN Evaluation:
Accuracy: Mean = 0.728, Std = 0.011
Sensitivity: Mean = 0.429, Std = 0.047
Specificity: Mean = 0.912, Std = 0.037
F1: Mean = 0.544, Std = 0.028


In [None]:
# Definir métricas de avaliação a usar na cross_validate
scoring = {
    'Accuracy': 'accuracy',
    'Sensitivity': 'recall',
    'Specificity': specificity_scorer,
    'F1': 'f1'
}

# Modelos a comparar
model_a = decision_tree
model_b = neural_network

# Prepara X e y
X = df_num.drop(columns=['RespDisease'])
y = df_num['RespDisease']

# Avaliar desempenho dos modelos
scores_a = cross_validate(model_a, X, y, cv=5, scoring=scoring, n_jobs=-1)
scores_b = cross_validate(model_b, X, y, cv=5, scoring=scoring, n_jobs=-1)

# Comparar os modelos usando o teste t Student para cada métrica
results = []
for metric in scoring.keys():
    a_scores = scores_a[f'test_{metric}']
    b_scores = scores_b[f'test_{metric}']
    t_stat, p_val = ttest_rel(a_scores, b_scores)
    significant = p_val < 0.05
    results.append({
        'Metric': metric,
        'Model A Mean': np.mean(a_scores),
        'Model B Mean': np.mean(b_scores),
        't-statistic': t_stat,
        'p-value': p_val,
        'Significant (α=0.05)': significant
    })

# Mostrar resultados 
pd.DataFrame(results)

Unnamed: 0,Metric,Model A Mean,Model B Mean,t-statistic,p-value,Significant (α=0.05)
0,Accuracy,0.781644,0.766585,1.956553,0.122036,False
1,Sensitivity,0.510966,0.552645,-2.971801,0.041071,True
2,Specificity,0.948488,0.898458,2.611616,0.059319,False
3,F1,0.640129,0.643073,-0.445202,0.679205,False
