<a href="https://colab.research.google.com/github/lholiveiraa/mvp-4/blob/main/MVP_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook-wide setting: suppress every Python warning.
# NOTE(review): a blanket "ignore" filter also hides warnings raised by the
# libraries imported below — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

# Required imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, f1_score, classification_report

import joblib

Carga do Dataset

In [None]:

# Location of the raw CSV file to import
url = 'https://raw.githubusercontent.com/lholiveiraa/datasets/main/heart.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows (rendered as the cell output)
df.head(n=5)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


Separação em conjunto de treino e conjunto de teste com holdout

In [None]:
# Holdout configuration
test_size = 0.2  # fraction of the rows reserved for the test set
seed = 42  # random seed, so the split is reproducible

# Target is the 'HeartDisease' column; features are every other column
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])

# Stratified holdout: stratify=y keeps the class proportions of y in both
# the training and the test partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    stratify=y,
)

 Modelagem e Inferência

Criação e avaliação de modelos: linha base — definição dos pipelines de pré-processamento (imputação, padronização e codificação one-hot)

In [None]:

# Column groups, split by the kind of preprocessing each one receives
numeric_features = [
    'Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
]
categorical_features = [
    'Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope',
]

# Numeric branch: fill missing values with the column mean, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch. No `remainder` is given,
# so ColumnTransformer's default applies to columns in neither list.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

Criação e Avaliação dos modelos com dados transformados

In [None]:
# Candidate classifiers, each paired with the hyperparameter grid to search.
# Grid keys carry the 'clf__' prefix because the classifier is the pipeline
# step named 'clf'. An empty grid (NB) has no hyperparameters to tune.
models = {
    'KNN': (KNeighborsClassifier(), {'clf__n_neighbors': [3, 5, 7]}),
    # Reuse the notebook-wide `seed` rather than repeating the literal 42, so
    # changing the seed in one place affects the whole experiment.
    'Tree': (DecisionTreeClassifier(random_state=seed), {'clf__max_depth': [3, 5, 10]}),
    'NB': (GaussianNB(), {}),
    'SVM': (SVC(), {'clf__C': [0.1, 1, 10], 'clf__kernel': ['linear', 'rbf']})
}

best_models = {}  # model name -> best fitted pipeline found by the grid search
results = []      # one summary record per model, turned into a DataFrame below

# Train and evaluate every candidate with a 5-fold, F1-scored grid search
for name, (model, params) in models.items():
    # Preprocessing lives inside the pipeline, so it is re-fitted on the
    # training part of every CV fold and never sees the validation fold.
    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('clf', model)
    ])

    grid = GridSearchCV(pipeline, params, cv=5, scoring='f1')
    grid.fit(X_train, y_train)

    # Held-out test metrics: reported for information, not used for ranking
    y_pred = grid.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\n{name} Report:\n", classification_report(y_test, y_pred))

    results.append({
        'model': name,
        'accuracy': acc,
        'f1': f1,
        'best_params': grid.best_params_,
        # Mean cross-validated F1 of the best parameter setting, computed on
        # the training data only
        'cv_f1': grid.best_score_
    })

    best_models[name] = grid.best_estimator_

# Comparison table, best model first. Ranking uses the cross-validated F1
# ('cv_f1') instead of the test-set F1: picking the winner by its test score
# would make the reported test metrics optimistically biased.
result_df = pd.DataFrame(results).sort_values(by='cv_f1', ascending=False)
print("\nComparativo de Modelos:\n", result_df)



KNN Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90        82
           1       0.91      0.93      0.92       102

    accuracy                           0.91       184
   macro avg       0.91      0.91      0.91       184
weighted avg       0.91      0.91      0.91       184


Tree Report:
               precision    recall  f1-score   support

           0       0.77      0.85      0.81        82
           1       0.87      0.79      0.83       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.83      0.82      0.82       184


NB Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.87        82
           1       0.89      0.90      0.90       102

    accuracy                           0.89       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.89      0.8

Exportação do Modelo e Dados de Teste

In [None]:
# Final model: the first row of result_df, which the training cell has
# already sorted best-first
melhor_modelo_nome = result_df.iloc[0]['model']
melhor_modelo = best_models[melhor_modelo_nome]

# Persist the fitted pipeline (the 'preprocess' step plus the classifier)
joblib.dump(melhor_modelo, 'melhor_modelo.pkl')

# Take the already-fitted preprocessor out of the winning pipeline and use it
# to transform the test features
fitted_preprocessor = melhor_modelo.named_steps['preprocess']
X_test_transformed = fitted_preprocessor.transform(X_test)

# ColumnTransformer may return a scipy sparse matrix (the one-hot encoder
# produces sparse output). np.save would store that as a pickled 0-d object
# array, so convert to a dense ndarray before saving.
if hasattr(X_test_transformed, 'toarray'):
    X_test_transformed = X_test_transformed.toarray()
np.save('X_test.npy', X_test_transformed)

# NOTE(review): the saved pipeline still contains the 'preprocess' step, while
# X_test.npy holds features that are already transformed. A consumer
# presumably has to feed X_test.npy to the final 'clf' step only — confirm
# against whatever loads these files.
np.save('y_test.npy', y_test.to_numpy())
