# Pipeline - Doenças Cardiacas

In [15]:
# Importando bibliotecas
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [16]:
# Carregando os dados
df = pd.read_csv('../../datasets/heart_disease_uci.csv')

In [17]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [18]:
# Separar features e target
X = df.drop('num', axis=1)
y = df['num']

In [19]:
# Identificar os tipos de variáveis
numeric_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs','restecg', 'exang', 'slope', 'ca', 'thal']

In [20]:
# Pipeline de pré-processamento
numeric_pipeline = Pipeline([
  ('imputer', SimpleImputer(strategy='median')),
  ('scaler', StandardScaler())
])

In [21]:
categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [22]:
preprocessor = ColumnTransformer([
  ('num', numeric_pipeline, numeric_cols),
  ('cat', categorical_pipeline, categorical_cols)
])

In [23]:
# Pipeline Completo
pipeline = Pipeline([
  ('preprocessor', preprocessor),
  ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [24]:
# Separar em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [25]:
# Treinar o pipeline
pipeline.fit(X_train, y_train)

In [26]:
# Avaliação com acurácia
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Acurácia no conjunto de teste: {acc:.4f}")

Acurácia no conjunto de teste: 0.5598


In [27]:
# Validação cruzada
cv_score = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Acurárcia Média: {cv_score.mean():.4f}')

Acurárcia Média: 0.4859


In [28]:
# Salvar o modelo treinado
joblib.dump(pipeline,'modelo_pipeline_heart.pkl')
print("Modelo salvo com sucesso.")

Modelo salvo com sucesso.
