In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Train and evaluate a logistic-regression classifier that predicts the
# presence of heart disease from the columns of heart_disease.csv.

# 1) Data preparation
# The raw target 'num' is 0 for "no disease" and 1–4 for "disease"; it is
# collapsed below into a binary label (0 = no disease, 1 = disease).
df = pd.read_csv('./datasets/heart_disease.csv')  # Make sure this path points to your data

# Only rows with a missing *target* are discarded, because a label cannot be
# imputed. Rows with missing *feature* values are kept and filled in by the
# imputer step of the pipeline, so a single empty cell no longer costs a
# whole sample.
df = df.dropna(subset=['num'])
df['num'] = (df['num'] > 0).astype(int)

X = df.drop(columns='num')
y = df['num']

# Stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2) Define the pipeline (imputation + scaling + logistic regression)
# The imputer runs first so the classifier never receives NaN
# (LogisticRegression rejects NaN input with a ValueError). As a pipeline
# step, its medians are learned from the training split only and are applied
# again automatically whenever the pipeline predicts on new data.
# NOTE(review): assumes every feature column is numeric — confirm against
# the CSV; non-numeric columns would need encoding first.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(max_iter=1000, random_state=42))
])

# If the classes are imbalanced, you can enable this:
# pipe.set_params(log_reg__class_weight='balanced')

# 3) Train
pipe.fit(X_train, y_train)

# 4) Predict and evaluate
y_pred = pipe.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (1).
y_pred_proba = pipe.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nROC AUC:", roc_auc_score(y_test, y_pred_proba))

# ROC curve (optional; rendered inline when running in a notebook)
RocCurveDisplay.from_estimator(pipe, X_test, y_test)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values