In [None]:
import os
from pathlib import Path

import pandas as pd

# Default location of the heart-disease CSV (the original author's local copy).
DEFAULT_DATA_PATH = r"C:\Users\johan\OneDrive - Universidad del Norte\Escritorio\MachineLearning\heart-disease-mlops\heart.csv"

# Allow other machines to point at their own copy without editing the notebook:
# a non-empty HEART_CSV_PATH environment variable takes precedence over the default.
DATA_PATH = Path(os.environ.get("HEART_CSV_PATH") or DEFAULT_DATA_PATH)

df = pd.read_csv(DATA_PATH)
print("Dimensiones del dataset:", df.shape)
df.head()


Dimensiones del dataset: (918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Print a concise summary of the frame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [None]:
# Number of missing values in each column.
null_counts = df.isna().sum()
print("\nValores nulos por columna:\n")
print(null_counts)

# List the distinct values of every object-dtype column so that unexpected
# or inconsistent category labels are easy to spot.
cat_cols = df.select_dtypes(include="object").columns
for col, values in df[cat_cols].items():
    print(f"\n{col}: {values.unique()}")


Valores nulos por columna:

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

Sex: ['M' 'F']

ChestPainType: ['ATA' 'NAP' 'ASY' 'TA']

RestingECG: ['Normal' 'ST' 'LVH']

ExerciseAngina: ['N' 'Y']

ST_Slope: ['Up' 'Flat' 'Down']


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# --- Feature matrix and target ----------------------------------------------
df_copy = df.copy()

# One-hot encode the object-dtype columns; drop_first removes one dummy per
# category so the encoded columns are not redundant.
df_encoded = pd.get_dummies(df_copy, drop_first=True)

X = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]

np.random.seed(0)

# --- Scenario 1: WITH data leakage ------------------------------------------
# A synthetic feature equal to the target plus tiny noise (target leakage).
# It is added to a separate frame: writing it into X itself would also hand
# the target to the "clean" pipeline below and make the comparison meaningless.
X_leaky = X.assign(leaky_feature=y + np.random.normal(0, 0.01, size=len(y)))

# Second leak: the scaler is fitted on every row before the split, so the
# test rows influence the min/max used to transform the training rows.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_leaky)
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

grid_leak = GridSearchCV(SVC(probability=True), param_grid={"C": [0.1, 1, 10]}, cv=5, scoring="roc_auc")
grid_leak.fit(X_train_l, y_train_l)
auc_leak = roc_auc_score(y_test_l, grid_leak.predict_proba(X_test_l)[:, 1])

# --- Scenario 2: WITHOUT data leakage ---------------------------------------
# Only the legitimate features are used, and the split happens first.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling lives inside the pipeline, so within each CV fold (and for the final
# refit) the scaler is fitted on training rows only.
pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("svc", SVC(probability=True))
])

param_grid = {"svc__C": [0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring="roc_auc")
grid.fit(X_train, y_train)
auc_clean = roc_auc_score(y_test, grid.predict_proba(X_test)[:, 1])

print(f"AUC con fuga de datos: {auc_leak:.3f}")
print(f"AUC sin fuga de datos: {auc_clean:.3f}")

AUC con fuga de datos: 1.000
AUC sin fuga de datos: 1.000


### Conclusiones – Etapa 1: Preprocesamiento y detección de Data Leakage

- El dataset cuenta con **918 registros y 12 variables**, sin valores nulos.  
- Las variables categóricas presentan categorías consistentes y sin valores faltantes.  
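- Se evaluó el impacto potencial de la **fuga de datos (data leakage)**.  
  En este caso el AUC fue 1.0 en ambos escenarios porque la variable sintética `leaky_feature`, derivada directamente de la variable objetivo, estaba incluida en las características de los dos, por lo que la comparación no permite aislar el efecto del escalado. Aun así, el ejercicio ilustra la **importancia de realizar el escalado y demás transformaciones solo después de dividir los datos**.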
- Dejamos preparado un dataset limpio y codificado con variables dummies para proceder al modelado con validación cruzada.