In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Read the cleaned heart-disease survey data from the working directory.
INPUT_FILE = "heart_2020_cleaned.csv"
df = pd.read_csv(INPUT_FILE)

# Report (rows, columns) so a truncated or wrong file is noticed immediately.
print("✅ Loaded:", df.shape)

# Preview the first rows as the cell output.
df.head()


In [None]:
# Encode the target label as an integer: "Yes" -> 1, "No" -> 0.
heart_disease_map = {"Yes": 1, "No": 0}

# Guard against re-running this cell: mapping an already-encoded column
# finds no matching dict keys and would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["HeartDisease"]):
    # Series.map leaves NaN for labels missing from the dict; surface them
    # explicitly instead (genuinely missing values are left untouched).
    unexpected = set(df["HeartDisease"].dropna().unique()) - set(heart_disease_map)
    if unexpected:
        raise ValueError(
            f"Unexpected HeartDisease labels: {sorted(map(str, unexpected))}"
        )
    df["HeartDisease"] = df["HeartDisease"].map(heart_disease_map)

# Class balance of the encoded target.
df["HeartDisease"].value_counts()


In [None]:
# Columns holding "Yes"/"No" answers that are encoded as 1/0.
binary_cols = [
    "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
    "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"
]
yes_no_map = {"Yes": 1, "No": 0}

for col in binary_cols:
    # Skip columns that are already numeric so the cell can be re-run safely;
    # mapping an encoded column again would replace every value with NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    # Series.map leaves NaN for labels missing from the dict; fail loudly
    # on unknown labels (genuinely missing values are left untouched).
    unexpected = set(df[col].dropna().unique()) - set(yes_no_map)
    if unexpected:
        raise ValueError(
            f"Unexpected values in {col}: {sorted(map(str, unexpected))}"
        )
    df[col] = df[col].map(yes_no_map)

# Preview the encoded binary columns.
df[binary_cols].head()


In [None]:
# Encode sex as an integer: "Male" -> 1, "Female" -> 0.
sex_map = {"Male": 1, "Female": 0}

# Guard against re-running this cell: mapping an already-encoded column
# finds no matching dict keys and would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["Sex"]):
    # Series.map leaves NaN for labels missing from the dict; fail loudly
    # on unknown labels (genuinely missing values are left untouched).
    unexpected = set(df["Sex"].dropna().unique()) - set(sex_map)
    if unexpected:
        raise ValueError(f"Unexpected Sex values: {sorted(map(str, unexpected))}")
    df["Sex"] = df["Sex"].map(sex_map)

# Distribution of the encoded column.
df["Sex"].value_counts()


In [None]:
# Ordinal encoding of self-reported general health, from worst (0) to best (4).
genhealth_map = {
    "Poor": 0,
    "Fair": 1,
    "Good": 2,
    "Very good": 3,
    "Excellent": 4
}

# Guard against re-running this cell: mapping an already-encoded column
# finds no matching dict keys and would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["GenHealth"]):
    # Series.map leaves NaN for labels missing from the dict; fail loudly
    # on unknown labels (genuinely missing values are left untouched).
    unexpected = set(df["GenHealth"].dropna().unique()) - set(genhealth_map)
    if unexpected:
        raise ValueError(
            f"Unexpected GenHealth values: {sorted(map(str, unexpected))}"
        )
    df["GenHealth"] = df["GenHealth"].map(genhealth_map)

# Counts per encoded level, shown in ordinal order.
df["GenHealth"].value_counts().sort_index()


In [None]:
# Age brackets listed from youngest to oldest; each bracket's position in
# this list becomes its ordinal code.
age_order = [
    "18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
    "50-54", "55-59", "60-64", "65-69", "70-74", "75-79",
    "80 or older"
]
age_map = {v: i for i, v in enumerate(age_order)}

# Guard against re-running this cell: mapping an already-encoded column
# finds no matching dict keys and would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["AgeCategory"]):
    # Series.map leaves NaN for labels missing from the dict; fail loudly
    # on unknown brackets (genuinely missing values are left untouched).
    unexpected = set(df["AgeCategory"].dropna().unique()) - set(age_map)
    if unexpected:
        raise ValueError(
            f"Unexpected AgeCategory values: {sorted(map(str, unexpected))}"
        )
    df["AgeCategory"] = df["AgeCategory"].map(age_map)

# Counts per encoded bracket, shown in ordinal order.
df["AgeCategory"].value_counts().sort_index()


In [None]:
# Integer codes for the four diabetes answers found in the Diabetic column.
# NOTE(review): the codes impose an order ("during pregnancy" sits between
# "borderline" and "Yes") — confirm this ordinal reading is intended.
diabetic_map = {
    "No": 0,
    "No, borderline diabetes": 1,
    "Yes (during pregnancy)": 2,
    "Yes": 3
}

# Guard against re-running this cell: mapping an already-encoded column
# finds no matching dict keys and would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["Diabetic"]):
    # Series.map leaves NaN for labels missing from the dict; fail loudly
    # on unknown labels (genuinely missing values are left untouched).
    unexpected = set(df["Diabetic"].dropna().unique()) - set(diabetic_map)
    if unexpected:
        raise ValueError(
            f"Unexpected Diabetic values: {sorted(map(str, unexpected))}"
        )
    df["Diabetic"] = df["Diabetic"].map(diabetic_map)

# Counts per encoded answer, shown in code order.
df["Diabetic"].value_counts().sort_index()


In [None]:
# Feature groups that need a transformation; every other column is passed
# through unchanged by the ColumnTransformer below.
numeric_cols = ["BMI", "PhysicalHealth", "MentalHealth", "SleepTime"]
nominal_cols = ["Race"]

# Standardise the continuous measurements.
scaling_step = ("num", StandardScaler(), numeric_cols)

# One-hot encode Race as a dense array; categories unseen during fit are
# ignored at transform time rather than raising.
race_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoding_step = ("race", race_encoder, nominal_cols)

preprocessor = ColumnTransformer(
    transformers=[scaling_step, encoding_step],
    remainder="passthrough",
)


In [None]:
# Separate the feature matrix from the target column.
X = df.drop("HeartDisease", axis="columns")
y = df["HeartDisease"]

# Fit the preprocessing pipeline on the features and transform them in one go.
X_processed = preprocessor.fit_transform(X)

print("✅ X_processed shape:", X_processed.shape)


In [None]:
# Column names produced by the fitted one-hot encoder for the nominal columns.
fitted_race_encoder = preprocessor.named_transformers_["race"]
race_feature_names = fitted_race_encoder.get_feature_names_out(nominal_cols)

# Rebuild the header by hand: scaled numerics first, then the one-hot columns,
# then every remaining (passthrough) column in its original order.
# Assumes the transformer output follows that same order.
transformed_inputs = set(numeric_cols) | set(nominal_cols)
passthrough_cols = [name for name in X.columns if name not in transformed_inputs]
new_columns = [*numeric_cols, *race_feature_names, *passthrough_cols]

df_processed = pd.DataFrame(X_processed, columns=new_columns)

# Re-attach the label as the first column.
df_processed.insert(0, "HeartDisease", y.values)

df_processed.head()


In [None]:
# Write the processed frame to CSV without the positional index column.
OUTPUT_FILE = "heart_2020_processed.csv"
df_processed.to_csv(path_or_buf=OUTPUT_FILE, index=False)

# Confirm where the file went and the shape of what was written.
saved_shape = df_processed.shape
print(f"✅ Đã chuẩn hóa xong và lưu ra file: {OUTPUT_FILE}")
print("Shape dữ liệu mới:", saved_shape)
