In [1]:
# load and prepare data
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('heart_2020_uncleaned.csv')

# drop rows with missing values
df_clean = df.dropna()

# Encode the target variable: HeartDisease (Yes = 1, No = 0)
df_clean["HeartDisease"] = df_clean["HeartDisease"].map({"Yes": 1, "No": 0})

# Encode binary categorical columns using LabelEncoder
binary_cols = ["Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "Sex",
               "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer"
               ]

le = LabelEncoder()
for col in binary_cols:
    df_clean[col] = le.fit_transform(df_clean[col])

# One-hot encode multi class categorical columns
df_encoded = pd.get_dummies(
    df_clean,
    columns=["Diabetic", "GenHealth", "AgeCategory", "Race"],
    drop_first=True # avoid dummy variable trap
)

# check the result
print("Cleaned & Encoded Dataset Shape:", df_encoded.shape)
print(df_encoded.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["HeartDisease"] = df_clean["HeartDisease"].map({"Yes": 1, "No": 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = le.fit_transform(df_clean[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col] = le.fit_transform(df_clean[col])
A value is trying to be set on a

Cleaned & Encoded Dataset Shape: (319426, 38)
   HeartDisease    BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  \
0             0  16.60        1                0       0             3.0   
1             0  20.34        0                0       1             0.0   
2             0  26.58        1                0       0            20.0   
3             0  24.21        0                0       0             0.0   
4             0  23.71        0                0       0            28.0   

   MentalHealth  DiffWalking  Sex  PhysicalActivity  ...  AgeCategory_60-64  \
0          30.0            0    0                 1  ...              False   
1           0.0            0    0                 1  ...              False   
2          30.0            0    1                 1  ...              False   
3           0.0            0    0                 0  ...              False   
4           0.0            1    0                 1  ...              False   

   AgeCategory_65-69  

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

x = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]

# split data
X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2, random_state=42)

# train model
logreg_model = LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced')
logreg_model.fit(X_train, y_train)

# evaluate
y_pred = logreg_model.predict(X_test)
y_proba = logreg_model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC AUC Score: ", roc_auc_score(y_test, y_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.75      0.84     58418
           1       0.22      0.78      0.35      5468

    accuracy                           0.75     63886
   macro avg       0.60      0.76      0.60     63886
weighted avg       0.91      0.75      0.80     63886

ROC AUC Score:  0.8426690208920635


In [3]:
# save the trained model
import joblib

joblib.dump(logreg_model, "heart_disease_model.pkl")

# save column names
joblib.dump(X_train.columns.tolist(), "model_features.pkl")

['model_features.pkl']