In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# -----------------------------
# Load dataset
# -----------------------------
df = pd.read_csv("Dataset/heart_disease_uci.csv")

# -----------------------------
# Drop ID column if present
# -----------------------------
if "id" in df.columns:
    df.drop(columns=["id"], inplace=True)

# -----------------------------
# Replace '?' with NaN (common in UCI data)
# -----------------------------
df.replace("?", np.nan, inplace=True)

# -----------------------------
# Encode ALL categorical (object) columns
# -----------------------------
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# -----------------------------
# Handle missing values
# -----------------------------
df.fillna(df.median(numeric_only=True), inplace=True)

# -----------------------------
# Target handling
# -----------------------------
# UCI heart dataset uses 'num' as target
X = df.drop("num", axis=1)
y = df["num"].apply(lambda x: 1 if x > 0 else 0)

# -----------------------------
# Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# -----------------------------
# Feature scaling
# -----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Heart Disease dataset cleaned successfully")
print("Train shape:", X_train_scaled.shape)
print("Test shape:", X_test_scaled.shape)


✅ Heart Disease dataset cleaned successfully
Train shape: (736, 14)
Test shape: (184, 14)


In [3]:
import pandas as pd

y_train_series = pd.Series(y_train)
y_test_series = pd.Series(y_test)

print("Training set distribution:")
print(y_train_series.value_counts())
print("\nTraining set percentage:")
print(y_train_series.value_counts(normalize=True) * 100)

print("\nTest set distribution:")
print(y_test_series.value_counts())
print("\nTest set percentage:")
print(y_test_series.value_counts(normalize=True) * 100)


Training set distribution:
num
1    407
0    329
Name: count, dtype: int64

Training set percentage:
num
1    55.298913
0    44.701087
Name: proportion, dtype: float64

Test set distribution:
num
1    102
0     82
Name: count, dtype: int64

Test set percentage:
num
1    55.434783
0    44.565217
Name: proportion, dtype: float64


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# -----------------------------
# Train model
# -----------------------------
heart_model = LogisticRegression(
    max_iter=1000,
    random_state=42
)

heart_model.fit(X_train_scaled, y_train)

# -----------------------------
# Predictions
# -----------------------------
y_pred = heart_model.predict(X_test_scaled)
y_prob = heart_model.predict_proba(X_test_scaled)[:, 1]

# -----------------------------
# Evaluation
# -----------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


Accuracy: 0.8532608695652174

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83        82
           1       0.85      0.89      0.87       102

    accuracy                           0.85       184
   macro avg       0.85      0.85      0.85       184
weighted avg       0.85      0.85      0.85       184

ROC-AUC: 0.905428024868484


In [5]:
import joblib

joblib.dump(heart_model, "heart_disease_model.pkl")
joblib.dump(scaler, "heart_scaler.pkl")

print("✅ Heart disease model saved successfully")


✅ Heart disease model saved successfully
