In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Load Processed Data

In [2]:
# Load the processed heart dataset from its CSV file.
processed_csv = "../data/processed/heart_cleaned.csv"
df = pd.read_csv(processed_csv)

# Preview the first five rows (the default for head) to check the columns.
df.head(n=5)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,age_squared,chol_age_ratio,bp_age_ratio,cardiac_stress_score,vessel_severity,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,2704,4.076923,2.403846,1.005917,6,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,2809,3.830189,2.641509,4.10641,0,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,4900,2.485714,2.071429,3.607937,0,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,3721,3.327869,2.42623,0.006173,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,3844,4.741935,2.225806,1.909346,6,0


# Split Features & Target

In [3]:
# Separate the predictors (X) from the label (y).
#
# "Unnamed: 0" is not a clinical measurement: in the preview above it simply
# counts 0, 1, 2, ... alongside the DataFrame index, which is what pandas
# produces when a CSV was saved together with its row index. Leaving it in X
# would let the models split on an arbitrary row number, so it is excluded
# together with the target column.
# errors="ignore" keeps this cell working if the CSV is later re-exported
# without that index column; a missing "target" still fails loudly on the
# y = df["target"] line below.
NON_FEATURE_COLUMNS = ["target", "Unnamed: 0"]

X = df.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
y = df["target"]


# Train / Test Split

In [4]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


# Feature Scaling

In [5]:
# Learn the scaling statistics from the training rows only, so nothing from
# the test rows influences the transform.
scaler = StandardScaler().fit(X_train)

# Apply the same training-derived transform to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Model 1: Logistic Regression

* Train Logistic Regression

In [18]:
# Baseline classifier trained on the standardized features; max_iter=1000
# allows the solver up to 1000 iterations to converge.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Echo the fitted estimator so the cell still displays its parameters.
lr_model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


* Evaluate Logistic Regression

In [19]:
# Predict labels for the held-out rows using the standardized test features.
lr_preds = lr_model.predict(X_test_scaled)

# Overall fraction of correct predictions on the test split.
lr_accuracy = accuracy_score(y_test, lr_preds)
print("Logistic Regression Accuracy:", lr_accuracy)

# Confusion matrix (rows = true class, columns = predicted class), followed
# by the per-class precision / recall / F1 report.
lr_confusion = confusion_matrix(y_test, lr_preds)
lr_report = classification_report(y_test, lr_preds)
print(lr_confusion)
print(lr_report)


Logistic Regression Accuracy: 0.824390243902439
[[74 26]
 [10 95]]
              precision    recall  f1-score   support

           0       0.88      0.74      0.80       100
           1       0.79      0.90      0.84       105

    accuracy                           0.82       205
   macro avg       0.83      0.82      0.82       205
weighted avg       0.83      0.82      0.82       205



# Model 2: Random Forest

* Configure Random Forest

In [20]:
# Constrained forest configuration: a depth cap plus minimum sample counts
# for splits and leaves limit how closely each tree can fit the training
# data. The fixed random_state makes the fitted forest reproducible.
rf_params = {
    "n_estimators": 200,
    "max_depth": 5,
    "min_samples_split": 10,
    "min_samples_leaf": 5,
    "random_state": 42,
}

# The estimator is only instantiated here; it is fitted in a later cell.
rf_model = RandomForestClassifier(**rf_params)


* Train and Evaluate Random Forest

In [25]:
# Fit on the unscaled training features (the scaled arrays are only used by
# the Logistic Regression model above).
rf_model.fit(X_train, y_train)

# Predict labels for the held-out rows, again using the unscaled features.
rf_preds = rf_model.predict(X_test)

# Overall fraction of correct predictions on the test split.
rf_accuracy = accuracy_score(y_test, rf_preds)
print("Random Forest Accuracy:", rf_accuracy)

# Confusion matrix (rows = true class, columns = predicted class), followed
# by the per-class precision / recall / F1 report.
rf_confusion = confusion_matrix(y_test, rf_preds)
rf_report = classification_report(y_test, rf_preds)
print(rf_confusion)
print(rf_report)


Random Forest Accuracy: 0.9219512195121952
[[ 89  11]
 [  5 100]]
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       100
           1       0.90      0.95      0.93       105

    accuracy                           0.92       205
   macro avg       0.92      0.92      0.92       205
weighted avg       0.92      0.92      0.92       205



Constraining the Random Forest (`max_depth=5`, `min_samples_split=10`, `min_samples_leaf=5`) reduced overfitting and improved the model's generalization, resulting in a more realistic test accuracy of about 92%.

# Cross Validation

In [26]:
# 5-fold cross-validated accuracy for the Random Forest configuration.
# cross_val_score is imported in the notebook's import cell at the top, so
# the notebook's dependencies are all declared in one place.
#
# NOTE: the folds are drawn from the full dataset (X, y), which includes the
# rows held out as X_test above, so this is a separate estimate from the
# train/test evaluation rather than a check performed on the training split
# only. cross_val_score fits clones of rf_model, so the estimator fitted in
# the previous cell is left untouched.
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring="accuracy")

# Mean and spread of the per-fold accuracies.
print("CV Accuracy Mean:", cv_scores.mean())
print("CV Accuracy Std:", cv_scores.std())


CV Accuracy Mean: 0.9151219512195121
CV Accuracy Std: 0.018917775053332003


## Model Training Summary

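- Logistic Regression, trained on standardized features, was used as a baseline model and reached about 0.82 test accuracy in the run shown above.
- Random Forest achieved higher test accuracy (about 0.92 in the run shown above), plausibly by capturing non-linear relationships that a linear model cannot.
- The comparison is intended to highlight the impact of the engineered risk-based features (`age_squared`, `chol_age_ratio`, `bp_age_ratio`, `cardiac_stress_score`, `vessel_severity`); note that both models use the same feature set, so quantifying that impact requires an additional run without those features.
- The selected model will be used for patient risk scoring.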


## Overfitting Analysis

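- The initial Random Forest model achieved perfect accuracy, indicating overfitting.
- Model complexity was reduced using depth and sample constraints (`max_depth=5`, `min_samples_split=10`, `min_samples_leaf=5`).
- 5-fold cross-validation was applied to estimate real-world performance (mean accuracy ≈ 0.915, standard deviation ≈ 0.019 in the run shown above).
- The final model demonstrates better generalization and reliability: its cross-validated accuracy is close to its held-out test accuracy (≈ 0.92).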
