***FIRST MODEL***


Objective:
To evaluate whether resting blood pressure and cholesterol can predict cardiovascular disease.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Location of the heart-disease CSV. Set the HEART_CSV environment variable to
# point at another copy of the file; when it is unset, the original local path
# is used so existing runs behave exactly as before.
import os

HEART_CSV_PATH = os.environ.get("HEART_CSV", '/Users/jussaragaspar/Downloads/heart.csv')
df = pd.read_csv(HEART_CSV_PATH)



In [3]:
# Preview the first rows of the loaded frame to check the columns and value formats.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Share of each HeartDisease class (normalize=True returns proportions, not counts).
df["HeartDisease"].value_counts(normalize=True)

HeartDisease
1    0.553377
0    0.446623
Name: proportion, dtype: float64

Balance check

The classes are fairly balanced (about 55% positive vs. 45% negative), which is good for the model.

In [5]:
# Predictors are the two resting measurements; the target is the HeartDisease
# flag cast to int.
feature_cols = ["RestingBP", "Cholesterol"]
X = df[feature_cols]
y = df["HeartDisease"].astype(int)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:

from sklearn.ensemble import HistGradientBoostingClassifier
# Gradient-boosted tree model with shallow trees, fitted on the training split.
# NOTE(review): `base` is fitted here but is not evaluated in the cells shown.
hgb_params = {
    "max_depth": 3,
    "learning_rate": 0.05,
    "max_iter": 500,
    "min_samples_leaf": 20,
    "random_state": 42,
}
base = HistGradientBoostingClassifier(**hgb_params)
base.fit(X_train, y_train)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardize the two features, then fit a logistic regression on them.
# Wrapping both steps in a Pipeline means the scaler is fitted on the data
# passed to fit() only.
scaler_step = ("scaler", StandardScaler())
clf_step = ("clf", LogisticRegression(solver="lbfgs"))

model = Pipeline(steps=[scaler_step, clf_step])
model.fit(X_train, y_train)

In [8]:
# NOTE(review): this constructs a class-weighted LogisticRegression but never
# assigns or fits it, so the cell only displays the estimator's repr. The
# `model` pipeline is not modified by this line.
LogisticRegression(class_weight="balanced")

In [9]:
import numpy as np
from sklearn.metrics import (roc_auc_score, average_precision_score,classification_report)
def evaluate_model(name, model, X_train, y_train, X_test, y_test, plot_curves=True):
    """Fit ``model`` on the training data and print test-set metrics.

    The estimator is (re)fitted on ``X_train``/``y_train`` every time this
    function is called. The printed report contains the ROC AUC and average
    precision computed from the second column of ``predict_proba`` on
    ``X_test``, followed by a classification report based on ``predict``.

    Parameters
    ----------
    name : label printed in the report header.
    model : estimator exposing ``fit``, ``predict_proba`` and ``predict``.
    X_train, y_train : data used to fit the estimator.
    X_test, y_test : data used to score the estimator.
    plot_curves : accepted for interface compatibility; the body does not
        read it, so no curves are drawn.

    Returns
    -------
    None. Results are only printed.
    """
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, positive_scores)
    pr_auc = average_precision_score(y_test, positive_scores)
    hard_labels = model.predict(X_test)

    print(f"Model: {name}")
    for label, value in (("ROC AUC:", roc_auc), ("Avg Precision (PR AUC):", pr_auc)):
        print(label, round(value, 3))
    print("\nClassification report:\n", classification_report(y_test, hard_labels))
# Score the logistic-regression pipeline on the held-out split.
# Note that evaluate_model calls model.fit again on the training data.
evaluate_model("Logistic Regression", model, X_train, y_train, X_test, y_test)





Model: Logistic Regression
ROC AUC: 0.639
Avg Precision (PR AUC): 0.706

Classification report:
               precision    recall  f1-score   support

           0       0.53      0.39      0.45        82
           1       0.60      0.73      0.65       102

    accuracy                           0.58       184
   macro avg       0.57      0.56      0.55       184
weighted avg       0.57      0.58      0.56       184



**RECALL CLASS 0**  The model correctly identifies only 39% of negative cases,
so it falsely labels many negatives as positive.

***RECALL CLASS 1***
The model correctly detects 73% of positive cases,
which is decent.

If you randomly pick one positive case and one negative case, the model ranks the positive case higher about 63.9% of the time.

With a ROC AUC of about 0.64, the model is learning something from resting BP and cholesterol, but the signal is limited.


In [13]:
import numpy as np
from sklearn.metrics import f1_score

# Search for the decision threshold that maximizes F1 for the positive class.
#
# The threshold has to be applied to the model's predicted probabilities.
# Comparing the true 0/1 labels against a threshold in (0, 1) just reproduces
# the labels, which gives F1 = 1.0 for every threshold and always reports the
# first candidate.
proba_test = model.predict_proba(X_test)[:, 1]

thresholds = np.linspace(0.1, 0.9, 50)
best_f1 = 0
best_t = 0

for t in thresholds:
    preds = (proba_test >= t).astype(int)
    # zero_division=0 keeps the score at 0 (without a warning) when a high
    # threshold leaves no positive predictions.
    f1 = f1_score(y_test, preds, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_t = t

# NOTE(review): the threshold is chosen on the test split, so the resulting F1
# is an optimistic estimate; a separate validation split would be needed for an
# unbiased figure.
print("Best threshold:", best_t)


Best threshold: 0.1


In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance on the held-out split: each feature is shuffled
# 30 times and the change in ROC AUC is recorded.
perm = permutation_importance(
    model,
    X_test,
    y_test,
    scoring="roc_auc",
    n_repeats=30,
    random_state=42,
)

# One row per feature, largest mean drop first.
importance_columns = {
    "feature": X.columns,
    "importance_mean_drop_auc": perm.importances_mean,
    "importance_std": perm.importances_std,
}
imp = pd.DataFrame(importance_columns).sort_values(
    by="importance_mean_drop_auc", ascending=False
)
print("\nPermutation importance (drop in ROC AUC when shuffled):\n", imp)



Permutation importance (drop in ROC AUC when shuffled):
        feature  importance_mean_drop_auc  importance_std
1  Cholesterol                  0.104846        0.035343
0    RestingBP                  0.048669        0.019076


A logistic regression model using resting blood pressure and cholesterol achieved modest discrimination (ROC AUC = 0.639). While the model demonstrates some predictive signal, its performance indicates that additional clinical features are required for strong cardiovascular risk prediction.