Data Loading 

In [20]:
import pandas as pd

# Read the raw readmissions table; the path is relative to the working directory.
csv_path = "hospital_readmissions.csv"
df = pd.read_csv(csv_path)

# Trailing expression: show (rows, columns) as the cell output.
df.shape



(25000, 17)

Encoding

In [21]:
# One-hot encode the non-numeric columns of df. drop_first=True drops the first
# level of every encoded column, so k categories become k-1 indicator columns.
df_encoded = pd.get_dummies(data=df, drop_first=True)

# Trailing expression: show the shape after encoding.
df_encoded.shape


(25000, 46)

Features (X) and target (y)


In [22]:
# The target is the "readmitted_yes" indicator column of the encoded frame;
# every remaining column is used as a feature.
target_col = "readmitted_yes"
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

# Trailing expression: show both shapes as a tuple.
(X.shape, y.shape)


((25000, 45), (25000,))

Train-test split

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of y the same in both parts; random_state=42 makes the shuffle reproducible.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression baseline on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out rows (default decision rule).
y_pred = lr.predict(X_test)

# Report accuracy first, then the confusion matrix and the per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred))
for summary in (confusion_matrix, classification_report):
    print(summary(y_test, y_pred))


Accuracy: 0.6088
[[2074  575]
 [1381  970]]
              precision    recall  f1-score   support

       False       0.60      0.78      0.68      2649
        True       0.63      0.41      0.50      2351

    accuracy                           0.61      5000
   macro avg       0.61      0.60      0.59      5000
weighted avg       0.61      0.61      0.59      5000



Random Forest (optional)

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 300 trees. class_weight="balanced" reweights the classes
# inversely to their frequency in y_train; random_state fixes the seed.
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_rf = rf.predict(X_test)

# Same three-part report as for the logistic-regression model.
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
for summary in (confusion_matrix, classification_report):
    print(summary(y_test, y_pred_rf))


Accuracy: 0.5984
[[1822  827]
 [1181 1170]]
              precision    recall  f1-score   support

       False       0.61      0.69      0.64      2649
        True       0.59      0.50      0.54      2351

    accuracy                           0.60      5000
   macro avg       0.60      0.59      0.59      5000
weighted avg       0.60      0.60      0.59      5000



Threshold tuning (for Logistic Regression)

In [26]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Predicted probability of the positive class (column 1 of predict_proba)
# for every row of the test split.
proba = lr.predict_proba(X_test)[:, 1]

# Sweep a few decision thresholds and print precision / recall / F1 for each.
# NOTE(review): the sweep is scored on X_test / y_test; if a threshold is then
# picked from this table, the test metrics for it are optimistic. Consider
# sweeping on a separate validation split instead.
for t in [0.3, 0.4, 0.5, 0.6]:
    # Rows whose probability reaches the threshold are labelled 1, the rest 0.
    pred = (proba >= t).astype(int)

    precision = round(precision_score(y_test, pred), 2)
    recall = round(recall_score(y_test, pred), 2)
    f1 = round(f1_score(y_test, pred), 2)

    print("Threshold:", t, "Precision:", precision, "Recall:", recall, "F1:", f1)


Threshold: 0.3 Precision: 0.48 Recall: 0.99 F1: 0.65
Threshold: 0.4 Precision: 0.54 Recall: 0.75 F1: 0.63
Threshold: 0.5 Precision: 0.63 Recall: 0.41 F1: 0.5
Threshold: 0.6 Precision: 0.69 Recall: 0.22 F1: 0.34


In [27]:
# Final decision threshold applied to the logistic-regression probabilities:
# rows with proba >= threshold are labelled 1 (readmitted), the rest 0.
threshold = 0.4
y_pred_thr = (proba >= threshold).astype(int)

# Evaluate the thresholded predictions with the same three-part report used
# for the models, so the chosen operating point has its own recorded metrics
# instead of being computed and never inspected.
print("Threshold:", threshold)
print("Accuracy:", accuracy_score(y_test, y_pred_thr))
print(confusion_matrix(y_test, y_pred_thr))
print(classification_report(y_test, y_pred_thr))


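Final threshold: 0.4. Compared with the default 0.5 cut-off, this raises recall for the readmitted class from 0.41 to 0.75 while precision drops from 0.63 to 0.54, so more patients at risk of readmission are flagged. Note that the threshold was chosen from metrics computed on the test set, so these figures are likely optimistic; selecting it on a separate validation split would give an unbiased estimate.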
