In [None]:
%pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn joblib

Import the libraries needed

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix)
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib

Step 1: Load dataset

In [None]:
# Read the clinical records into `df` and print a quick profile of the raw table.
df = pd.read_csv('heart_failure_clinical_records.csv')

# (label, value) pairs, printed in order.
overview = [
    ("Dataset Shape:", df.shape),
    ("\nMissing Values:\n", df.isnull().sum()),
    ("\nClass Balance:\n", df['DEATH_EVENT'].value_counts(normalize=True)),
    ("\nSummary Statistics:\n", df.describe()),
]
for label, value in overview:
    print(label, value)

Step 2: Data preprocessing

In [None]:
# Continuous columns that get outlier clipping and, when strongly
# right-skewed, a log transform.
numeric_cols = ['age', 'creatinine_phosphokinase', 'platelets', 'serum_creatinine',
                'serum_sodium', 'ejection_fraction', 'time']

for col in numeric_cols:
    # Winsorize: pull values outside the 1st/99th percentile back to those bounds.
    # NOTE(review): the bounds are computed from every row of `df`; if `df` is
    # split into train/test afterwards, held-out rows influence this step.
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower, upper)

    # Measure skewness BEFORE transforming so the message reports the value
    # that actually triggered the transform (not the post-transform skew).
    skew_before = df[col].skew()

    # log1p is only applied to strictly positive, right-skewed columns.
    if skew_before > 1 and (df[col] > 0).all():
        df[col] = np.log1p(df[col])
        print(f"Log-transformed {col} due to skewness: {skew_before:.2f}")

Step 3: Feature engineering

In [7]:
# --- Age bands: replace the raw age column with a three-level category ---
bins = [0, 50, 70, 120]
df['age_group'] = pd.cut(
    df['age'],
    bins=bins,
    labels=['YOUNG', 'MID', 'OLD'],
    include_lowest=True,
)
df = df.drop(columns='age')

# --- Serum creatinine relative to ejection fraction ---
# A zero denominator is swapped for machine epsilon to avoid division by zero.
safe_ejection_fraction = df['ejection_fraction'].replace(0, np.finfo(float).eps)
df['creatinine_ejection_ratio'] = df['serum_creatinine'].div(safe_ejection_fraction)

# --- Follow-up time bands: cut at the 33% and 66% quantiles, then drop the raw column ---
time_labels = ['SHORT', 'MEDIUM', 'LONG']
time_bins = [0, *df['time'].quantile([0.33, 0.66]), df['time'].max()]
df['time_group'] = pd.cut(
    df['time'],
    bins=time_bins,
    labels=time_labels,
    include_lowest=True,
)
df = df.drop(columns='time')

# --- Number of comorbidity flags set per patient ---
comorbidity_flags = ['diabetes', 'high_blood_pressure', 'anaemia']
df['comorbidity_count'] = df[comorbidity_flags].sum(axis=1)

# --- One-hot encode the two binned columns ---
binned_cols = ['age_group', 'time_group']
df = pd.get_dummies(df, columns=binned_cols, prefix=binned_cols)


Step 4: Train/test split

In [None]:
# Separate the target column from the predictors.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

# Hold out 20% of the rows; stratifying on y keeps the class ratio the same
# in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nTraining Set Size:", len(X_train), "rows")
print("Test Set Size:", len(X_test), "rows")

Step 5: Scale numeric features

In [10]:
# Continuous columns to standardize (zero mean, unit variance).
numeric_cols = [
    'creatinine_phosphokinase',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'ejection_fraction',
    'creatinine_ejection_ratio',
]

# Learn the mean/std from the training rows only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train[numeric_cols])
X_train_numeric = scaler.transform(X_train[numeric_cols])
X_test_numeric = scaler.transform(X_test[numeric_cols])

Step 6: Combine features

In [11]:
# Columns appended without scaling. Note that `comorbidity_count` is a count
# of flags rather than a 0/1 indicator.
binary_cols = ['sex', 'smoking', 'diabetes', 'anaemia', 'high_blood_pressure', 'comorbidity_count']

# Dummy columns produced by one-hot encoding the age/time bands.
categorical_cols = [col for col in df.columns if col.startswith(('age_group', 'time_group'))]

# Column order of the assembled matrices: scaled numerics, then flags, then dummies.
feature_names = numeric_cols + binary_cols + categorical_cols

# Replace the DataFrames with plain arrays laid out in `feature_names` order.
X_train = np.concatenate(
    [X_train_numeric, X_train[binary_cols].to_numpy(), X_train[categorical_cols].to_numpy()],
    axis=1,
)
X_test = np.concatenate(
    [X_test_numeric, X_test[binary_cols].to_numpy(), X_test[categorical_cols].to_numpy()],
    axis=1,
)

Step 7: Handle class imbalance with SMOTE, feature selection with RFE

In [None]:
# Oversample the minority class; only the training split is resampled.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
balanced_counts = pd.Series(y_train_balanced).value_counts()
print("\nAfter SMOTE - Class Balance:\n", balanced_counts)

# Recursive feature elimination: keep the 8 columns a logistic regression ranks highest.
model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=8)
rfe.fit(X_train_balanced, y_train_balanced)

# Reduce both matrices to the retained columns.
X_train_balanced = rfe.transform(X_train_balanced)
X_test = rfe.transform(X_test)

# Map the boolean support mask back to column names.
selected_features = [name for name, kept in zip(feature_names, rfe.support_) if kept]
print("\nSelected Features:", selected_features)

Step 8: Train the model and print the results

In [None]:
# Fit on the balanced, feature-selected training data.
model.fit(X_train_balanced, y_train_balanced)

# Hard labels and positive-class probabilities for the held-out rows.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Metrics computed from the hard labels, printed in a fixed order.
label_metrics = [
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
]
for label, metric in label_metrics:
    print(label, metric(y_test, y_pred))

# ROC-AUC is computed from the probabilities rather than the labels.
print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))

Perform cross-validation to get the mean F1 score

In [None]:
# Cross-validate on the ORIGINAL training rows (`X_train`, `y_train`), not on
# the SMOTE-resampled / RFE-reduced matrix. Resampling and feature selection
# that were fitted on the whole training set leak information into every
# validation fold (synthetic points are interpolated from rows that end up in
# the validation part), which inflates the F1 estimate.
#
# Wrapping the steps in an imbalanced-learn Pipeline re-fits SMOTE, RFE and the
# classifier inside each training fold; the sampler is skipped at prediction
# time, so each validation fold keeps its real class distribution.
# `clone` yields unfitted copies with the same hyperparameters, leaving the
# already-fitted `smote`, `rfe` and `model` objects untouched.
cv_pipeline = ImbPipeline(steps=[
    ('smote', clone(smote)),
    ('rfe', clone(rfe)),
    ('clf', clone(model)),
])

cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='f1')
print("\n5-Fold CV F1-Scores:", cv_scores)
print("Mean CV F1-Score:", cv_scores.mean())



Feature importance with coefficients

In [None]:
# Pair each retained feature with its fitted logistic-regression weight.
fitted_weights = model.coef_[0]
coefficients = pd.DataFrame({'Feature': selected_features, 'Coefficient': fitted_weights})

# Largest positive coefficient first.
ranked_coefficients = coefficients.sort_values(by='Coefficient', ascending=False)
print("\nFeature Importance (Coefficients):\n", ranked_coefficients)

View ROC Curve and confusion matrix

In [None]:
# --- ROC curve with the chance diagonal for reference ---
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc_value = roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {auc_value:.2f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

# --- Confusion matrix as an annotated heatmap ---
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

Save LR model with joblib

In [None]:
# The classifier expects inputs that were standardized by `scaler`, laid out
# in `feature_names` order, and reduced by `rfe`. Persist those fitted objects
# alongside it so the saved model can be applied to new data.
joblib.dump(
    {
        'scaler': scaler,                        # fitted StandardScaler
        'scaled_columns': numeric_cols,          # columns the scaler was fitted on
        'feature_names': feature_names,          # column order before RFE
        'rfe': rfe,                              # fitted feature selector
        'selected_features': selected_features,  # names of the columns RFE kept
    },
    'logistic_regression_preprocessing.pkl',
)

# The classifier itself (same file name as before).
# NOTE: joblib files are pickles - only load them from trusted sources.
joblib.dump(model, 'logistic_regression_model.pkl')