In [1]:
# Cell 1: Import Libraries
%pip install seaborn

import pandas as pd
import numpy as np
import seaborn as sns # Often used for data exploration, though not strictly needed for the core ML task
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # Using RandomForest as it performed well
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib




# Load the heart-failure clinical records CSV (path is relative to the
# notebook's working directory).
df = pd.read_csv("heart_failure_clinical_records_dataset (1).csv")


# Quick look at the first few raw records.
print("First 5 rows of the dataset:")
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
print("\nInformation about the dataset:")
df.info()

print("\nDescriptive statistics of the dataset:")
print(df.describe())

# Per-column count of missing values, to check completeness before modelling.
print("\nMissing values per column:")
print(df.isnull().sum())


# Cell 4: Prepare Data for Training
# The DEATH_EVENT column is the target; every remaining column is a feature.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

# Hold out 20% of the rows for testing. Stratifying on y keeps the target
# proportions the same in both partitions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each partition, one per line.
print(
    f"\nShape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)


# Cell 5: Feature Scaling
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep the feature column names for later visualization or interpretation.
feature_names = X.columns.to_list()
print("\nFeatures scaled successfully.")


# Cell 6: Train the Machine Learning Model
# Random forest with 100 trees (n_estimators); random_state is pinned for
# reproducibility. fit() returns the fitted estimator, so construction and
# training on the scaled training data are chained into one expression.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

print("\nRandomForestClassifier model trained successfully.")


# Cell 7: Evaluate the Model
# Predict labels for the held-out (scaled) test features.
y_pred = model.predict(X_test_scaled)

# Overall accuracy, printed to four decimal places.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Classification report and confusion matrix for the same predictions. Each
# header is printed before its metric is computed, matching the original order.
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("\nConfusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)


# Cell 8: Save the Trained Model and Scaler
# Output paths, relative to the current working directory.
model_filename, scaler_filename = 'heart_failure_model.pkl', 'scaler.pkl'

# Persist each fitted object with joblib: the model first, then the scaler.
for fitted_obj, out_path in ((model, model_filename), (scaler, scaler_filename)):
    joblib.dump(fitted_obj, out_path)

# Confirm where each artifact was written.
print(
    f"\nModel saved as '{model_filename}'",
    f"Scaler saved as '{scaler_filename}'",
    sep="\n",
)


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
First 5 rows of the dataset:
    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smokin