# Heart Disease Risk Assessment - Model Training

This notebook loads the heart disease dataset, preprocesses it, trains a Random Forest model, and saves the trained model together with its metadata for use in the Flask application.


## 1. Install and Import Dependencies


In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import pickle
import os
import warnings
warnings.filterwarnings('ignore')


## 2. Load Dataset


In [None]:
# Load the heart disease table from Kaggle into a pandas DataFrame.
#
# The pandas adapter selects its reader from the file extension, so the path
# argument must name a concrete file inside the dataset; an empty string has
# no extension and is rejected before anything is read.
# NOTE(review): "heart.csv" is the file this dataset is expected to ship --
# confirm against the dataset's file listing on Kaggle.
# NOTE(review): newer kagglehub releases expose this call as `dataset_load`
# and deprecate `load_dataset` -- confirm against the installed version.
DATASET_HANDLE = "johnsmith88/heart-disease-dataset"
DATASET_FILE = "heart.csv"

df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    DATASET_HANDLE,
    DATASET_FILE,
)

print("Dataset shape:", df.shape)
print("\nFirst 5 records:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nColumn names:")
print(df.columns.tolist())
print("\nBasic statistics:")
print(df.describe())


## 3. Data Exploration and Preprocessing


In [None]:
# Count missing values per column so gaps are visible before training.
print("Missing values:")
print(df.isnull().sum())

# Report the class balance of the label column.
# The candidate names mirror the list used when the data is prepared for
# training, so this cell and the preparation step always agree on which
# column is treated as the target.
target_candidates = ['target', 'HeartDisease', 'heart_disease', 'Heart Disease']
found_target = next(
    (name for name in target_candidates if name in df.columns),
    None,
)

if found_target is not None:
    print("\nTarget distribution:")
    print(df[found_target].value_counts())
else:
    # No known label name is present: list the columns so the reader can
    # pick the target by eye.
    print("\nLooking for target column...")
    print(df.columns.tolist())


## 4. Prepare Data for Training


In [None]:
def _needs_label_encoding(values):
    """Return True when `values` holds text that must be mapped to integers.

    Covers plain ``object`` columns as well as pandas' dedicated string
    dtypes; a bare ``dtype == 'object'`` comparison does not match the
    latter, which would leave raw strings in the data handed to the model.
    Numeric and boolean columns return False and are left untouched.
    """
    return (
        pd.api.types.is_object_dtype(values)
        or pd.api.types.is_string_dtype(values)
    )


# Identify the target column by trying the common label names in order.
target_col = next(
    (
        name
        for name in ['target', 'HeartDisease', 'heart_disease', 'Heart Disease']
        if name in df.columns
    ),
    None,
)

if target_col is None:
    # None of the known names is present: fall back to the last column.
    target_col = df.columns[-1]
    print(f"Using last column '{target_col}' as target")

# Separate features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Encode text feature columns as integers, keeping one fitted encoder per
# column so the same mapping can be re-applied at prediction time.
label_encoders = {}
X_encoded = X.copy()

for col in X_encoded.columns:
    if _needs_label_encoding(X_encoded[col]):
        le = LabelEncoder()
        # Cast to str first so mixed-type object columns sort consistently.
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
        label_encoders[col] = le

# Ensure target is numeric; le_target stays None when no encoding was needed.
if _needs_label_encoding(y):
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)
    print(f"Target encoded. Classes: {le_target.classes_}")
else:
    le_target = None

print(f"\nFeatures shape: {X_encoded.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {X_encoded.columns.tolist()}")
# y may be a Series or, after encoding, a NumPy array; wrapping it in a
# Series handles both.
print(f"\nTarget distribution: {pd.Series(y).value_counts().to_dict()}")


## 5. Split Data and Train Model


In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on y so
# both partitions keep a similar class mix, and seeded so it is repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Random Forest hyperparameters, gathered in one place for easy tuning.
forest_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**forest_params)

print("\nTraining Random Forest model...")
rf_model.fit(X_train, y_train)
print("Training completed!")


## 6. Evaluate Model


In [None]:
# Score the held-out rows and report overall accuracy.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class report first, then the raw confusion matrix.
for heading, metric_fn in (
    ("\nClassification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test, y_pred))

# Pair every feature with the importance the forest assigned to it and
# rank the table from most to least important.
importance_table = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


## 7. Save Model and Metadata


In [None]:
# Output locations for the trained model and its companion metadata.
model_path = 'models/heart_disease_model.pkl'
metadata_path = 'models/model_metadata.pkl'

# Make sure the output directory exists before anything is written.
os.makedirs('models', exist_ok=True)

# Persist the fitted classifier.
with open(model_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print(f"Model saved to {model_path}")

# Bundle the encoders, column names, importances and accuracy so they are
# stored alongside the model in a single file.
feature_names = X_encoded.columns.tolist()
metadata = dict(
    feature_columns=feature_names,
    label_encoders=label_encoders,
    target_encoder=le_target,
    original_columns=X.columns.tolist(),
    target_column=target_col,
    feature_importance=feature_importance.to_dict('records'),
    accuracy=float(accuracy),
)

with open(metadata_path, 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)

print(f"Metadata saved to {metadata_path}")
print(f"\nModel Accuracy: {accuracy:.4f}")
print(f"\nFeatures used for prediction: {len(feature_names)}")
print(f"Feature names: {feature_names}")
