# Heart Disease Prediction using Random Forest

This notebook demonstrates the use of a Random Forest Classifier to predict heart disease risk using patient data. The workflow follows the CRISP-DM methodology.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Global plotting look: white-grid background and a two-colour
# (green, blue) default palette for the figures drawn below.
default_colors = ["#2ca02c", "#1f77b4"]
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette(default_colors))

In [None]:
# Business understanding: state what the project is meant to achieve
def define_business_objectives():
    """Print the project's business objectives as headed bullet lists.

    Every objective is printed under a heading derived from its key
    (underscores become spaces, then title-cased), preceded by a blank
    line and followed by one ``- item`` line per value. Nothing is
    returned.
    """
    objectives = {
        'primary_goal': 'Predict heart disease presence',
        'target_users': 'Healthcare professionals',
        'success_metrics': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
        'expected_benefits': [
            'Early disease detection',
            'Improved patient outcomes',
            'Resource optimization'
        ]
    }

    for name, entry in objectives.items():
        heading = name.replace('_', ' ').title()
        print(f"\n{heading}:")
        # A lone value is treated as a one-item list so that scalars and
        # lists share a single printing path.
        bullets = entry if isinstance(entry, list) else [entry]
        for bullet in bullets:
            print(f"- {bullet}")


define_business_objectives()

In [None]:
# Load the dataset
df = pd.read_csv('heart_dataset.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
df.info()
print("\nBasic Statistics:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())

In [None]:
# Class balance of the target column
plt.figure(figsize=(8, 4))
# Recent seaborn releases deprecate passing `palette` without `hue`, so the
# x column is also mapped to hue. dodge=False keeps one full-width bar per
# class instead of splitting each x position into hue slots.
count_ax = sns.countplot(data=df, x='target', hue='target', palette='crest', dodge=False)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
count_legend = count_ax.get_legend()
if count_legend is not None:
    count_legend.remove()
plt.title('Distribution of Heart Disease Cases', fontsize=16)
plt.xlabel('Heart Disease Presence')
plt.ylabel('Count')
plt.show()

In [None]:
# Correlation matrix
plt.figure(figsize=(12, 8))
# Correlation is only defined for numeric columns, and recent pandas versions
# raise on non-numeric data instead of silently skipping it, so the frame is
# restricted to numeric dtypes before calling corr().
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix', fontsize=16)
plt.show()

In [None]:
# Distribution of numerical features, stacked by target class
# Every numeric dtype is considered (not just int64/float64). The target is
# left out: it is already the hue variable, and within each class it is
# constant, so its own histogram/KDE carries no information.
numerical_features = df.select_dtypes(include='number').columns.drop('target', errors='ignore')
num_features = len(numerical_features)
# Four plots per row, rounded up; at least one row so the figure height is never zero.
num_rows = max(1, (num_features + 3) // 4)

plt.figure(figsize=(15, 5 * num_rows))
for i, col in enumerate(numerical_features, start=1):
    plt.subplot(num_rows, 4, i)
    sns.histplot(data=df, x=col, hue='target', kde=True, multiple="stack", palette='crest')
    plt.title(f'Distribution of {col}', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Split the frame into the label vector (y) and the predictor matrix (X)
target_column = 'target'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Fill missing feature values with the mean of their column
# NOTE(review): the imputer is fitted on the full feature matrix, i.e. before
# the train/test split that happens later in the file.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X)
# fit_transform returns a plain array, so rebuild a frame with the original column names
X = pd.DataFrame(imputed_values, columns=X.columns)

In [None]:
# Split the data first so that the scaler never sees test rows while fitting.
# test_size and random_state are unchanged, so the same rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: learn mean/std from the training split only, then apply
# that same transform to the test split (avoids train/test leakage).
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Keep X as a scaled frame for any later code that reads it; this only applies
# the already-fitted transform and does not refit on the full data.
X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

In [None]:
# Random forest configuration, kept in one mapping so the settings are easy
# to read and tweak; random_state fixes the seed for reproducible runs.
rf_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

In [None]:
# 5-fold cross-validation of the configured model on the training split
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print("Cross-validation scores:", cv_scores)
# Summarise the folds as mean ± standard deviation
print(f"Mean CV score: {cv_mean:.3f} ± {cv_std:.3f}")

In [None]:
# Fit the final model on the whole training split
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the held-out test split
y_pred = rf_model.predict(X=X_test)

In [None]:
# Calculate metrics on the test split
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
# Print the matrix itself so the heading above is not left empty
print(conf_matrix)

In [None]:
# Draw the confusion matrix as a heat map, using the model's class labels on the axes
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=rf_model.classes_,
)
cm_display.plot(cmap='Greens')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Plot feature importance
importance = rf_model.feature_importances_
features = X.columns

# Rank features from most to least important so the chart reads top-down
ranked = pd.Series(importance, index=features).sort_values(ascending=False)
ranked_names = list(ranked.index)

plt.figure(figsize=(10, 6))
# Recent seaborn releases deprecate passing `palette` without `hue`, so the
# feature names are also mapped to hue. dodge=False keeps one full-width bar
# per feature.
importance_ax = sns.barplot(
    x=ranked.values,
    y=ranked_names,
    hue=ranked_names,
    palette='crest',
    dodge=False,
)
# The hue legend would only repeat the y tick labels; drop it if one was drawn.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()
plt.title('Feature Importance', fontsize=16)
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()