# Heart Disease Prediction using Random Forest

This notebook demonstrates the use of a Random Forest Classifier to predict heart disease risk using patient data. The workflow follows the CRISP-DM methodology.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Set style for visualizations.
# These are global seaborn settings: the 'darkgrid' style and the 'husl'
# colour palette apply to the plots drawn by the cells below.
sns.set_style('darkgrid') 
sns.set_palette('husl')

In [None]:
def define_business_objectives():
    """Print the project's business objectives as a bulleted outline.

    Every objective is shown under a heading derived from its dictionary key
    (underscores become spaces, words are title-cased). A list value yields
    one bullet per entry; any other value yields a single bullet.

    Returns:
        None. The outline is written to standard output.
    """
    objectives = {
        'primary_goal': 'Predict heart disease presence',
        'target_users': 'Healthcare professionals',
        'success_metrics': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
        'expected_benefits': [
            'Early disease detection',
            'Improved patient outcomes',
            'Resource optimization',
        ],
    }

    report_lines = []
    for name, details in objectives.items():
        heading = name.replace('_', ' ').title()
        # The leading newline leaves a blank line above every heading.
        report_lines.append(f"\n{heading}:")
        # Treat a scalar as a one-element list so both cases share one path.
        bullets = details if isinstance(details, list) else [details]
        report_lines.extend(f"- {entry}" for entry in bullets)

    # Joining with newlines and printing once emits the same text as printing
    # each line separately.
    print("\n".join(report_lines))


define_business_objectives()

In [None]:
# Load the dataset
df = pd.read_csv('heart_dataset.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nBasic Statistics:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())

# Class balance of the target variable
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='target')
plt.title('Distribution of Heart Disease Cases')
plt.show()

# Correlation matrix
# Only numeric columns are correlated: DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, and selecting them explicitly works on every version.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.show()

# Distribution of numerical features
# 'number' matches every numeric dtype (int32, float32, ...), not just the
# 64-bit ones, so no numeric column is silently left out of the grid.
numerical_features = numeric_df.columns
num_features = len(numerical_features)
plots_per_row = 4
# Ceiling division: enough rows to hold every feature at 4 plots per row.
num_rows = (num_features + plots_per_row - 1) // plots_per_row

plt.figure(figsize=(15, 5 * num_rows))  # Figure height grows with the number of rows
for i, col in enumerate(numerical_features):
    plt.subplot(num_rows, plots_per_row, i + 1)  # Subplot positions are 1-based
    sns.histplot(data=df, x=col, hue='target', multiple="stack")
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
# Separate features and target.
# X keeps the raw (unimputed, unscaled) features; only the train/test frames
# below are transformed.
X = df.drop(columns=['target'])
y = df['target']

# Split the data BEFORE fitting any preprocessing step. Fitting the imputer
# and scaler on the full dataset would let test-set statistics (column means
# and variances) leak into the training features and bias the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: the column means are learned from the training rows
# only and then re-used to fill the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=X.columns, index=X_test.index
)

# Scale features: mean/std are estimated on the training rows only.
# The original row index is preserved so the features stay aligned with
# y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

In [None]:
# Hyperparameters of the random forest, collected in one place
rf_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)

# 5-fold cross-validation on the training split
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
print("Cross-validation scores:", cv_scores)
print(f"Mean CV score: {mean_cv_score:.3f} ± {std_cv_score:.3f}")

# Fit the final model on the whole training split
rf_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split
y_pred = rf_model.predict(X_test)

# Compute the evaluation metrics, then report them
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:")
print(report_text)
print("\nConfusion Matrix:")
print(conf_matrix)

# Bar chart of the fitted forest's feature importances, one bar per feature
features = X.columns
importance = rf_model.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importance, y=features, ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance Score')
plt.show()