# Healthcare - Heart Disease Prediction

This notebook demonstrates machine learning for predicting heart disease based on patient characteristics and medical measurements.

## Objectives:
1. Load and explore the Heart Disease UCI dataset
2. Perform comprehensive EDA with clinical context
3. Engineer features based on medical knowledge
4. Build and compare multiple ML models
5. Evaluate and interpret results in clinical context


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, roc_curve)
import warnings
# Silence every warning for the rest of the session to keep cell output tidy.
# NOTE(review): this blanket filter also hides any library warnings raised by
# later cells — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" style and a (12, 6) default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


## 1. Data Loading

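We'll work with the schema of the Heart Disease UCI dataset, which contains 14 attributes including demographics, medical history, and test results. To keep the notebook self-contained, the cell below generates a synthetic stand-in with the same column names and similar value ranges; for a real analysis, download the actual data from the UCI ML Repository.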


In [None]:
# Build a synthetic stand-in for the Heart Disease UCI data.
# Note: in practice, download the real dataset from the UCI ML Repository
# URL: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

np.random.seed(42)  # fixed seed so every run draws the same sample
n_samples = 303

# Column order matters: each draw advances the global NumPy RNG, so the
# columns must be generated in exactly this sequence to reproduce the data.
# Reminder: the upper bound of np.random.randint is exclusive.
data = dict(
    age=np.random.randint(29, 78, n_samples),
    sex=np.random.choice([0, 1], n_samples),
    cp=np.random.choice([0, 1, 2, 3], n_samples),
    trestbps=np.random.randint(94, 200, n_samples),
    chol=np.random.randint(126, 564, n_samples),
    fbs=np.random.choice([0, 1], n_samples),
    restecg=np.random.choice([0, 1, 2], n_samples),
    thalach=np.random.randint(71, 202, n_samples),
    exang=np.random.choice([0, 1], n_samples),
    oldpeak=np.random.uniform(0, 6.2, n_samples),
    slope=np.random.choice([0, 1, 2], n_samples),
    ca=np.random.choice([0, 1, 2, 3], n_samples),
    thal=np.random.choice([1, 2, 3], n_samples),
)

df = pd.DataFrame(data)

# Synthetic risk score: a 0.1 baseline, plus a fixed weight for every risk
# flag that is set, plus Gaussian noise. Terms are accumulated strictly left
# to right so the floating-point result is reproducible.
risk_terms = [
    (0.15, df['age'] > 60),
    (0.1, df['sex'] == 1),
    (0.2, df['cp'] >= 2),
    (0.15, df['trestbps'] > 140),
    (0.15, df['chol'] > 240),
    (0.1, df['exang'] == 1),
    (0.15, df['oldpeak'] > 1.5),
    (0.1, df['thalach'] < 120),
]
heart_disease_prob = 0.1
for weight, flag in risk_terms:
    heart_disease_prob = heart_disease_prob + weight * flag
heart_disease_prob = heart_disease_prob + np.random.normal(0, 0.1, n_samples)

# A patient is labelled positive when the noisy score exceeds 0.5.
df['target'] = (heart_disease_prob > 0.5).astype(int)

print(f"Dataset shape: {df.shape}")
df.head()


## 2. Exploratory Data Analysis (EDA)

In [None]:
# Basic info and target distribution
print(f"Shape: {df.shape}")
# The leading "\n" must stay an escape sequence: a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\nTarget distribution:")
print(df['target'].value_counts(normalize=True))

# value_counts() orders by frequency, so sort by class label to guarantee the
# bars (and their colours) always appear as 0 then 1, matching the x-axis label.
plt.figure(figsize=(8, 5))
df['target'].value_counts().sort_index().plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Heart Disease Distribution')
plt.xlabel('Target (0=No Disease, 1=Disease)')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

## 3. Feature Engineering

In [None]:
# Create age groups and clinical thresholds
df_processed = df.copy()  # work on a copy so the raw frame stays untouched

# Age bands; pd.cut intervals are right-closed by default, e.g. (40, 50].
df_processed['age_group'] = pd.cut(
    df_processed['age'],
    bins=[0, 40, 50, 60, 100],
    labels=['<40', '40-50', '50-60', '>60'],
)

# Binary flags for measurements beyond fixed cut-offs.
df_processed['high_bp'] = (df_processed['trestbps'] > 140).astype(int)
df_processed['high_chol'] = (df_processed['chol'] > 240).astype(int)
df_processed['low_hr'] = (df_processed['thalach'] < 120).astype(int)
df_processed['high_st_depression'] = (df_processed['oldpeak'] > 1.5).astype(int)

# One-hot encode the categorical columns; drop_first removes one level per
# column to avoid perfectly collinear indicator columns.
df_processed = pd.get_dummies(
    df_processed,
    columns=['cp', 'restecg', 'slope', 'thal', 'age_group'],
    drop_first=True,
)

X = df_processed.drop('target', axis=1)
y = df_processed['target']

# Stratified 80/20 split keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training data only, then apply it to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-wrap the scaled arrays as DataFrames. Carry over the original row index
# so the scaled features stay label-aligned with y_train / y_test; a default
# RangeIndex would silently misalign any later concat/join/mask against them.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

## 4. Machine Learning Models

In [None]:
# Train models
def _fit_and_predict(estimator, train_features, test_features):
    """Fit ``estimator`` on the training split and predict on the test split.

    Uses the notebook-level ``y_train`` as the training labels.

    Returns:
        A ``(labels, positive_proba)`` tuple: hard class predictions and the
        predicted probability of class 1 for every row of ``test_features``.
    """
    estimator.fit(train_features, y_train)
    labels = estimator.predict(test_features)
    positive_proba = estimator.predict_proba(test_features)[:, 1]
    return labels, positive_proba


# Despite its name, `models` maps a short model label to that model's
# (predictions, probabilities) pair, not to the fitted estimator.
models = {}
predictions = {}  # declared but never filled in this cell

# Logistic Regression (trained on the standardized features)
lr = LogisticRegression(random_state=42, max_iter=1000)
y_pred_lr, y_pred_proba_lr = _fit_and_predict(lr, X_train_scaled, X_test_scaled)
models['LR'] = (y_pred_lr, y_pred_proba_lr)

# Random Forest (trained on the unscaled features)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf, y_pred_proba_rf = _fit_and_predict(rf, X_train, X_test)
models['RF'] = (y_pred_rf, y_pred_proba_rf)

# SVM (trained on the standardized features; probability=True enables predict_proba)
svm = SVC(probability=True, random_state=42)
y_pred_svm, y_pred_proba_svm = _fit_and_predict(svm, X_train_scaled, X_test_scaled)
models['SVM'] = (y_pred_svm, y_pred_proba_svm)

# Gradient Boosting (trained on the unscaled features)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
y_pred_gb, y_pred_proba_gb = _fit_and_predict(gb, X_train, X_test)
models['GB'] = (y_pred_gb, y_pred_proba_gb)

# Evaluate: one row of test-set metrics per model
results = []
for name, (y_pred, y_pred_proba) in models.items():
    row = {'Model': name}
    row['Accuracy'] = accuracy_score(y_test, y_pred)
    row['Precision'] = precision_score(y_test, y_pred)
    row['Recall'] = recall_score(y_test, y_pred)
    row['F1'] = f1_score(y_test, y_pred)
    # ROC-AUC is computed from the class-1 probabilities, not the hard labels.
    row['ROC-AUC'] = roc_auc_score(y_test, y_pred_proba)
    results.append(row)
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

## 5. Summary

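Key findings and clinical implications are discussed in the accompanying tutorial. Note that the metrics above were computed on synthetically generated data, so they illustrate the workflow rather than real clinical performance.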