# Heart Attack Prediction Notebook
This notebook implements the full workflow for predicting heart attack risk using the US Heart Patients dataset.

- **Pipeline:** EDA → preprocessing → Decision Tree (GridSearchCV) → evaluation → SHAP → artifacts
- **Random seed:** 13

## 1. Data Loading

In [None]:
%pip install numpy

import pandas as pd
import numpy as np
# Seed NumPy's global RNG; RANDOM_SEED is also passed explicitly to the
# scikit-learn splitter and classifier further down.
RANDOM_SEED = 13
np.random.seed(RANDOM_SEED)

# Read the raw patient records (path is relative to the kernel's working directory).
df = pd.read_csv('../data/US_Heart_Patients.csv')

# Quick sanity check: (rows, columns) followed by the column names.
df.shape, list(df.columns)

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Preview the first 10 rows to sanity-check the parsed columns and values
df.head(10)

In [None]:
# Summary statistics for each numeric column (count, mean, std, min, quartiles, max),
# transposed so that every feature is shown as one row
df.describe().T

In [None]:
# Column info: dtype and non-null count for every column
df.info()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Count, per numeric column, the values lying outside the 1.5 * IQR fences.
numeric_frame = df.select_dtypes(include=[np.number])
outlier_counts = {}
for name, values in numeric_frame.items():
    q_low, q_high = values.quantile(0.25), values.quantile(0.75)
    spread = q_high - q_low
    below_fence = values < q_low - 1.5 * spread
    above_fence = values > q_high + 1.5 * spread
    outlier_counts[name] = (below_fence | above_fence).sum()
outlier_counts

In [None]:
# Pairwise correlation matrix of the numeric columns; non-numeric columns are
# excluded via numeric_only=True. `corr` is reused by the heatmap cell below.
corr = df.corr(numeric_only=True)
corr

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Distribution plots: one histogram (with KDE overlay) per numeric column, three per row.
num_cols = df.select_dtypes(include=[np.number]).columns
n_plot_cols = 3
# Ceiling division yields exactly the rows needed (no spare empty row when the
# column count is a multiple of 3); max(..., 1) keeps plt.subplots valid.
n_plot_rows = max((len(num_cols) + n_plot_cols - 1) // n_plot_cols, 1)
# Derive the figure height from the row count (4 inches per row) so the panels
# keep a constant size; the previous `4*len(num_cols)//3` parsed as
# `(4*len(num_cols))//3` and did not match the number of rows.
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(15, 4 * n_plot_rows))
axes = axes.flatten()
for i, col in enumerate(num_cols):
    sns.histplot(df[col], kde=True, ax=axes[i])
    axes[i].set_title(f'Distribution of {col}')
# Hide leftover axes in the last row so they do not render as empty frames.
for unused_ax in axes[len(num_cols):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Boxplots for outliers: one panel per numeric column, three per row.
# `num_cols` comes from the distribution-plot cell above.
n_box_cols = 3
# Ceiling division yields exactly the rows needed; max(..., 1) keeps plt.subplots valid.
n_box_rows = max((len(num_cols) + n_box_cols - 1) // n_box_cols, 1)
# Figure height follows the row count (4 inches per row); the previous
# `4*len(num_cols)//3` parsed as `(4*len(num_cols))//3` and did not match it.
fig, axes = plt.subplots(n_box_rows, n_box_cols, figsize=(15, 4 * n_box_rows))
axes = axes.flatten()
for i, col in enumerate(num_cols):
    sns.boxplot(x=df[col], ax=axes[i])
    axes[i].set_title(f'Boxplot of {col}')
# Hide leftover axes in the last row so they do not render as empty frames.
for unused_ax in axes[len(num_cols):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate the feature matrix from the target column.
target_col = 'Heart-Att'
X = df.drop(columns=[target_col])
y = df[target_col]

# Encode the target whenever it is not already numeric. Checking for "not numeric"
# (rather than only `object`/`category`) also covers pandas string dtypes, which
# would otherwise reach the classifier unencoded and break `scoring='f1'`.
# NOTE(review): `astype(str)` turns a missing target into its string form, which
# becomes a separate class — confirm the target has no missing values.
if not pd.api.types.is_numeric_dtype(y):
    y = LabelEncoder().fit_transform(y.astype(str))

# Split the feature columns by dtype: numeric vs. everything else (treated as categorical).
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer; this object is later
# used as the first step of the model pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [None]:
# Outlier capping (Winsorization)
def cap_outliers(series):
    """Clip a numeric Series to the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are replaced by
    the corresponding fence; values inside the fences are left untouched.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap.

    Returns
    -------
    pandas.Series
        A new Series; the input is not modified. When the IQR is zero or
        undefined (empty / all-missing input), the data is returned unchanged.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    # With a zero IQR (e.g. a binary indicator where at least 75% of rows share
    # one value) both fences collapse onto that value and clipping would
    # overwrite every observation with a constant, destroying the feature.
    # `not iqr > 0` also covers a NaN IQR.
    if not iqr > 0:
        return series.copy()
    return series.clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

# Cap every numeric feature column of X, overwriting the column in place.
# NOTE(review): the capping bounds are computed on the full dataset, before the
# train/test split performed later in this notebook, so test-set values
# influence the bounds — consider deriving them from the training split only.
for col in numeric_features:
    X[col] = cap_outliers(X[col])

## 4. Data Splitting

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; stratify on the target so both splits
# receive the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_SEED,
)
X_train.shape, X_test.shape

## 5. Model Preparation & Evaluation

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Chain preprocessing and the decision tree into a single estimator, so the
# grid search fits the preprocessing together with the classifier.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=RANDOM_SEED)),
])

# Tree-size hyperparameters to search; the `classifier__` prefix addresses the
# pipeline step of that name.
param_grid = dict(
    classifier__max_depth=[3, 5, 7, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search, selecting by F1 and using all cores.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Best pipeline found by the search.
best_model = grid.best_estimator_

In [None]:
# Evaluation
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)
print('F1 Score (Train):', f1_score(y_train, y_pred_train))
print('F1 Score (Test):', f1_score(y_test, y_pred_test))
print('Confusion Matrix (Test):', confusion_matrix(y_test, y_pred_test))
print('Classification Report (Test):', classification_report(y_test, y_pred_test))

### Model Summary
- The best Decision Tree model was selected using GridSearchCV with 5-fold cross-validation, scored by F1.
- Hyperparameters tuned: max_depth, min_samples_split, min_samples_leaf.
- The F1 score is reported for both the training and test sets; the confusion matrix and classification report are reported for the test set.
- Feature importances and further analysis can be added as needed.

## 6. Model Artifacts

In [None]:
from pathlib import Path

import joblib

# Single source of truth for the artifact location (used for both the dump and the message).
MODEL_PATH = '../models/decision_tree_model.pkl'

# Create the target directory if it does not exist yet; joblib.dump does not
# create missing parent directories and would otherwise fail.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the full fitted pipeline (preprocessing + classifier).
joblib.dump(best_model, MODEL_PATH)
print(f'✅ Model saved to {MODEL_PATH}')

## 7. Inference API (see src/app.py)
A Flask API is provided in `src/app.py` to serve the trained model for real-time predictions.

- Endpoint: `/predict` (POST)
- Input: Patient data as JSON
- Output: Prediction (risk of heart attack)

See the Python file for implementation details.