In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

### =========================================================================
### 1. LOAD AND MERGE DATA
### =========================================================================
# The first step is to combine all the separate data files into a single master dataframe.
# We'll use the 'PatID' column as the common key for merging.

print("Step 1: Loading and merging data...")

# Only the file reads can raise FileNotFoundError, so only they are guarded;
# any error raised by the merges below surfaces with its own traceback.
try:
    demographic_df = pd.read_csv('demographic_table.csv')
    patient_condition_df = pd.read_csv('patient_condition_table.csv')
    smoking_df = pd.read_csv('smoking_status_table.csv')
    alcohol_df = pd.read_csv('alcohol_use_table.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure all CSV files are in the same directory.")
    # Stop with a non-zero status. The bare `exit()` helper is meant for the
    # interactive shell and exits with status 0, which would report a missing
    # input file as a successful run.
    raise SystemExit(1) from e

# Merge sequentially on 'PatID'.
# - The condition table is INNER-joined: only patients present in both the
#   demographic and the condition table are kept.
# - The smoking and alcohol tables are LEFT-joined: patients without a record
#   in those tables are kept and get NaN in the corresponding columns.
df = pd.merge(demographic_df, patient_condition_df, on='PatID', how='inner')
df = pd.merge(df, smoking_df, on='PatID', how='left')
df = pd.merge(df, alcohol_df, on='PatID', how='left')

# Fill NaN values that resulted from the left merges.
# We assume that if a patient has no record in the smoking or alcohol tables,
# they do not engage in those activities.
# NOTE: 'Smoking_intensity ' really does end with a space -- that is how the
# column is named in the source file.
for lifestyle_column in ('Smoking_Status', 'Smoking_intensity ', 'HvyAlcoholConsump'):
    df[lifestyle_column] = df[lifestyle_column].fillna(0)

print("Data loaded and merged successfully!")
print(f"Final dataset shape: {df.shape}")
print("-" * 50)

### =========================================================================
### 2. DATA PREPARATION
### =========================================================================
# Here, we prepare the data for modeling. This involves defining our features
# and target, and splitting the data into training and testing sets.

print("Step 2: Preparing data for modeling...")

# 'PatID' only identifies a patient; it is not a predictive feature.
df = df.drop(columns='PatID')

# Name of the label column as it appears in the CSV file.
TARGET_VARIABLE = 'HeartDiseaseorAttack'

# Target vector (y) and feature matrix (X): every remaining column except the
# target is used as a feature.
y = df[TARGET_VARIABLE]
X = df.drop(columns=TARGET_VARIABLE)

# Report how many samples fall into each class to expose any imbalance.
class_counts = y.value_counts()
print("\nTarget Variable Distribution:", class_counts, sep="\n")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in the training and testing partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
print("-" * 50)


### =========================================================================
### 3. MODEL TRAINING & HYPERPARAMETER TUNING (RANDOM FOREST)
### =========================================================================
# We will use RandomizedSearchCV to find the best settings for our RandomForest model.

print("Step 3: Training Random Forest model and tuning hyperparameters...")

# Candidate values for each hyperparameter. The randomized search samples
# combinations from these lists rather than trying every one of them.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

# Base estimator. class_weight='balanced' weights each class inversely to its
# frequency, which compensates for the imbalanced target.
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Search configuration: 5 sampled candidates, each scored by ROC-AUC under
# 5-fold cross-validation, run on all available CPU cores.
search_settings = dict(
    n_iter=5,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    **search_settings,
)

# Run the search on the training partition only.
random_search.fit(X_train, y_train)

# Keep the estimator built with the best-scoring parameter combination.
best_model = random_search.best_estimator_

print("\nHyperparameter tuning complete!")
print("Best hyperparameters found:", random_search.best_params_, sep="\n")
print("-" * 50)


### =========================================================================
### 4. MODEL EVALUATION
### =========================================================================
# Finally, we evaluate the tuned model on the unseen test data to see how
# well it performs in predicting heart disease risk.

print("Step 4: Evaluating the best model on the test set...")

# Hard class predictions, plus the predicted probability of the second class
# (column 1 of predict_proba), which is used as the ranking score for ROC-AUC.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Print evaluation metrics
print("\nPERFORMANCE METRICS:")
print("="*20)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Heart Disease', 'Heart Disease']))

# confusion_matrix(y_true, y_pred) puts the TRUE classes on the rows and the
# PREDICTED classes on the columns, so each printed row is an actual class.
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print("(rows = actual class, columns = predicted class)")
print(f"[[{cm[0][0]:>5} {cm[0][1]:>5}]   <-- Actual No Heart Disease")
print(f" [{cm[1][0]:>5} {cm[1][1]:>5}]]  <-- Actual Heart Disease")
print("-" * 50)

# Display the most important features driving the model's predictions,
# ranked by the forest's feature_importances_ attribute.
print("Top 10 Most Important Features:")
feature_importances = pd.DataFrame(
    best_model.feature_importances_,
    index=X_train.columns,
    columns=['importance']
).sort_values('importance', ascending=False)
print(feature_importances.head(10))
print("-" * 50)

Step 1: Loading and merging data...
Data loaded and merged successfully!
Final dataset shape: (253688, 25)
--------------------------------------------------
Step 2: Preparing data for modeling...

Target Variable Distribution:
HeartDiseaseorAttack
0    229795
1     23893
Name: count, dtype: int64
Training set size: 202950 samples
Testing set size: 50738 samples
--------------------------------------------------
Step 3: Training Random Forest model and tuning hyperparameters...
Fitting 5 folds for each of 5 candidates, totalling 25 fits

Hyperparameter tuning complete!
Best hyperparameters found:
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': True}
--------------------------------------------------
Step 4: Evaluating the best model on the test set...

PERFORMANCE METRICS:
Accuracy: 0.7427
ROC-AUC Score: 0.8220

Classification Report:
                  precision    recall  f1-score   support

No Heart Disease   

In [None]:
### =========================================================================
### 5. SAVE THE MODEL
### =========================================================================
# This final step saves the trained model to a file for deployment.
# We use joblib as it is efficient for saving scikit-learn models.
import joblib
print("Step 5: Saving the trained model...")

# Serialize the tuned estimator to disk so it can be loaded again later
# without retraining.
model_filename = 'heart_disease_model.pkl'
joblib.dump(value=best_model, filename=model_filename)

print(f"Model saved successfully as '{model_filename}'", "-" * 50, sep="\n")

Step 5: Saving the trained model...
Model saved successfully as 'heart_disease_model.pkl'
--------------------------------------------------
