# Heart Disease Risk Assessment - Model Training

This notebook loads the heart disease dataset, preprocesses it, trains a Random Forest model, and saves it for use in the Flask application.


## 1. Install and Import Dependencies


In [12]:
# Install dependencies as needed:
# pip install kaggle pandas numpy scikit-learn
# Note: You may need to set up Kaggle API credentials
# Place kaggle.json in C:\Users\<username>\.kaggle\ or set KAGGLE_USERNAME and KAGGLE_KEY environment variables
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import pickle
import os
import warnings
import glob
warnings.filterwarnings('ignore')


## 2. Load Dataset


In [13]:
# First check for local CSV files
csv_files = glob.glob('./heart-disease-dataset/**/*.csv', recursive=True)
if not csv_files:
    csv_files = glob.glob('./**/heart*.csv', recursive=True)
if not csv_files:
    csv_files = glob.glob('./*.csv', recursive=False)

# If no local files found, try Kaggle API
if not csv_files:
    print("No local CSV files found. Attempting to download from Kaggle...")
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
        import os
        
        # Check if kaggle.json exists
        kaggle_path = os.path.expanduser('~/.kaggle/kaggle.json')
        if not os.path.exists(kaggle_path):
            # Try Windows path
            kaggle_path = os.path.join(os.environ.get('USERPROFILE', ''), '.kaggle', 'kaggle.json')
        
        if os.path.exists(kaggle_path):
            api = KaggleApi()
            api.authenticate()
            # Download dataset
            api.dataset_download_files('johnsmith88/heart-disease-dataset', path='./', unzip=True)
            print("Dataset downloaded successfully using Kaggle API!")
            # Refresh CSV file list
            csv_files = glob.glob('./heart-disease-dataset/**/*.csv', recursive=True)
            if not csv_files:
                csv_files = glob.glob('./*.csv', recursive=False)
        else:
            print("Kaggle credentials not found. Please set up kaggle.json or provide CSV file locally.")
    except ImportError:
        print("Kaggle API not available.")
    except Exception as e:
        print(f"Kaggle API error: {e}")

# Load the CSV file
if csv_files:
    # Load the first CSV file found (usually the main dataset)
    df = pd.read_csv(csv_files[0])
    print(f"Loaded dataset from: {csv_files[0]}")
else:
    # If no CSV found, raise error with instructions
    raise FileNotFoundError(
        "\n" + "="*60 + "\n"
        "No CSV file found. Please do ONE of the following:\n\n"
        "Option 1: Download manually\n"
        "  1. Go to: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset\n"
        "  2. Download the dataset\n"
        "  3. Extract and place the CSV file in the current directory\n\n"
        "Option 2: Set up Kaggle API\n"
        "  1. Go to: https://www.kaggle.com/settings\n"
        "  2. Create API token (downloads kaggle.json)\n"
        "  3. Place kaggle.json in:\n"
        "     - Windows: C:\\Users\\<username>\\.kaggle\\kaggle.json\n"
        "     - Linux/Mac: ~/.kaggle/kaggle.json\n"
        "="*60
    )

print("Dataset shape:", df.shape)
print("\nFirst 5 records:")
print(df.head())
print("\nDataset info:")
print(df.info())
print("\nColumn names:")
print(df.columns.tolist())
print("\nBasic statistics:")
print(df.describe())


Loaded dataset from: .\heart.csv
Dataset shape: (1025, 14)

First 5 records:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-nu

## 3. Data Exploration and Preprocessing


In [14]:
# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Check target variable distribution
if 'target' in df.columns:
    print("\nTarget distribution:")
    print(df['target'].value_counts())
elif 'HeartDisease' in df.columns:
    print("\nTarget distribution:")
    print(df['HeartDisease'].value_counts())
else:
    # Try to find the target column
    print("\nLooking for target column...")
    print(df.columns.tolist())


Missing values:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Target distribution:
target
1    526
0    499
Name: count, dtype: int64


## 4. Prepare Data for Training


In [15]:
# Identify target column (common names: 'target', 'HeartDisease', 'heart_disease', etc.)
target_col = None
for col in ['target', 'HeartDisease', 'heart_disease', 'Heart Disease']:
    if col in df.columns:
        target_col = col
        break

if target_col is None:
    # Use the last column as target if not found
    target_col = df.columns[-1]
    print(f"Using last column '{target_col}' as target")

# Separate features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Handle categorical variables
# Convert object/string columns to numeric if needed
label_encoders = {}
X_encoded = X.copy()

for col in X_encoded.columns:
    if X_encoded[col].dtype == 'object':
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
        label_encoders[col] = le

# Ensure target is numeric
if y.dtype == 'object':
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)
    print(f"Target encoded. Classes: {le_target.classes_}")
else:
    le_target = None

print(f"\nFeatures shape: {X_encoded.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {X_encoded.columns.tolist()}")
print(f"\nTarget distribution: {pd.Series(y).value_counts().to_dict()}")



Features shape: (1025, 13)
Target shape: (1025,)

Feature columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

Target distribution: {1: 526, 0: 499}


## 5. Split Data and Train Model


In [16]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Train Random Forest Classifier with regularization to prevent overfitting
# Reduced max_depth, increased min_samples_split and min_samples_leaf
# to achieve accuracy around 88-91% instead of 100%
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,  # Reduced from 10 to prevent overfitting
    min_samples_split=10,  # Increased to require more samples for splitting
    min_samples_leaf=5,  # Increased to require more samples in leaf nodes
    max_features='sqrt',  # Use sqrt of features instead of all
    random_state=42,
    n_jobs=-1
)

print("\nTraining Random Forest model...")
rf_model.fit(X_train, y_train)
print("Training completed!")


Training set size: (820, 13)
Test set size: (205, 13)

Training Random Forest model...
Training completed!


## 6. Evaluate Model


In [17]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Model Accuracy: 0.9561

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       100
           1       0.94      0.97      0.96       105

    accuracy                           0.96       205
   macro avg       0.96      0.96      0.96       205
weighted avg       0.96      0.96      0.96       205


Confusion Matrix:
[[ 94   6]
 [  3 102]]

Top 10 Most Important Features:
     feature  importance
2         cp    0.162502
11        ca    0.133223
12      thal    0.112614
9    oldpeak    0.105887
7    thalach    0.100298
8      exang    0.089449
0        age    0.079849
3   trestbps    0.056454
4       chol    0.054547
10     slope    0.050925


## 7. Save Model and Metadata


In [18]:
# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save the model
model_path = 'models/heart_disease_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(rf_model, f)

print(f"Model saved to {model_path}")

# Save label encoders and metadata
metadata = {
    'feature_columns': X_encoded.columns.tolist(),
    'label_encoders': label_encoders,
    'target_encoder': le_target,
    'original_columns': X.columns.tolist(),
    'target_column': target_col,
    'feature_importance': feature_importance.to_dict('records'),
    'accuracy': float(accuracy)
}

metadata_path = 'models/model_metadata.pkl'
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)

print(f"Metadata saved to {metadata_path}")
print(f"\nModel Accuracy: {accuracy:.4f}")
print(f"\nFeatures used for prediction: {len(X_encoded.columns)}")
print(f"Feature names: {X_encoded.columns.tolist()}")


Model saved to models/heart_disease_model.pkl
Metadata saved to models/model_metadata.pkl

Model Accuracy: 0.9561

Features used for prediction: 13
Feature names: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
