In [1]:
# STEP 1: IMPORT LIBRARIES
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# STEP 2: LOAD AND INSPECT THE HEART DISEASE DATA
# Relative path: resolved against the current working directory.
file_path = '../data/heart_disease.csv'
df = pd.read_csv(file_path)

# Preview the first rows to check column names and typical values.
print("\n--- First 5 rows of the heart disease dataset ---")
print(df.head())

# Count nulls per column once, then reuse that Series for the grand total.
missing_per_column = df.isnull().sum()
print("\n--- Missing Values ---")
print(missing_per_column)
print(f"Total missing values: {missing_per_column.sum()}")


--- First 5 rows of the heart disease dataset ---
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  

--- Missing Values ---
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
Total missing values: 0


In [3]:
# STEP 3: SEPARATE FEATURES (X) AND TARGET (y)
# 'target' is the label column; every remaining column is kept as a feature.
y = df['target']
X = df.drop(columns=['target'])

In [4]:
# STEP 4: SPLIT DATA INTO TRAINING AND TESTING SETS
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# STEP 5: FEATURE SCALING
# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# STEP 6: TRAIN THE MODEL
print("\n--- Training the heart disease model... ---")
# 100 trees with a fixed seed; .fit() returns the fitted estimator itself,
# so construction and training can be chained into one assignment.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)
print("Model training complete!")


--- Training the heart disease model... ---
Model training complete!


In [None]:
# STEP 7: EVALUATE THE MODEL
# Predict on the held-out (scaled) test features and score against y_test.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\n--- Heart Disease Model Performance ---")
print(f"Accuracy: {accuracy * 100:.2f}%")
# Two positional arguments on purpose: print() joins them with a single space.
print("\nClassification Report:\n", report)

In [7]:
# STEP 8: SAVE THE NEW MODEL AND SCALER
# The fitted scaler is persisted next to the model so the exact same
# transformation can be re-applied to new inputs before prediction.
model_path = '../saved_models/heart_model.joblib'
scaler_path = '../saved_models/heart_scaler.joblib'

# joblib.dump does not create missing directories; make sure the output
# folder exists so a fresh checkout does not fail here after training.
for artifact_path in (model_path, scaler_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)

print(f"\nHeart disease model saved to: {model_path}")
print(f"Heart disease scaler saved to: {scaler_path}")


Heart disease model saved to: ../saved_models/heart_model.joblib
Heart disease scaler saved to: ../saved_models/heart_scaler.joblib
