In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Stage banner: rule, title, rule — emitted with a single print call.
print("=" * 50, "STEP 1: Loading Data", "=" * 50, sep="\n")

# Read the cleaned hypertension CSV from the current working directory.
df = pd.read_csv('hypertension_dataset_cleaned.csv')

# Quick sanity check: overall dimensions plus the first three rows.
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
print(df.head(3))

STEP 1: Loading Data
Dataset shape: (1985, 11)
   Age  Salt_Intake  Stress_Score BP_History  Sleep_Duration   BMI Medication  \
0   69          8.0             9     Normal             6.4  25.8        NaN   
1   32         11.7            10     Normal             5.4  23.4        NaN   
2   78          9.5             3     Normal             7.1  18.7        NaN   

  Family_History Exercise_Level Smoking_Status Has_Hypertension  
0            Yes            Low     Non-Smoker              Yes  
1             No            Low     Non-Smoker               No  
2             No       Moderate     Non-Smoker               No  


In [3]:
# Stage banner (leading blank line separates it from the previous stage).
print("\n" + "=" * 50, "STEP 2: Handling Missing Values", "=" * 50, sep="\n")

# Missing Medication entries are replaced with the explicit category 'None'
# so the column can be label-encoded later as an ordinary string category.
df = df.fillna({'Medication': 'None'})

# Total count of remaining missing cells across the whole frame.
remaining_missing = df.isna().sum().sum()
print(f"Missing values after fix: {remaining_missing}")


STEP 2: Handling Missing Values
Missing values after fix: 0


In [4]:
# Stage banner.
print("\n" + "=" * 50, "STEP 3: Encoding Categorical Columns", "=" * 50, sep="\n")

categorical_cols = ['BP_History', 'Medication', 'Family_History',
                    'Exercise_Level', 'Smoking_Status']

# One independent LabelEncoder per categorical feature, keyed by column name,
# so the same string -> integer mapping can be reused later.
encoders = {name: LabelEncoder() for name in categorical_cols}

# NOTE: each column is overwritten in place with its integer codes, so this
# cell must run exactly once on a freshly loaded frame. Re-running it would
# fit the encoders on the stringified codes instead of the original labels.
for name, encoder in encoders.items():
    df[name] = encoder.fit_transform(df[name].astype(str))
    print(f"  {name}: {list(encoder.classes_)}")

# The target gets its own encoder so predictions can be mapped back to labels.
target_encoder = LabelEncoder()
target_as_text = df['Has_Hypertension'].astype(str)
df['Has_Hypertension'] = target_encoder.fit_transform(target_as_text)

target_labels = list(target_encoder.classes_)
target_codes = list(range(len(target_labels)))
print(f"  Target: {target_labels} → {target_codes}")



STEP 3: Encoding Categorical Columns
  BP_History: ['Hypertension', 'Normal', 'Prehypertension']
  Medication: ['ACE Inhibitor', 'Beta Blocker', 'Diuretic', 'None', 'Other']
  Family_History: ['No', 'Yes']
  Exercise_Level: ['High', 'Low', 'Moderate']
  Smoking_Status: ['Non-Smoker', 'Smoker']
  Target: ['No', 'Yes'] → [0, 1]


In [5]:
# Stage banner.
print("\n" + "=" * 50, "STEP 4: Splitting Features and Target", "=" * 50, sep="\n")

# Everything except the label column is a model input; the label column is y.
target_column = 'Has_Hypertension'
y = df[target_column]
X = df.drop(columns=[target_column])

feature_names = list(X.columns)
class_counts = y.value_counts()
print(f"Features: {feature_names}")
print(f"Target distribution:\n{class_counts}")


STEP 4: Splitting Features and Target
Features: ['Age', 'Salt_Intake', 'Stress_Score', 'BP_History', 'Sleep_Duration', 'BMI', 'Medication', 'Family_History', 'Exercise_Level', 'Smoking_Status']
Target distribution:
Has_Hypertension
1    1032
0     953
Name: count, dtype: int64


In [6]:
# Stage banner.
print("\n" + "=" * 50, "STEP 5: Train/Test Split (80/20)", "=" * 50, sep="\n")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training samples: {n_train}")
print(f"Testing samples:  {n_test}")


STEP 5: Train/Test Split (80/20)
Training samples: 1588
Testing samples:  397


In [7]:
# Stage banner.
print("\n" + "=" * 50, "STEP 6: Scaling Features", "=" * 50, sep="\n")

# Fit the scaler on the training partition only, then apply that same fitted
# transform to both partitions so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features scaled using StandardScaler.")


STEP 6: Scaling Features
Features scaled using StandardScaler.


In [8]:
# Stage banner.
print("\n" + "=" * 50, "STEP 7: Training Random Forest Model")
print("=" * 50)

# Hyperparameters gathered in one place: a shallow forest (depth 5) with a
# large minimum leaf size, and a fixed seed for repeatable training.
forest_params = {
    'n_estimators': 200,
    'max_depth': 5,
    'min_samples_leaf': 28,
    'random_state': 42,
}

model = RandomForestClassifier(**forest_params)
model.fit(X_train_scaled, y_train)
print("Model training complete!")


STEP 7: Training Random Forest Model
Model training complete!


In [9]:
print("\n" + "=" * 50)
print("STEP 8: Model Evaluation")
print("=" * 50)

# Predict once per partition and reuse the results below; previously the test
# set was re-predicted separately for the accuracy, the classification report
# and the confusion matrix.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc  = accuracy_score(y_test,  y_test_pred)

# The train/test gap is printed as a quick overfitting indicator.
print(f"\nTrain Accuracy: {train_acc * 100:.2f}%")
print(f"Test Accuracy:  {test_acc  * 100:.2f}%")
print(f"Difference:     {(train_acc - test_acc) * 100:.2f}%")

# Per-class precision/recall/F1, labelled with the original target strings.
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=target_encoder.classes_))

# Rows are true classes, columns are predicted classes.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

# Feature importances from the fitted forest, indexed by feature name.
importances = pd.Series(model.feature_importances_, index=X.columns)
print("\nTop Feature Importances:")
print(importances.sort_values(ascending=False))


STEP 8: Model Evaluation

Train Accuracy: 94.02%
Test Accuracy:  92.95%
Difference:     1.07%

Classification Report:
              precision    recall  f1-score   support

          No       0.91      0.95      0.93       191
         Yes       0.95      0.91      0.93       206

    accuracy                           0.93       397
   macro avg       0.93      0.93      0.93       397
weighted avg       0.93      0.93      0.93       397

Confusion Matrix:
[[182   9]
 [ 19 187]]

Top Feature Importances:
BP_History        0.453891
Family_History    0.105736
Stress_Score      0.104706
Age               0.104182
Smoking_Status    0.075481
BMI               0.057588
Sleep_Duration    0.053501
Salt_Intake       0.040836
Medication        0.003089
Exercise_Level    0.000989
dtype: float64


In [10]:
print("\n" + "=" * 50)
print("STEP 9: Saving PKL Files")
print("=" * 50)

output_dir = './'
os.makedirs(output_dir, exist_ok=True)

# File name -> object to persist. Together these are the fitted model, the
# fitted preprocessing objects, and the training-time feature column order.
artifacts = {
    'model.pkl': model,
    'scaler.pkl': scaler,
    'encoders.pkl': encoders,
    'target_encoder.pkl': target_encoder,
    'feature_columns.pkl': list(X.columns),
}

for filename, artifact in artifacts.items():
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises; the bare open(...) calls used before were never closed.
    with open(os.path.join(output_dir, filename), 'wb') as handle:
        pickle.dump(artifact, handle)
    print(f"  ✓ {filename} saved")

print("\n" + "=" * 50)
print("ALL DONE!")
print("=" * 50)



STEP 9: Saving PKL Files
  ✓ model.pkl saved
  ✓ scaler.pkl saved
  ✓ encoders.pkl saved
  ✓ target_encoder.pkl saved
  ✓ feature_columns.pkl saved

ALL DONE!
