In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the lifestyle dataset into `df`. The CSV path is relative to the
# notebook's working directory.
print("Loading lifestyle dataset...")
lifestyle_csv = '../data/lifestyle_data.csv'
df = pd.read_csv(lifestyle_csv)

# Print the first rows as plain text for a quick sanity check of the columns.
preview = df.head()
print("\n--- Dataset Preview ---")
print(preview)

Loading lifestyle dataset...

--- Dataset Preview ---
   age        bmi  servings_veg_fruit_daily  servings_processed_food_weekly  \
0   42  38.172926                         5                               3   
1   51  34.879950                         0                              14   
2   45  34.922600                         3                              12   
3   49  21.266598                         1                              11   
4   42  27.170532                         1                              12   

   hours_exercise_weekly  hours_sleep_daily  alcohol_units_weekly  is_smoker  \
0                     13           8.475477                    23          0   
1                      0           6.923373                    14          0   
2                      8           6.884002                    21          1   
3                     14           6.416330                    14          0   
4                      7           5.452476                    27      

In [3]:
# The three risk columns are the prediction targets; every other column in
# `df` is used as an input feature.
target_columns = ['heart_disease_risk', 'liver_disease_risk', 'kidney_disease_risk']
X = df.drop(columns=target_columns)
y = df[target_columns]

# Echo both column lists so the feature/target split can be checked by eye.
feature_names = X.columns.tolist()
target_names_list = y.columns.tolist()
print("\n--- Features (X) ---")
print(feature_names)
print("\n--- Targets (y) ---")
print(target_names_list)


--- Features (X) ---
['age', 'bmi', 'servings_veg_fruit_daily', 'servings_processed_food_weekly', 'hours_exercise_weekly', 'hours_sleep_daily', 'alcohol_units_weekly', 'is_smoker']

--- Targets (y) ---
['heart_disease_risk', 'liver_disease_risk', 'kidney_disease_risk']


In [4]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Standardise the features. The scaler is fitted on the training rows only,
# then the same fitted scaler is applied to both the training and held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Fit a 100-tree random forest on the scaled training features. `y_train`
# carries all target columns, so a single estimator is trained against them
# together. The seed is fixed for reproducibility.
print("\n--- Training Lifestyle Model ---")
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)  # fit() returns the estimator itself
print("Training complete!")


--- Training Lifestyle Model ---
Training complete!


In [7]:
# Evaluate the fitted model on the held-out rows.
y_pred = model.predict(X_test_scaled)

# With several target columns, accuracy_score counts a row as correct only
# when every target is predicted correctly, hence "exact match".
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Exact Match Accuracy: {accuracy * 100:.2f}%\n")

# One report per target. Looping over y_test's own columns keeps the
# prediction column index tied to the matching target, instead of relying on
# hand-maintained positional indices repeated in copy-pasted stanzas.
# NOTE(review): target_names assumes each target has exactly two classes that
# sort as (low, high) - confirm against the dataset encoding.
risk_labels = ['Low Risk', 'High Risk']
for target_idx, target_col in enumerate(y_test.columns):
    # 'heart_disease_risk' -> 'Heart Disease Risk'
    target_title = target_col.replace('_', ' ').title()
    print(f"--- Classification Report for {target_title} ---")
    print(classification_report(y_test[target_col], y_pred[:, target_idx], target_names=risk_labels))


Overall Exact Match Accuracy: 64.75%

--- Classification Report for Heart Disease Risk ---
              precision    recall  f1-score   support

    Low Risk       0.92      0.92      0.92       306
   High Risk       0.73      0.73      0.73        94

    accuracy                           0.88       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.88      0.88      0.88       400

--- Classification Report for Liver Disease Risk ---
              precision    recall  f1-score   support

    Low Risk       0.88      0.93      0.90       302
   High Risk       0.73      0.61      0.67        98

    accuracy                           0.85       400
   macro avg       0.81      0.77      0.78       400
weighted avg       0.84      0.85      0.85       400

--- Classification Report for Kidney Disease Risk ---
              precision    recall  f1-score   support

    Low Risk       0.86      0.92      0.89       302
   High Risk       0.69      0.53     

In [8]:
# Persist the fitted model together with the scaler it was trained with, so
# that inference code can apply the same preprocessing before predicting.
model_path = '../saved_models/risk_model.joblib'
scaler_path = '../saved_models/risk_scaler.joblib'

# joblib.dump does not create missing folders, so make sure each output
# directory exists first (a fresh checkout may not contain it yet).
for artifact_path in (model_path, scaler_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)

print(f"\nLifestyle model saved to: {model_path}")
print(f"Lifestyle scaler saved to: {scaler_path}")
print("\nLifestyle model building is complete!")


Lifestyle model saved to: ../saved_models/risk_model.joblib
Lifestyle scaler saved to: ../saved_models/risk_scaler.joblib

Lifestyle model building is complete!
