In [5]:
# Import libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load dataset
data = pd.read_csv('Healthcare_Data_Preprocessed.csv')


In [7]:
# Explore data
print(data.info())
print(data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   10000 non-null  float64
 1   BMI                   10000 non-null  float64
 2   Blood_Pressure        10000 non-null  float64
 3   Cholesterol           10000 non-null  float64
 4   Glucose_Level         10000 non-null  float64
 5   Heart_Rate            10000 non-null  float64
 6   Sleep_Hours           10000 non-null  float64
 7   Exercise_Hours        10000 non-null  float64
 8   Water_Intake          10000 non-null  float64
 9   Stress_Level          10000 non-null  float64
 10  Target                10000 non-null  int64  
 11  Smoking               10000 non-null  int64  
 12  Alcohol               10000 non-null  int64  
 13  Diet                  10000 non-null  int64  
 14  MentalHealth          10000 non-null  int64  
 15  PhysicalActivity    

In [13]:

print(data)
# Handle missing values
data.fillna(data.median(), inplace=True)

       Age   BMI  Blood_Pressure  Cholesterol  Glucose_Level  Heart_Rate  \
0     37.0  26.0           111.0        198.0           99.0        72.0   
1     37.0  24.0           121.0        199.0          103.0        75.0   
2     81.0  27.0           147.0        203.0          100.0        74.0   
3     25.0  21.0           150.0        199.0          102.0        70.0   
4     24.0  26.0           146.0        202.0           99.0        76.0   
...    ...   ...             ...          ...            ...         ...   
9995   5.0  22.0           109.0        203.0           98.0        75.0   
9996  94.0  26.0           144.0        203.0           96.0        72.0   
9997  37.0  23.0           185.0        198.0          103.0        72.0   
9998  50.0  29.0           166.0        200.0          100.0        74.0   
9999  69.0  29.0           178.0        203.0          100.0        75.0   

      Sleep_Hours  Exercise_Hours  Water_Intake  Stress_Level  ...  Diet  \
0          

In [11]:
print(data)

       Age   BMI  Blood_Pressure  Cholesterol  Glucose_Level  Heart_Rate  \
0     37.0  26.0           111.0        198.0           99.0        72.0   
1     37.0  24.0           121.0        199.0          103.0        75.0   
2     81.0  27.0           147.0        203.0          100.0        74.0   
3     25.0  21.0           150.0        199.0          102.0        70.0   
4     24.0  26.0           146.0        202.0           99.0        76.0   
...    ...   ...             ...          ...            ...         ...   
9995   5.0  22.0           109.0        203.0           98.0        75.0   
9996  94.0  26.0           144.0        203.0           96.0        72.0   
9997  37.0  23.0           185.0        198.0          103.0        72.0   
9998  50.0  29.0           166.0        200.0          100.0        74.0   
9999  69.0  29.0           178.0        203.0          100.0        75.0   

      Sleep_Hours  Exercise_Hours  Water_Intake  Stress_Level  ...  Diet  \
0          

In [22]:
# Handle missing values
data.fillna(data.median(), inplace=True)

In [24]:
print(data)

       Age   BMI  Blood_Pressure  Cholesterol  Glucose_Level  Heart_Rate  \
0     37.0  26.0           111.0        198.0           99.0        72.0   
1     37.0  24.0           121.0        199.0          103.0        75.0   
2     81.0  27.0           147.0        203.0          100.0        74.0   
3     25.0  21.0           150.0        199.0          102.0        70.0   
4     24.0  26.0           146.0        202.0           99.0        76.0   
...    ...   ...             ...          ...            ...         ...   
9995   5.0  22.0           109.0        203.0           98.0        75.0   
9996  94.0  26.0           144.0        203.0           96.0        72.0   
9997  37.0  23.0           185.0        198.0          103.0        72.0   
9998  50.0  29.0           166.0        200.0          100.0        74.0   
9999  69.0  29.0           178.0        203.0          100.0        75.0   

      Sleep_Hours  Exercise_Hours  Water_Intake  Stress_Level  ...  Diet  \
0          

In [29]:
# Correct negative age values
data['Age'] = data['Age'].apply(lambda x: abs(x) if x < 0 else x)
print(data)

       Age   BMI  Blood_Pressure  Cholesterol  Glucose_Level  Heart_Rate  \
0     37.0  26.0           111.0        198.0           99.0        72.0   
1     37.0  24.0           121.0        199.0          103.0        75.0   
2     81.0  27.0           147.0        203.0          100.0        74.0   
3     25.0  21.0           150.0        199.0          102.0        70.0   
4     24.0  26.0           146.0        202.0           99.0        76.0   
...    ...   ...             ...          ...            ...         ...   
9995   5.0  22.0           109.0        203.0           98.0        75.0   
9996  94.0  26.0           144.0        203.0           96.0        72.0   
9997  37.0  23.0           185.0        198.0          103.0        72.0   
9998  50.0  29.0           166.0        200.0          100.0        74.0   
9999  69.0  29.0           178.0        203.0          100.0        75.0   

      Sleep_Hours  Exercise_Hours  Water_Intake  Stress_Level  ...  Diet  \
0          

In [37]:
le = LabelEncoder()


In [43]:
for col in ['Diet_Type_Vegan', 'Blood_Group_AB']:  # Adjusted column name
    data[col] = le.fit_transform(data[col])

In [49]:
# Feature scaling
scaler = StandardScaler()
numerical_cols = ['BMI', 'Blood_Pressure', 'Cholesterol', 'Glucose_Level', 'Heart_Rate', 'Sleep_Hours', 'Exercise_Hours']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
print(data[numerical_cols])

           BMI  Blood_Pressure  Cholesterol  Glucose_Level  Heart_Rate  \
0     0.148367       -0.724844    -0.594586      -0.504531   -0.888158   
1    -0.880176       -0.362916    -0.119715       1.304310    0.851736   
2     0.662639        0.578097     1.779768      -0.052321    0.271771   
3    -2.422992        0.686675    -0.119715       0.852100   -2.048087   
4     0.148367        0.541904     1.304897      -0.504531    1.431701   
...        ...             ...          ...            ...         ...   
9995 -1.908720       -0.797230     1.779768      -0.956742    0.851736   
9996  0.148367        0.469519     1.779768      -1.861162   -0.888158   
9997 -1.394448        1.953424    -0.594586       1.304310   -0.888158   
9998  1.691183        1.265760     0.355156      -0.052321    0.271771   
9999  1.691183        1.700074     1.779768      -0.052321    0.851736   

      Sleep_Hours  Exercise_Hours  
0       -0.156315       -0.156315  
1       -0.156315       -0.156315  
2  

In [51]:
# Split data into features and target
X = data.drop(columns=['Target'])
y = data['Target']

In [53]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred)}")



[[877 130]
 [115 878]]
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      1007
           1       0.87      0.88      0.88       993

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000

ROC-AUC Score: 0.8775464997784892
