In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Read the raw patient records from disk.
df = pd.read_csv('disease_diagnosis.csv')

# Blood pressure arrives as a single "systolic/diastolic" string per row;
# break it on the slash and store the two halves as numeric columns.
bp_parts = df['Blood_Pressure_mmHg'].str.split('/', expand=True)
df[['Systolic', 'Diastolic']] = bp_parts.astype(float)

# Replace each categorical column with integer codes. The fitted encoder for
# every column is kept in `label_encoders` so codes can be mapped back to the
# original labels later via `inverse_transform`.
# NOTE: LabelEncoder assigns codes by sorted label order, so the integers
# carry no clinical meaning; tree models only use them as split thresholds.
categorical_columns = ['Gender', 'Symptom_1', 'Symptom_2', 'Symptom_3']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Feature matrix: demographics, encoded symptoms and vital signs.
feature_columns = [
    'Age', 'Gender', 'Symptom_1', 'Symptom_2', 'Symptom_3',
    'Heart_Rate_bpm', 'Body_Temperature_C', 'Oxygen_Saturation_%',
    'Systolic', 'Diastolic',
]
X = df[feature_columns]

# ===== Feature Importance for Diagnosis =====

# Turn the diagnosis labels into integer class ids for the classifier.
le_diagnosis = LabelEncoder()
y_diagnosis = le_diagnosis.fit_transform(df['Diagnosis'])

# Fit a 100-tree random forest; the fixed seed keeps the run reproducible.
# `fit` returns the estimator itself, so construction and training chain.
rf_diagnosis = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X, y_diagnosis)

# Pair every importance score with its feature name, then rank the features
# from most to least important.
# NOTE: these are the forest's impurity-based importances computed on the
# training data; they are a relative ranking, not a held-out evaluation.
diag_scores = pd.Series(rf_diagnosis.feature_importances_, index=X.columns)
importance_diag = diag_scores.sort_values(ascending=False)

print("\n--- Diagnosis Feature Importance ---\n")
print(importance_diag)


# ===== Feature Importance for Severity =====

# Turn the severity labels into integer class ids for the classifier.
le_severity = LabelEncoder()
y_severity = le_severity.fit_transform(df['Severity'])

# Same forest configuration as used for the diagnosis target (100 trees,
# fixed seed), trained on the same feature matrix X.
rf_severity = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X, y_severity)

# Pair every importance score with its feature name, then rank the features
# from most to least important.
sev_scores = pd.Series(rf_severity.feature_importances_, index=X.columns)
importance_sev = sev_scores.sort_values(ascending=False)

print("\n--- Severity Feature Importance ---\n")
print(importance_sev)



--- Diagnosis Feature Importance ---

Body_Temperature_C     0.240022
Oxygen_Saturation_%    0.165435
Symptom_2              0.122303
Symptom_3              0.117460
Symptom_1              0.108233
Heart_Rate_bpm         0.069179
Systolic               0.056963
Age                    0.056380
Diastolic              0.053960
Gender                 0.010064
dtype: float64

--- Severity Feature Importance ---

Body_Temperature_C     0.169709
Oxygen_Saturation_%    0.168288
Symptom_2              0.139320
Symptom_3              0.132756
Symptom_1              0.128428
Heart_Rate_bpm         0.072874
Systolic               0.062101
Age                    0.058043
Diastolic              0.057034
Gender                 0.011447
dtype: float64
