In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load Dataset
disease_df = pd.read_csv('disease_diagnosis.csv')

In [3]:
# Split Blood Pressure into Systolic and Diastolic
bp_split = disease_df['Blood_Pressure_mmHg'].str.split('/', expand=True).astype(float)
disease_df['BP_Systolic'] = bp_split[0]
disease_df['BP_Diastolic'] = bp_split[1]

# One-Hot Encode Symptoms
all_symptoms = pd.unique(
    disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].values.ravel()
)

for sym in all_symptoms:
    disease_df[f"symptom_{sym}"] = disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].isin([sym]).any(axis=1).astype(int)

# Feature and Target Selection
features = ['Heart_Rate_bpm', 'Body_Temperature_C',
            'Oxygen_Saturation_%', 'BP_Systolic', 'BP_Diastolic'] + \
           [f"symptom_{sym}" for sym in all_symptoms]

X = disease_df[features]
# Target Columns
y = disease_df[['Diagnosis', 'Severity']]

# Encode each target
diagnosis_encoder = LabelEncoder()
severity_encoder = LabelEncoder()

y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
y['Severity'] = severity_encoder.fit_transform(y['Severity'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Severity'] = severity_encoder.fit_transform(y['Severity'])


In [4]:
# Initialize Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
multi_rf = MultiOutputClassifier(rf)

# Train Model
multi_rf.fit(X_train, y_train)

y_pred = multi_rf.predict(X_test)

# Decode Predictions
y_pred_diagnosis = diagnosis_encoder.inverse_transform(y_pred[:, 0])
y_pred_severity = severity_encoder.inverse_transform(y_pred[:, 1])

In [5]:
# Diagnosis Evaluation
print("\nDiagnosis Classification Report:")
print(classification_report(y_test['Diagnosis'], y_pred[:, 0]))

print("\nDiagnosis Confusion Matrix:")
print(confusion_matrix(y_test['Diagnosis'], y_pred[:, 0]))

# Severity Evaluation
print("\nSeverity Classification Report:")
print(classification_report(y_test['Severity'], y_pred[:, 1]))

print("\nSeverity Confusion Matrix:")
print(confusion_matrix(y_test['Severity'], y_pred[:, 1]))


Diagnosis Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        64
           1       1.00      1.00      1.00       238
           2       0.95      1.00      0.97        58
           3       1.00      0.73      0.84        11
           4       1.00      0.97      0.98        29

    accuracy                           0.99       400
   macro avg       0.99      0.94      0.96       400
weighted avg       0.99      0.99      0.99       400


Diagnosis Confusion Matrix:
[[ 64   0   0   0   0]
 [  0 238   0   0   0]
 [  0   0  58   0   0]
 [  0   0   3   8   0]
 [  0   1   0   0  28]]

Severity Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       267
           1       0.95      1.00      0.97        58
           2       1.00      0.96      0.98        75

    accuracy                           0.99       400
   macro avg       0.98    

In [24]:
import joblib

# Assuming these are your trained scaler and model
joblib.dump(multi_rf, 'rf_diagnosis_model.pkl')

['rf_diagnosis_model.pkl']

In [26]:
import joblib
import numpy as np
import pandas as pd

# Load saved scaler and model
multi_rf = joblib.load('rf_diagnosis_model.pkl')

# List of all possible symptoms (adjust this to match your training set)
all_symptoms = [
    'Fatigue',
    'Sore throat',
    'Fever',
    'Cough',
    'Body ache',
    'Shortness of breath',
    'Headache',
    'Runny nose'
]

# Define mappings for diagnosis and severity
diagnosis_labels = {
    0: "Common Cold",
    1: "Flu",
    2: "Pneumonia",
    3: "Bronchitis",
    4: "Viral Fever"
}

severity_labels = {
    0: "Mild",
    1: "Moderate",
    2: "Severe"
}


def create_feature_vector(symptom1, symptom2, symptom3, age, gender, heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic):
    # Create a binary vector for symptoms
    symptom_vector = [0] * len(all_symptoms)
    for sym in [symptom1, symptom2, symptom3]:
        if sym in all_symptoms:
            index = all_symptoms.index(sym)
            symptom_vector[index] = 1

    # Append other numeric features
    additional_features = [age, gender, heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic]
    full_features = symptom_vector + additional_features
    
    return np.array([full_features])  # Correct: returns 2D

# Example user input
symptom1 = input("Enter Symptom A: ")
symptom2 = input("Enter Symptom B: ")
symptom3 = input("Enter Symptom C: ")
age = int(input("Enter Age: "))

# Gender Input — string based
gender_input = input("Enter Gender (Male or Female): ").strip().lower()
if gender_input == "male":
    gender = 1
elif gender_input == "female":
    gender = 0
else:
    print("Invalid gender input! Please enter 'Male' or 'Female'.")
    exit()

heart_rate = int(input("Enter Heart Rate: "))
body_temp = float(input("Enter Body Temperature: "))	
oxy_saturation = int(input("Enter Oxygen Saturation: "))

# Accept blood pressure in "systolic/diastolic" format
bp_input = input("Enter Blood Pressure (Systolic/Diastolic): ")
try:
    bp_systolic, bp_diastolic = map(int, bp_input.strip().split('/'))
except ValueError:
    print("Invalid format! Please enter as Systolic/Diastolic (e.g. 132/91)")
    exit()

# Create feature vector
X_input = create_feature_vector(symptom1, symptom2, symptom3, age, gender, heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic)

print("Input Vector before prediction: ", X_input)

# Make prediction
prediction = multi_rf.predict(X_input)

# Map numerical prediction to labels
diagnosis = diagnosis_labels.get(prediction[0][0], "Unknown Diagnosis")
severity = severity_labels.get(prediction[0][1], "Unknown Severity")

print(f"\nUser inputs: {symptom1}, {symptom2}, {symptom3}, {age}, {gender_input}, {heart_rate}, {body_temp}, {oxy_saturation}, {bp_input}\n")

print(f"Predicted Diagnosis: {diagnosis}, {severity}")

Input Vector before prediction:  [[  1.    0.    0.    0.    0.    0.    0.    0.   73.    0.   87.   38.9
   98.  167.   79. ]]

User inputs: Body Ache, Shortness of Breath, Fatigue, 73, female, 87, 38.9, 98, 167/79

Predicted Diagnosis: Common Cold, Severe


