In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load Dataset
disease_df = pd.read_csv('disease_diagnosis.csv')

In [3]:
# Preprocessing

# Split Blood Pressure
bp_split = disease_df['Blood_Pressure_mmHg'].str.split('/', expand=True).astype(float)
disease_df['BP_Systolic'] = bp_split[0]
disease_df['BP_Diastolic'] = bp_split[1]

# One-Hot Encode Symptoms
all_symptoms = pd.unique(
    disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].values.ravel()
)
for sym in all_symptoms:
    disease_df[f"symptom_{sym}"] = disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].isin([sym]).any(axis=1).astype(int)

# Features and Target
features = ['Heart_Rate_bpm', 'Body_Temperature_C',
            'Oxygen_Saturation_%', 'BP_Systolic', 'BP_Diastolic'] + \
           [f"symptom_{sym}" for sym in all_symptoms]

X = disease_df[features]
# Target Columns
y = disease_df[['Diagnosis', 'Severity']]

# Encode each target
diagnosis_encoder = LabelEncoder()
severity_encoder = LabelEncoder()

y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
y['Severity'] = severity_encoder.fit_transform(y['Severity'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [4]:
# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- LightGBM MultiOutput Classifier ---
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=6,  # <- Limit depth
    reg_alpha=1.0,
    reg_lambda=1.0,
    random_state=42
)

multi_lgb = MultiOutputClassifier(lgb_model)
multi_lgb.fit(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 295
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 13
[LightGBM] [Info] Start training from score -1.779337
[LightGBM] [Info] Start training from score -0.543650
[LightGBM] [Info] Start training from score -1.922438
[LightGBM] [Info] Start training from score -3.881251
[LightGBM] [Info] Start training from score -2.479919
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 295
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 13
[LightGBM] [Info] Start training from score -0.408909
[LightGBM] [Info] Start training from 

In [5]:
# --- Prediction ---
y_pred = multi_lgb.predict(X_test_scaled)

# Diagnosis Evaluation
print("\nDiagnosis Classification Report:")
print(classification_report(y_test['Diagnosis'], y_pred[:, 0]))

print("\nDiagnosis Confusion Matrix:")
print(confusion_matrix(y_test['Diagnosis'], y_pred[:, 0]))

# Severity Evaluation
print("\nSeverity Classification Report:")
print(classification_report(y_test['Severity'], y_pred[:, 1]))

print("\nSeverity Confusion Matrix:")
print(confusion_matrix(y_test['Severity'], y_pred[:, 1]))


Diagnosis Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        64
           1       1.00      1.00      1.00       238
           2       1.00      1.00      1.00        58
           3       1.00      1.00      1.00        11
           4       1.00      1.00      1.00        29

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400


Diagnosis Confusion Matrix:
[[ 64   0   0   0   0]
 [  0 238   0   0   0]
 [  0   0  58   0   0]
 [  0   0   0  11   0]
 [  0   0   0   0  29]]

Severity Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       267
           1       1.00      1.00      1.00        58
           2       1.00      1.00      1.00        75

    accuracy                           1.00       400
   macro avg       1.00    

In [26]:
import joblib

joblib.dump(multi_lgb, 'lgbm_diagnosis_model.pkl')

print("Model saved successfully!")

Model saved successfully!


In [28]:
#Load Model
lgbm_diag_loaded = joblib.load('lgbm_diagnosis_model.pkl')

# Predict on new data (replace X_test with your actual input)
diag_predictions = lgbm_diag_loaded.predict(X_test)

print("Diagnosis Predictions:", diag_predictions)

Diagnosis Predictions: [[1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [2 1]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [2 1]
 [2 1]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [2 1]
 [2 1]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [2 1]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [2 1]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [2 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 1]
 [1 0]
 [2 

In [29]:
disease_df.head()

Unnamed: 0,Patient_ID,Age,Gender,Symptom_1,Symptom_2,Symptom_3,Heart_Rate_bpm,Body_Temperature_C,Blood_Pressure_mmHg,Oxygen_Saturation_%,...,BP_Systolic,BP_Diastolic,symptom_Fatigue,symptom_Sore throat,symptom_Fever,symptom_Cough,symptom_Body ache,symptom_Shortness of breath,symptom_Headache,symptom_Runny nose
0,1,74,0,Fatigue,Sore throat,Fever,69,39.4,132/91,94,...,132.0,91.0,1,1,1,0,0,0,0,0
1,2,66,1,Sore throat,Fatigue,Cough,95,39.0,174/98,98,...,174.0,98.0,1,1,0,1,0,0,0,0
2,3,32,0,Body ache,Sore throat,Fatigue,77,36.8,136/60,96,...,136.0,60.0,1,1,0,0,1,0,0,0
3,4,21,1,Shortness of breath,Headache,Cough,72,38.9,147/82,99,...,147.0,82.0,0,0,0,1,0,1,1,0
4,5,53,0,Runny nose,Sore throat,Fatigue,100,36.6,109/106,92,...,109.0,106.0,1,1,0,0,0,0,0,1


In [30]:
import joblib

# Assuming these are your trained scaler and model
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(multi_lgb, 'lgbm_diagnosis_model.pkl')

['lgbm_diagnosis_model.pkl']

In [None]:
import joblib
import numpy as np
import pandas as pd

# Load saved scaler and model
scaler = joblib.load('scaler.pkl')
multi_lgb = joblib.load('lgbm_diagnosis_model.pkl')

# List of all possible symptoms (adjust this to match your training set)
all_symptoms = [
    'Fatigue',
    'Sore throat',
    'Fever',
    'Cough',
    'Body ache',
    'Shortness of breath',
    'Headache',
    'Runny nose'
]

# Define mappings for diagnosis and severity
diagnosis_labels = {
    0: "CommonCold",
    1: "Flu",
    2: "Pneumonia",
    3: "Bronchitis",
    4: "ViralFever"
}

severity_labels = {
    0: "Mild",
    1: "Moderate",
    2: "Severe"
}


def create_feature_vector(symptom1, symptom2, symptom3, age, gender, heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic):
    # Create a binary vector for symptoms
    symptom_vector = [0] * len(all_symptoms)
    for sym in [symptom1, symptom2, symptom3]:
        if sym in all_symptoms:
            index = all_symptoms.index(sym)
            symptom_vector[index] = 1

    # Append other numeric features
    additional_features = [age, gender, heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic]
    full_features = symptom_vector + additional_features
    
    # Scale features (assumes the same scaler used during training)
    scaled_features = scaler.transform([full_features])

    return scaled_features

# Example user input
symptom1 = input("Enter Symptom A: ")
symptom2 = input("Enter Symptom B: ")
symptom3 = input("Enter Symptom C: ")
age = int(input("Enter Age: "))

# Gender Input — string based
gender_input = input("Enter Gender (Male or Female): ").strip().lower()
if gender_input == "male":
    gender = 1
elif gender_input == "female":
    gender = 0
else:
    print("Invalid gender input! Please enter 'Male' or 'Female'.")
    exit()

heart_rate = int(input("Enter Heart Rate: "))
body_temp = float(input("Enter Body Temperature: "))	
oxy_saturation = int(input("Enter Oxygen Saturation: "))

# Accept blood pressure in "systolic/diastolic" format
bp_input = input("Enter Blood Pressure (Systolic/Diastolic): ")
try:
    bp_systolic, bp_diastolic = map(int, bp_input.strip().split('/'))
except ValueError:
    print("Invalid format! Please enter as Systolic/Diastolic (e.g. 132/91)")
    exit()

# Create feature vector
X_input = create_feature_vector(symptom1, symptom2, symptom3, age, gender, heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic)

# Make prediction
prediction = multi_lgb.predict(X_input)

# Map numerical prediction to labels
diagnosis = diagnosis_labels.get(prediction[0][0], "Unknown Diagnosis")
severity = severity_labels.get(prediction[0][1], "Unknown Severity")

print(f"User inputs: {symptom1}, {symptom2}, {symptom3}, {age}, {gender_input}, {heart_rate}, {body_temp}, {oxy_saturation}, {bp_input}")

print(f"Predicted Diagnosis: {diagnosis}, {severity}")

User inputs: Fever, Cough, Headache, 69, female, 102, 40.0, 99, 176/99
Predicted Diagnosis: Common Cold, Severe


