In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [2]:
# Load Dataset
disease_df = pd.read_csv('disease_diagnosis.csv')

In [3]:
# Preprocessing

# Extract systolic and diastolic from Blood Pressure
bp_split = disease_df['Blood_Pressure_mmHg'].str.split('/', expand=True).astype(float)
disease_df['BP_Systolic'] = bp_split[0]
disease_df['BP_Diastolic'] = bp_split[1]

# One-Hot Encode Symptoms
all_symptoms = pd.unique(
    disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].values.ravel()
)

for sym in all_symptoms:
    disease_df[f"symptom_{sym}"] = disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].isin([sym]).any(axis=1).astype(int)

# Drop unused columns
features = ['Heart_Rate_bpm', 'Body_Temperature_C', 
            'Oxygen_Saturation_%', 'BP_Systolic', 'BP_Diastolic'] + \
           [f"symptom_{sym}" for sym in all_symptoms]


In [4]:

X = disease_df[features]
# Target Columns
y = disease_df[['Diagnosis', 'Severity']]

# Encode each target
diagnosis_encoder = LabelEncoder()
severity_encoder = LabelEncoder()

y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
y['Severity'] = severity_encoder.fit_transform(y['Severity'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Severity'] = severity_encoder.fit_transform(y['Severity'])


In [5]:
# Initialize XGBoost Classifier
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1 
)
multi_xgb = MultiOutputClassifier(xgb)

# Train the model
multi_xgb.fit(X_train, y_train)

# Predict on test data
y_pred = multi_xgb.predict(X_test)

# Decode Predictions
y_pred_diagnosis = diagnosis_encoder.inverse_transform(y_pred[:, 0])
y_pred_severity = severity_encoder.inverse_transform(y_pred[:, 1])

In [6]:
# Diagnosis Evaluation
print("\nDiagnosis Classification Report:")
print(classification_report(y_test['Diagnosis'], y_pred[:, 0]))

print("\nDiagnosis Confusion Matrix:")
print(confusion_matrix(y_test['Diagnosis'], y_pred[:, 0]))

# Severity Evaluation
print("\nSeverity Classification Report:")
print(classification_report(y_test['Severity'], y_pred[:, 1]))

print("\nSeverity Confusion Matrix:")
print(confusion_matrix(y_test['Severity'], y_pred[:, 1]))


Diagnosis Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        64
           1       1.00      1.00      1.00       238
           2       1.00      1.00      1.00        58
           3       1.00      1.00      1.00        11
           4       1.00      1.00      1.00        29

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400


Diagnosis Confusion Matrix:
[[ 64   0   0   0   0]
 [  0 238   0   0   0]
 [  0   0  58   0   0]
 [  0   0   0  11   0]
 [  0   0   0   0  29]]

Severity Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       267
           1       1.00      1.00      1.00        58
           2       1.00      1.00      1.00        75

    accuracy                           1.00       400
   macro avg       1.00    

In [7]:
import joblib

# Assuming these are your trained scaler and model
joblib.dump(multi_xgb, 'xgb_diagnosis_model.pkl')

['xgb_diagnosis_model.pkl']

In [33]:
import joblib
import numpy as np
import pandas as pd

# Load saved scaler and model
multi_xgb = joblib.load('xgb_diagnosis_model.pkl')

# List of all possible symptoms (adjust this to match your training set)
all_symptoms = [
    'Fatigue',
    'Sore throat',
    'Fever',
    'Cough',
    'Body ache',
    'Shortness of breath',
    'Headache',
    'Runny nose'
]

# Define mappings for diagnosis and severity
diagnosis_labels = {
    0: "Bronchitis",
    1: "Common Cold",
    2: "Flu",
    3: "Pneumonia",
    4: "Viral Fever"
}

severity_labels = {
    0: "Mild",
    1: "Moderate",
    2: "Severe"
}


def create_feature_vector(heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic, symptom1, symptom2, symptom3):
    # Create a binary vector for symptoms
    symptom_vector = [0] * len(all_symptoms)
    for sym in [symptom1, symptom2, symptom3]:
        if sym in all_symptoms:
            index = all_symptoms.index(sym)
            symptom_vector[index] = 1

    # Append other numeric features
    additional_features = [heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic]
    full_features = additional_features + symptom_vector

    return [(full_features)]

# Example user input

heart_rate = int(input("Enter Heart Rate: "))
body_temp = float(input("Enter Body Temperature: "))	
oxy_saturation = int(input("Enter Oxygen Saturation: "))

# Accept blood pressure in "systolic/diastolic" format
bp_input = input("Enter Blood Pressure (Systolic/Diastolic): ")
try:
    bp_systolic, bp_diastolic = map(int, bp_input.strip().split('/'))
except ValueError:
    print("Invalid format! Please enter as Systolic/Diastolic (e.g. 132/91)")
    exit()
symptom1 = input("Enter Symptom A: ")
symptom2 = input("Enter Symptom B: ")
symptom3 = input("Enter Symptom C: ")


# Create feature vector
X_input = create_feature_vector(heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic, symptom1, symptom2, symptom3)

print("Input Vector before prediction: ", X_input)

# Make prediction
prediction = multi_xgb.predict(X_input)

# Map numerical prediction to labels
diagnosis = diagnosis_labels.get(prediction[0][0], "Unknown Diagnosis")
severity = severity_labels.get(prediction[0][1], "Unknown Severity")

print(f"\nUser inputs: {heart_rate}, {body_temp}, {oxy_saturation}, {bp_input}, {symptom1}, {symptom2}, {symptom3}\n")

print(f"Predicted Diagnosis: {diagnosis}, {severity}")

Input Vector before prediction:  [[101, 37.0, 95, 152, 89, 0, 0, 0, 0, 1, 1, 1, 0]]

User inputs: 101, 37.0, 95, 152/89, Body ache, Shortness of breath, Headache

Predicted Diagnosis: Common Cold, Mild


In [35]:
diagnosis_classes = ['Bronchitis', 'CommonCold','Flu','Pneumonia', 'ViralFever']
severity_classes = ['Mild', 'Moderate', 'Severe']

# Get prediction probabilities
probs = multi_xgb.predict_proba(X_input)

# Map to labels
diagnosis_probs = dict(zip(diagnosis_classes, probs[0][0]))
severity_probs = dict(zip(severity_classes, probs[1][0]))

print("Diagnosis Probabilities:")
for label, prob in diagnosis_probs.items():
    print(f"{label}: {prob:.4f}")

print("\nSeverity Probabilities:")
for label, prob in severity_probs.items():
    print(f"{label}: {prob:.4f}")

Diagnosis Probabilities:
Bronchitis: 0.0002
CommonCold: 0.9992
Flu: 0.0002
Pneumonia: 0.0003
ViralFever: 0.0002

Severity Probabilities:
Mild: 0.9995
Moderate: 0.0002
Severe: 0.0002


In [11]:
y['Diagnosis'].value_counts()


Diagnosis
1    1167
0     334
2     292
4     163
3      44
Name: count, dtype: int64

In [12]:
y['Severity'].value_counts()

Severity
0    1330
2     378
1     292
Name: count, dtype: int64