In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load Dataset
disease_df = pd.read_csv('disease_diagnosis.csv')

In [3]:
#Preprocessing

# Split Blood Pressure into Systolic and Diastolic
bp_split = disease_df['Blood_Pressure_mmHg'].str.split('/', expand=True).astype(float)
disease_df['BP_Systolic'] = bp_split[0]
disease_df['BP_Diastolic'] = bp_split[1]

# One-Hot Encode Symptoms
all_symptoms = pd.unique(
    disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].values.ravel()
)

for sym in all_symptoms:
    disease_df[f"symptom_{sym}"] = disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].isin([sym]).any(axis=1).astype(int)

# Feature and Target Selection
features = ['Heart_Rate_bpm', 'Body_Temperature_C',
            'Oxygen_Saturation_%', 'BP_Systolic', 'BP_Diastolic'] + \
           [f"symptom_{sym}" for sym in all_symptoms]

X = disease_df[features]
# Target Columns
y = disease_df[['Diagnosis', 'Severity']]

# Encode each target
diagnosis_encoder = LabelEncoder()
severity_encoder = LabelEncoder()

# Scale Features (important for logistic regression!)
scaler_lr = StandardScaler()
X_scaled = scaler_lr.fit_transform(X)

y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
y['Severity'] = severity_encoder.fit_transform(y['Severity'])

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Severity'] = severity_encoder.fit_transform(y['Severity'])


In [4]:
# Initialize Decision Tree Classifier
lr = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
multi_lr = MultiOutputClassifier(lr)

#Train Model
multi_lr.fit(X_train,y_train)

y_pred = multi_lr.predict(X_test)

# Decode Predictions
y_pred_diagnosis = diagnosis_encoder.inverse_transform(y_pred[:, 0])
y_pred_severity = severity_encoder.inverse_transform(y_pred[:, 1])



In [5]:
# Diagnosis Evaluation
print("\nDiagnosis Classification Report:")
print(classification_report(y_test['Diagnosis'], y_pred[:, 0]))

print("\nDiagnosis Confusion Matrix:")
print(confusion_matrix(y_test['Diagnosis'], y_pred[:, 0]))

# Severity Evaluation
print("\nSeverity Classification Report:")
print(classification_report(y_test['Severity'], y_pred[:, 1]))

print("\nSeverity Confusion Matrix:")
print(confusion_matrix(y_test['Severity'], y_pred[:, 1]))


Diagnosis Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        64
           1       0.96      0.96      0.96       238
           2       0.90      0.91      0.91        58
           3       0.86      0.55      0.67        11
           4       0.75      0.83      0.79        29

    accuracy                           0.93       400
   macro avg       0.88      0.84      0.86       400
weighted avg       0.93      0.93      0.93       400


Diagnosis Confusion Matrix:
[[ 62   1   1   0   0]
 [  0 228   1   1   8]
 [  2   3  53   0   0]
 [  1   0   4   6   0]
 [  0   5   0   0  24]]

Severity Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       267
           1       0.79      0.84      0.82        58
           2       0.94      0.84      0.89        75

    accuracy                           0.93       400
   macro avg       0.90    

In [6]:
import joblib

# Assuming these are your trained scaler and model
joblib.dump(scaler_lr, 'scaler_lr.pkl')
joblib.dump(multi_lr, 'LR_diagnosis_model.pkl')

['LR_diagnosis_model.pkl']

In [18]:
import joblib
import numpy as np
import pandas as pd

# Load saved scaler and model
scaler_lr = joblib.load('scaler_lr.pkl')
multi_lr = joblib.load('LR_diagnosis_model.pkl')

# List of all possible symptoms (adjust this to match your training set)
all_symptoms = [
    'Fatigue',
    'Sore throat',
    'Fever',
    'Cough',
    'Body ache',
    'Shortness of breath',
    'Headache',
    'Runny nose'
]

# Define mappings for diagnosis and severity
diagnosis_labels = {
    0: "Bronchitis",
    1: "Common Cold",
    2: "Flu",
    3: "Pneumonia",
    4: "Viral Fever"
}

severity_labels = {
    0: "Mild",
    1: "Moderate",
    2: "Severe"
}


def create_feature_vector(heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic, symptom1, symptom2, symptom3):
    # Create a binary vector for symptoms
    symptom_vector = [0] * len(all_symptoms)
    for sym in [symptom1, symptom2, symptom3]:
        if sym in all_symptoms:
            index = all_symptoms.index(sym)
            symptom_vector[index] = 1

    # Append other numeric features
    additional_features = [heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic]
    full_features = additional_features + symptom_vector 
    
    # Scale features (assumes the same scaler used during training)
    scaled_features = scaler_lr.transform([full_features])

    return scaled_features

# Example user input

heart_rate = int(input("Enter Heart Rate: "))
body_temp = float(input("Enter Body Temperature: "))	
oxy_saturation = int(input("Enter Oxygen Saturation: "))

# Accept blood pressure in "systolic/diastolic" format
bp_input = input("Enter Blood Pressure (Systolic/Diastolic): ")
try:
    bp_systolic, bp_diastolic = map(int, bp_input.strip().split('/'))
except ValueError:
    print("Invalid format! Please enter as Systolic/Diastolic (e.g. 132/91)")
    exit()
symptom1 = input("Enter Symptom A: ")
symptom2 = input("Enter Symptom B: ")
symptom3 = input("Enter Symptom C: ")

# Create feature vector
X_input = create_feature_vector(heart_rate, body_temp, oxy_saturation, bp_systolic, bp_diastolic, symptom1, symptom2, symptom3)

print("Input Vector before prediction: ", X_input)

# Make prediction
prediction = multi_lr.predict(X_input)

# Map numerical prediction to labels
diagnosis = diagnosis_labels.get(prediction[0][0], "Unknown Diagnosis")
severity = severity_labels.get(prediction[0][1], "Unknown Severity")

print(f"\nUser inputs: {symptom1}, {symptom2}, {symptom3}, {heart_rate}, {body_temp}, {oxy_saturation}, {bp_input}\n")

print(f"Predicted Diagnosis: {diagnosis}, {severity}")

Input Vector before prediction:  [[ 1.14156007  1.03779501 -1.5705436   0.16431602  1.5418952  -0.77956208
  -0.73138185 -0.79204754  1.25988973 -0.76471305  1.28277147 -0.77707737
   1.28277147]]

User inputs: Cough, Runny nose, Shortness of breath, 109, 39.1, 90, 140/116

Predicted Diagnosis: Bronchitis, Severe


