# Import Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset

In [8]:
df = pd.read_csv('/content/cleaned_health_dataset.csv')
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,all_symptoms
0,Roseola,Runny nose,Rash,Red eyes,Loss of appetite,Rashes,Feverish,,,,,,,Runny nose Rash Red eyes Loss of appetite Rash...
1,Roseola,High fever,Runny nose,Irritability,Headache,Cough,Rashes,Chills,,,,,,High fever Runny nose Irritability Headache Co...
2,Norovirus Infection,Stomach cramps,Nausea,Fatigue,Abdominal pain,Loss of appetite,Dehydration,Chills,Sweating,Rashes,General discomfort,Fever,,Stomach cramps Nausea Fatigue Abdominal pain L...
3,Roseola,High fever,Runny nose,Irritability,Fatigue,Red eyes,Diarrhea,Vomiting,Feverish,,,,,High fever Runny nose Irritability Fatigue Red...
4,Norovirus Infection,Diarrhea,Low-grade fever,Nausea,Headache,Fatigue,Loss of appetite,Dehydration,Chills,Sweating,General discomfort,Irritability,,Diarrhea Low-grade fever Nausea Headache Fatig...


# Combine Symptoms

In [9]:
symptom_cols = [col for col in df.columns if 'Symptom' in col]
df['all_symptoms'] = df[symptom_cols].fillna('').apply(lambda x: ' '.join(x), axis=1)
df['all_symptoms'] = df['all_symptoms'].str.strip()
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,all_symptoms
0,Roseola,Runny nose,Rash,Red eyes,Loss of appetite,Rashes,Feverish,,,,,,,Runny nose Rash Red eyes Loss of appetite Rash...
1,Roseola,High fever,Runny nose,Irritability,Headache,Cough,Rashes,Chills,,,,,,High fever Runny nose Irritability Headache Co...
2,Norovirus Infection,Stomach cramps,Nausea,Fatigue,Abdominal pain,Loss of appetite,Dehydration,Chills,Sweating,Rashes,General discomfort,Fever,,Stomach cramps Nausea Fatigue Abdominal pain L...
3,Roseola,High fever,Runny nose,Irritability,Fatigue,Red eyes,Diarrhea,Vomiting,Feverish,,,,,High fever Runny nose Irritability Fatigue Red...
4,Norovirus Infection,Diarrhea,Low-grade fever,Nausea,Headache,Fatigue,Loss of appetite,Dehydration,Chills,Sweating,General discomfort,Irritability,,Diarrhea Low-grade fever Nausea Headache Fatig...


# Encode Disease

In [10]:
le = LabelEncoder()
df['Disease_Encoded'] = le.fit_transform(df['Disease'])
df[['Disease', 'Disease_Encoded']].head()

Unnamed: 0,Disease,Disease_Encoded
0,Roseola,10
1,Roseola,10
2,Norovirus Infection,8
3,Roseola,10
4,Norovirus Infection,8


# Vectorize

In [11]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['all_symptoms'])
y = df['Disease_Encoded']

X.shape


(9409, 180)

# Train-Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)


# Train Model

In [13]:
model = MultinomialNB()
model.fit(X_train, y_train)


# Accuracy

In [14]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9689757756056099

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00        11
           3       1.00      0.99      1.00       198
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00        34
           8       0.94      0.97      0.95       690
           9       1.00      1.00      1.00        24
          10       1.00      0.99      0.99       659
          11       1.00      1.00      1.00         3
          12       1.00      1.00      1.00        43
          14       0.96      0.94      0.95       667
          15       1.00      0.80      0.89         5
          16       1.00      1.00      1.00         1

    accuracy              

# Prediction System

In [15]:
def predict_top_diseases(symptom_list, top_k=3):
    # Convert symptom list into a single string
    symptom_string = ' '.join(symptom_list)

    # Vectorize the symptoms
    vectorized = vectorizer.transform([symptom_string])

    # Get probabilities for all classes
    probabilities = model.predict_proba(vectorized)[0]

    # Get top K indices (sorted in descending order)
    top_indices = probabilities.argsort()[::-1][:top_k]

    # Prepare result: list of (disease, percentage)
    results = []
    for idx in top_indices:
        disease = le.inverse_transform([idx])[0]
        percent = round(probabilities[idx] * 100, 2)
        results.append((disease, percent))

    return results

# Test Prediction

In [16]:
symptoms = ["fever", "headache", "body pain"]
predict_top_diseases(symptoms)


[('Norovirus Infection', np.float64(47.29)),
 ('Shigellosis (Bacillary Dysentery)', np.float64(33.29)),
 ('Roseola', np.float64(18.48))]

# User Input

In [17]:
user_input = input("Enter symptoms separated by commas: ")

user_symptoms = [s.strip() for s in user_input.split(',')]

results = predict_top_diseases(user_symptoms)

print("\nTop Predicted Diseases:")
for disease, prob in results:
    print(f"{disease} --> {prob}%")


Enter symptoms separated by commas: fever, cough, cold

Top Predicted Diseases:
Roseola --> 65.96%
Hand, Foot, and Mouth Disease --> 29.41%
Pertussis (Whooping Cough) --> 3.24%
