***Model Training For Symptom to Disease Prediction***

**Loading the dataset**

In [1]:
import pandas as pd

dataset1 = pd.read_csv('datasets/Symptom2Disease.csv')
dataset1.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [2]:
dataset2 = pd.read_csv('datasets/Diseases_Symptoms.csv')
dataset2.head()

Unnamed: 0,Code,Name,Symptoms,Treatments
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal"
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ..."


In [3]:
dataset3 = pd.read_csv('datasets/dataset.csv')
dataset3.head()

Unnamed: 0,disease,symptoms,cures,doctor,risk level
0,flu,"fever,cough,sore throat,runny or stuffy nose,m...","over-the-counter medications,rest,fluids","family doctor,urgent care",low (0.1%
1,bronchitis,"cough,mucus production,shortness of breath,che...","antibiotics,over-the-counter medications,rest,...","family doctor,pulmonologist",low (0.5%
2,pneumonia,"fever,cough,shortness of breath,chest pain,fat...","antibiotics,over-the-counter medications,rest,...","family doctor,pulmonologist",moderate (1%)
3,heart attack,"chest pain,shortness of breath,nausea,vomiting...",emergency medical services,cardiologist,high (20%)
4,stroke,"sudden weakness,numbness on one side of the bo...",emergency medical services,neurologist,high (15%)


In [4]:
#dropping unnecessary columns and merging the two datasets

#dropping Treatment column from d2
dataset2 = dataset2.drop(columns=['Treatments'])
dataset3 = dataset3.drop(columns=['cures', 'doctor', 'risk level'])
# Rename columns for clarity
dataset1 = dataset1.rename(columns={'label': 'Disease', 'text': 'Symptoms'})
dataset2 = dataset2.rename(columns={'Name': 'Disease'})
dataset3 = dataset3.rename(columns={'disease': 'Disease', 'symptoms': 'Symptoms'})

# Combine the datasets
dataset = pd.concat([dataset1[['Disease', 'Symptoms']], dataset2[['Disease', 'Symptoms']], dataset3[['Disease', 'Symptoms']]], ignore_index=True)
dataset.head()

Unnamed: 0,Disease,Symptoms
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [5]:
print(dataset.shape)

(1699, 2)


**Analyzing the Dataset**

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1699 entries, 0 to 1698
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Disease   1699 non-null   object
 1   Symptoms  1699 non-null   object
dtypes: object(2)
memory usage: 26.7+ KB


In [7]:
# Check for missing values
missing_values = dataset.isnull().sum()
missing_values

Disease     0
Symptoms    0
dtype: int64

In [8]:
dataset.describe().T

Unnamed: 0,count,unique,top,freq
Disease,1699,498,diabetes,52
Symptoms,1699,1627,"I've been feeling extremely scratchy, sick, an...",4


In [9]:
dataset['Disease'].value_counts()

Disease
diabetes                          52
Psoriasis                         51
Migraine                          51
Pneumonia                         51
Arthritis                         51
                                  ..
Hypothyroidism                     1
Otosclerosis                       1
Complex Regional Pain Syndrome     1
Conversion Disorder                1
lupus                              1
Name: count, Length: 498, dtype: int64

In [10]:
import string
def remove_punctuation(text):
    if isinstance(text,str):
        return text.translate(str.maketrans('','',string.punctuation))
    return text
def to_lowercase(text):
    if isinstance(text, str):
        return text.lower()
    return text
dataset['Symptoms'] = dataset['Symptoms'].apply(remove_punctuation)

dataset['Symptoms'] = dataset['Symptoms'].apply(to_lowercase)


In [12]:
import spacy
nlp = spacy.load("en_core_sci_md")
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text,ent.label_)for ent in doc.ents]
    return entities

dataset['Entities'] = dataset['Symptoms'].apply(extract_entities)
dataset.head()

  serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"])  # type: ignore[misc]


Unnamed: 0,Disease,Symptoms,Entities
0,Psoriasis,i have been experiencing a skin rash on my arm...,"[(skin rash, ENTITY), (arms legs, ENTITY), (to..."
1,Psoriasis,my skin has been peeling especially on my knee...,"[(skin, ENTITY), (peeling, ENTITY), (knees elb..."
2,Psoriasis,i have been experiencing joint pain in my fing...,"[(joint pain, ENTITY), (fingers wrists, ENTITY..."
3,Psoriasis,there is a silver like dusting on my skin espe...,"[(silver, ENTITY), (dusting, ENTITY), (skin, E..."
4,Psoriasis,my nails have small dents or pits in them and ...,"[(nails, ENTITY), (pits, ENTITY), (inflammator..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer(token_pattern = r'[a-zA-Z]+')

X = vectorizer.fit_transform(dataset['Symptoms'])

label_encoder = LabelEncoder()

y = label_encoder.fit_transform(dataset['Disease'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.6676470588235294
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       1.00      1.00      1.00        11
           9       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          29       0.00      0.00      0.00         1
          32       0.81      1.00      0.90        13
          33       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         1
          40       0.00      0.00      0.00         1
          52       0.42      1.00      0.59         5
          54       0.00      0.00      0.00         1
          55       0.00      0.00      0.00         1
          56       0.00      0.00      0.00         1
          61       0.71      1.00      0.83        1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
new_symptoms = ["I am feeling dizzy"]
new_X = vectorizer.transform(new_symptoms)

predicted_label = nb_model.predict(new_X)
predicted_disease = label_encoder.inverse_transform(predicted_label)

print(f'Predicted Disease: {predicted_disease[0]}')

Predicted Disease: Cervical spondylosis
