***Model Training For Symptom to Disease Prediction***

**Loading the dataset**

In [1]:
import pandas as pd

dataset1 = pd.read_csv('datasets/Symptom2Disease.csv')
dataset1.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [2]:
dataset4 = pd.read_csv('datasets/DiseaseAndSymp.csv')
dataset4.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [3]:
#cleaning and merging the symptoms columns for dataset4
dataset4['Symptoms'] = dataset4[['Symptom_1', 'Symptom_2', 'Symptom_3', 
                                 'Symptom_4','Symptom_5','Symptom_6',
                                 'Symptom_7','Symptom_8','Symptom_9',
                                 'Symptom_10','Symptom_11','Symptom_12',
                                 'Symptom_13','Symptom_14','Symptom_15',
                                 'Symptom_16','Symptom_17']].fillna('').agg(', '.join, axis=1).str.strip()
dataset4.drop(['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4','Symptom_5','Symptom_6',
               'Symptom_7','Symptom_8','Symptom_9','Symptom_10','Symptom_11','Symptom_12',
               'Symptom_13','Symptom_14','Symptom_15','Symptom_16','Symptom_17'], axis=1, inplace=True)
dataset4['Symptoms'] = dataset4['Symptoms'].str.replace('_', ' ', regex=True)
dataset4.head()

Unnamed: 0,Disease,Symptoms
0,Fungal infection,"itching, skin rash, nodal skin eruptions, d..."
1,Fungal infection,"skin rash, nodal skin eruptions, dischromic ..."
2,Fungal infection,"itching, nodal skin eruptions, dischromic p..."
3,Fungal infection,"itching, skin rash, dischromic patches, , ,..."
4,Fungal infection,"itching, skin rash, nodal skin eruptions, , ..."


In [4]:

# Rename columns for clarity
dataset1 = dataset1.rename(columns={'label': 'Disease', 'text': 'Symptoms'})


# Combine the datasets
dataset = pd.concat([dataset1[['Disease', 'Symptoms']], dataset4[['Disease', 'Symptoms']]], ignore_index=True)
dataset.head()

Unnamed: 0,Disease,Symptoms
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [5]:
print(dataset.shape)

(6120, 2)


**Analyzing the Dataset**

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6120 entries, 0 to 6119
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Disease   6120 non-null   object
 1   Symptoms  6120 non-null   object
dtypes: object(2)
memory usage: 95.8+ KB


In [7]:
# Check for missing values
missing_values = dataset.isnull().sum()
missing_values

Disease     0
Symptoms    0
dtype: int64

In [8]:
dataset.describe().T

Unnamed: 0,count,unique,top,freq
Disease,6120,50,Psoriasis,170
Symptoms,6120,1457,"muscle weakness, stiff neck, swelling joints...",90


In [9]:
dataset['Disease'].value_counts()

Disease
Psoriasis                                  170
Pneumonia                                  170
Malaria                                    170
Cervical spondylosis                       170
Migraine                                   170
Bronchial Asthma                           170
Acne                                       170
Arthritis                                  170
Jaundice                                   170
Fungal infection                           170
Dengue                                     170
Impetigo                                   170
Chicken pox                                170
Typhoid                                    170
Common Cold                                170
(vertigo) Paroymsal  Positional Vertigo    120
Tuberculosis                               120
hepatitis A                                120
Hepatitis B                                120
Hepatitis C                                120
Hepatitis D                                120
Hepat

In [10]:
import string
def remove_punctuation(text):
    if isinstance(text,str):
        return text.translate(str.maketrans('','',string.punctuation))
    return text
def to_lowercase(text):
    if isinstance(text, str):
        return text.lower()
    return text
dataset['Symptoms'] = dataset['Symptoms'].apply(remove_punctuation)

dataset['Symptoms'] = dataset['Symptoms'].apply(to_lowercase)


In [11]:
import spacy
nlp = spacy.load("en_core_sci_md")
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text,ent.label_)for ent in doc.ents]
    return entities

dataset['Entities'] = dataset['Symptoms'].apply(extract_entities)
dataset.head()

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Unnamed: 0,Disease,Symptoms,Entities
0,Psoriasis,i have been experiencing a skin rash on my arm...,"[(skin rash, ENTITY), (arms legs, ENTITY), (to..."
1,Psoriasis,my skin has been peeling especially on my knee...,"[(skin, ENTITY), (peeling, ENTITY), (knees elb..."
2,Psoriasis,i have been experiencing joint pain in my fing...,"[(joint pain, ENTITY), (fingers wrists, ENTITY..."
3,Psoriasis,there is a silver like dusting on my skin espe...,"[(silver, ENTITY), (dusting, ENTITY), (skin, E..."
4,Psoriasis,my nails have small dents or pits in them and ...,"[(nails, ENTITY), (pits, ENTITY), (inflammator..."


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer(token_pattern = r'[a-zA-Z]+')

X = vectorizer.fit_transform(dataset['Entities'].apply(str))
 
label_encoder = LabelEncoder()

y = label_encoder.fit_transform(dataset['Disease'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9256535947712419
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        22
           2       0.97      1.00      0.98        29
           3       1.00      1.00      1.00        22
           4       0.97      1.00      0.98        31
           5       0.95      1.00      0.97        37
           6       0.86      1.00      0.93        32
           7       0.83      1.00      0.91        35
           8       0.84      0.91      0.87        34
           9       1.00      1.00      1.00        21
          10       0.72      1.00      0.84        29
          11       0.82      0.90      0.86        30
          12       0.96      1.00      0.98        22
          13       1.00      0.42      0.59        12
          14       0.82      1.00      0.90        32
          15       1.00      1.00      1.00        23
          16       1.00      0.93      0.97        4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
import numpy as np

new_symptoms =['I am feeling dizzy, headache, sweating']
new_X = vectorizer.transform(new_symptoms)

predicted_disease = label_encoder.inverse_transform(nb_model.predict(new_X))
# Get the probabilities for each class
probabilities = nb_model.predict_proba(new_X)

# Get the indices of the top 3 probabilities
top_3_indices = np.argsort(probabilities, axis=1)[:,-3:]

# Get the corresponding disease names
top_3_diseases = label_encoder.inverse_transform(top_3_indices[0])

print(f'Top 3 predicted diseases: {top_3_diseases}')
print(f'disease: {predicted_disease}')

Top 3 predicted diseases: ['Hypertension' 'Cervical spondylosis' 'Malaria']
disease: ['Malaria']


In [14]:
from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.988562091503268
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        22
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        22
           4       1.00      1.00      1.00        31
           5       1.00      1.00      1.00        37
           6       1.00      1.00      1.00        32
           7       1.00      1.00      1.00        35
           8       0.91      0.91      0.91        34
           9       1.00      1.00      1.00        21
          10       1.00      1.00      1.00        29
          11       0.94      0.97      0.95        30
          12       1.00      1.00      1.00        22
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00        32
          15       1.00      1.00      1.00        23
          16       1.00      1.00      1.00        45

In [15]:
new_symptoms =['I am feeling dizzy, headache, sweating']
new_X = vectorizer.transform(new_symptoms)
predicted_disease = label_encoder.inverse_transform(svm_model.predict(new_X))
# Get the probabilities for each class
probabilities = svm_model.decision_function(new_X)

# Get the indices of the top 3 probabilities
top_3_indices = np.argsort(probabilities, axis=1)[:,-3:]

# Get the corresponding disease names
top_3_diseases = label_encoder.inverse_transform(top_3_indices[0])

print(f'Top 3 predicted diseases: {top_3_diseases}')
print(f'disease: {predicted_disease}')

Top 3 predicted diseases: ['diabetes' 'drug reaction' 'Hypertension']
disease: ['Hypertension']


In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9656862745098039
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        22
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        22
           4       1.00      1.00      1.00        31
           5       0.93      1.00      0.96        37
           6       0.94      1.00      0.97        32
           7       0.97      1.00      0.99        35
           8       0.84      0.91      0.87        34
           9       1.00      1.00      1.00        21
          10       0.88      0.97      0.92        29
          11       0.80      0.80      0.80        30
          12       1.00      1.00      1.00        22
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00        32
          15       1.00      1.00      1.00        23
          16       1.00      1.00      1.00        4

In [17]:
new_symptoms =['I am feeling dizzy, headache, sweating']
new_X = vectorizer.transform(new_symptoms)
predicted_disease = label_encoder.inverse_transform(knn_model.predict(new_X))
# Get the probabilities for each class
probabilities = knn_model.predict_proba(new_X)

# Get the indices of the top 3 probabilities
top_3_indices = np.argsort(probabilities, axis=1)[:,-3:]

# Get the corresponding disease names
top_3_diseases = label_encoder.inverse_transform(top_3_indices[0])

print(f'Top 3 predicted diseases: {top_3_diseases}')
print(f'disease: {predicted_disease}')

Top 3 predicted diseases: ['Diabetes ' 'Dengue' 'Malaria']
disease: ['Malaria']


In [18]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.9869281045751634
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        22
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        22
           4       1.00      1.00      1.00        31
           5       0.97      1.00      0.99        37
           6       1.00      1.00      1.00        32
           7       0.97      1.00      0.99        35
           8       0.89      0.94      0.91        34
           9       1.00      1.00      1.00        21
          10       1.00      1.00      1.00        29
          11       1.00      0.83      0.91        30
          12       1.00      1.00      1.00        22
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00        32
          15       1.00      1.00      1.00        23
          16       1.00      1.00      1.00        4

In [19]:
new_symptoms =['I am feeling dizzy, headache, sweating']
new_X = vectorizer.transform(new_symptoms)
predicted_disease = label_encoder.inverse_transform(rf_model.predict(new_X))
# Get the probabilities for each class
probabilities = rf_model.predict_proba(new_X)

# Get the indices of the top 3 probabilities
top_3_indices = np.argsort(probabilities, axis=1)[:,-3:]

# Get the corresponding disease names
top_3_diseases = label_encoder.inverse_transform(top_3_indices[0])

print(f'Top 3 predicted diseases: {top_3_diseases}')
print(f'disease: {predicted_disease}')

Top 3 predicted diseases: ['diabetes' 'Malaria' 'Hypertension']
disease: ['Hypertension']


In [20]:
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier()
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.988562091503268
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        22
           2       1.00      1.00      1.00        29
           3       1.00      1.00      1.00        22
           4       1.00      1.00      1.00        31
           5       1.00      1.00      1.00        37
           6       1.00      1.00      1.00        32
           7       1.00      1.00      1.00        35
           8       0.89      0.91      0.90        34
           9       1.00      1.00      1.00        21
          10       0.97      1.00      0.98        29
          11       0.93      0.93      0.93        30
          12       1.00      1.00      1.00        22
          13       1.00      1.00      1.00        12
          14       1.00      1.00      1.00        32
          15       1.00      1.00      1.00        23
          16       1.00      1.00      1.00        45

In [21]:
new_symptoms =['I am feeling dizzy, headache, sweating']
new_X = vectorizer.transform(new_symptoms)
predicted_disease = label_encoder.inverse_transform(mlp_model.predict(new_X))
# Get the probabilities for each class
probabilities = mlp_model.predict_proba(new_X)

# Get the indices of the top 3 probabilities
top_3_indices = np.argsort(probabilities, axis=1)[:,-3:]

# Get the corresponding disease names
top_3_diseases = label_encoder.inverse_transform(top_3_indices[0])

print(f'Top 3 predicted diseases: {top_3_diseases}')
print(f'disease: {predicted_disease}')

Top 3 predicted diseases: ['drug reaction' 'Hypertension' 'Malaria']
disease: ['Malaria']


In [None]:
import pickle

# Save the model and vectorizer
with open("best_model.pkl", "wb") as f:
    pickle.dump({"model": rf_model, "vectorizer": vectorizer, "label_encoder": label_encoder}, f)

print("Model saved successfully!")


Model saved successfully!
