In [119]:
import pandas as pd
import numpy as np
import re

In [120]:
dataset=pd.read_csv('symtoms_df.csv')

In [121]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4
0,0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches
1,1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,
2,2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,
3,3,Fungal infection,itching,skin_rash,dischromic _patches,
4,4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,


In [122]:
dataset.shape

(4920, 6)

In [123]:
dataset['Symptom_4'].fillna('',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Symptom_4'].fillna('',inplace=True)


In [124]:
dataset['Symptoms']=dataset['Symptom_1']+','+dataset['Symptom_2']+','+dataset['Symptom_3']+','+dataset['Symptom_4']

In [125]:
dataset=dataset[['Symptoms','Disease']]

In [126]:
dataset['Symptoms']=dataset['Symptoms'].str.replace('_',' ')

In [127]:
print(dataset.tail(10).to_markdown())

|      | Symptoms                                                                               | Disease                                 |
|-----:|:---------------------------------------------------------------------------------------|:----------------------------------------|
| 4910 | fatigue, weight gain, cold hands and feets, mood swings                                | Hypothyroidism                          |
| 4911 | fatigue, mood swings, weight loss, restlessness                                        | Hyperthyroidism                         |
| 4912 | vomiting, fatigue, anxiety, sweating                                                   | Hypoglycemia                            |
| 4913 | joint pain, neck pain, knee pain, hip joint pain                                       | Osteoarthristis                         |
| 4914 | muscle weakness, stiff neck, swelling joints, movement stiffness                       | Arthritis                               |
| 4915 | vomiting, h

In [128]:
dataset['Disease'].nunique()

41

In [129]:
dataset.head()

Unnamed: 0,Symptoms,Disease
0,"itching, skin rash, nodal skin eruptions, disc...",Fungal infection
1,"skin rash, nodal skin eruptions, dischromic ...",Fungal infection
2,"itching, nodal skin eruptions, dischromic pat...",Fungal infection
3,"itching, skin rash, dischromic patches,",Fungal infection
4,"itching, skin rash, nodal skin eruptions,",Fungal infection


In [130]:
print(dataset.tail().to_markdown())

|      | Symptoms                                                                               | Disease                                 |
|-----:|:---------------------------------------------------------------------------------------|:----------------------------------------|
| 4915 | vomiting, headache, nausea, spinning movements                                         | (vertigo) Paroymsal  Positional Vertigo |
| 4916 | skin rash, pus filled pimples, blackheads, scurring                                    | Acne                                    |
| 4917 | burning micturition, bladder discomfort, foul smell of urine, continuous feel of urine | Urinary tract infection                 |
| 4918 | skin rash, joint pain, skin peeling, silver like dusting                               | Psoriasis                               |
| 4919 | skin rash, high fever, blister, red sore around nose                                   | Impetigo                                |


In [131]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [132]:
df = pd.DataFrame(dataset)

# Tokenize the symptom texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Symptoms'])
sequences = tokenizer.texts_to_sequences(df['Symptoms'])
word_index = tokenizer.word_index

# Pad sequences to ensure uniform input length
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

# Encode Disease labels into one-hot vectors
label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(df['Disease'])
categorical_labels = to_categorical(integer_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, categorical_labels, test_size=0.2, random_state=42)

# --- Step 2: Build the Model ---
vocab_size = len(word_index) + 1  # +1 for the padding token
embedding_dim = 100  # Larger embedding dimension for richer representation

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# --- Step 3: Train the Model ---
# Define callbacks for early stopping and model checkpointing
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint("best_model.keras", monitor='val_loss', save_best_only=True)
]

history = model.fit(
    X_train, y_train,
    epochs=50,                # Use more epochs for a real dataset
    batch_size=10,             # Adjust batch size based on dataset and resources
    validation_split=0.2,     # Reserve a portion of training data for validation
    callbacks=callbacks,
    verbose=1
)

# --- Step 4: Evaluate the Model ---
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)




Epoch 1/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.2788 - loss: 2.7744 - val_accuracy: 0.9175 - val_loss: 0.4603
Epoch 2/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9353 - loss: 0.4041 - val_accuracy: 0.9518 - val_loss: 0.2183
Epoch 3/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.9735 - loss: 0.1912 - val_accuracy: 0.9708 - val_loss: 0.1311
Epoch 4/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.9703 - loss: 0.1385 - val_accuracy: 0.9822 - val_loss: 0.0937
Epoch 5/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.9800 - loss: 0.1055 - val_accuracy: 0.9860 - val_loss: 0.0715
Epoch 6/50
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.9851 - loss: 0.0744 - val_accuracy: 0.9873 - val_loss: 0.0713
Epoch 7/50
[1m315/315

In [137]:
# --- Step 5: Predict on a Sample Input ---
# Example sample input symptoms (note: this string is preprocessed in the same way as training data)
sample_input = " urinate more often"

# Tokenize and pad the sample input
sample_seq = tokenizer.texts_to_sequences([sample_input])
sample_padded = pad_sequences(sample_seq, maxlen=max_seq_length, padding='post')

# Predict the probabilities for each disease
predictions = model.predict(sample_padded)

# Get the index of the highest probability disease
predicted_class = np.argmax(predictions, axis=1)

# Convert the predicted index back to the disease label
predicted_disease = label_encoder.inverse_transform(predicted_class)

print("Input Symptoms:", sample_input)
print("Predicted Disease:", predicted_disease[0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Input Symptoms:  urinate more often
Predicted Disease: Tuberculosis


In [138]:
print(df[df['Disease']=='Tuberculosis'].tail().to_markdown())

|      | Symptoms                               | Disease      |
|-----:|:---------------------------------------|:-------------|
| 4740 | chills, vomiting, fatigue, weight loss | Tuberculosis |
| 4781 | chills, vomiting, fatigue, weight loss | Tuberculosis |
| 4822 | chills, vomiting, fatigue, weight loss | Tuberculosis |
| 4863 | chills, vomiting, fatigue, weight loss | Tuberculosis |
| 4904 | chills, vomiting, fatigue, weight loss | Tuberculosis |
