In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from tqdm import tqdm


In [2]:
# Read the input file.
# The file starts with a UTF-8 byte-order mark (it shows up as "\ufeff" at the
# start of the decoded text). Decoding with "utf-8-sig" drops that mark and
# makes the result independent of the platform's default encoding.
with open("/content/draft-1  (3).txt", "r", encoding="utf-8-sig") as file:
    data = file.read()


In [3]:
# Display the raw file text to inspect the entry layout before it is parsed.
data

"\ufeffHere are the list of diseases\n\n\n\n\nSymptoms: Cold, Shivering,Headache\nDisease: Fever\nRemedy: Take rest ,Have warm clothes on.\n\n\n\n\nSymptoms: Abdominal pain, nausea, vomiting, fever\n\n\nDisease: Acute pancreatitis\n\n\nRemedy: Fasting and sipping clear liquids can ease strain on the pancreas. Applying a heating pad to the abdomen may help relieve pain. Avoiding alcohol and following a low-fat diet can reduce inflammation.\n\n\nSymptoms: Heavy menstrual bleeding, pelvic pain, painful periods\n\n\nDisease: Adenomyosis\n\n\nRemedy: Applying a warm compress to the lower abdomen can alleviate pain. Relaxation techniques like deep breathing or meditation may also help manage discomfort. Consuming foods rich in omega-3 fatty acids, such as salmon or flaxseeds, may reduce inflammation.\n\n\nSymptoms: Fatigue, weight loss, low blood pressure\n\n\nDisease: Addison’s Disease:\n\n\nRemedy: Maintaining a high-salt diet and staying hydrated are key to managing symptoms. Carrying an 

In [4]:
# Parallel accumulators for the parsed records: symptoms[i] belongs to diseases[i].
symptoms, diseases = [], []

In [5]:
# Parser state: the most recently read "Symptoms:" / "Disease:" values (none yet).
current_symptoms = current_disease = None

In [6]:
# Pair every "Symptoms:" line with the "Disease:" line that follows it.
#
# * The prefix is sliced off and the remainder stripped -- the same way
#   read_remedies() parses this file -- so stray whitespace or a missing space
#   after the colon cannot make a disease label differ from the key that is
#   later used for the remedy lookup.
# * Only complete pairs are kept. A disease without symptoms (or symptoms with
#   an empty disease name) cannot serve as a training example, so it is skipped
#   instead of being stored with a None / empty field.
# * The result lists are rebuilt here so that re-running this cell does not
#   append every record a second time.
symptoms = []
diseases = []
current_symptoms = None
current_disease = None

for line in data.split('\n'):
    line = line.strip()
    if line.startswith('Symptoms:'):
        current_symptoms = line[len('Symptoms:'):].strip()
    elif line.startswith('Disease:'):
        current_disease = line[len('Disease:'):].strip()
        if current_symptoms and current_disease:
            symptoms.append(current_symptoms)
            diseases.append(current_disease)
        # A "Disease:" line always closes the current entry, complete or not,
        # so stale symptoms never leak into the next record.
        current_symptoms = None
        current_disease = None

In [7]:
# Combine the two parallel lists into a two-column table, one row per record.
columns = {'Symptoms': symptoms, 'Disease': diseases}
df = pd.DataFrame(columns)

In [8]:
# Discard every row in which at least one column is missing.
df = df.dropna(axis=0, how='any')

In [9]:
# Display the parsed symptom/disease table.
df

Unnamed: 0,Symptoms,Disease
0,"Cold, Shivering,Headache",Fever
1,"Abdominal pain, nausea, vomiting, fever",Acute pancreatitis
2,"Heavy menstrual bleeding, pelvic pain, painful...",Adenomyosis
3,"Fatigue, weight loss, low blood pressure",Addison’s Disease:
4,"Jaundice, abdominal pain, fatigue",Alcohol-related liver
...,...,...
96,Symptoms of acute lymphoblastic leukaemia (AL...,Acute Lymphoblastic Leukaemia (ALL)
97,Symptoms of acute lymphoblastic leukaemia in ...,Acute Lymphoblastic Leukaemia (ALL) in Children
98,Symptoms of acute lymphoblastic leukaemia in ...,Acute Lymphoblastic Leukaemia (ALL) in Teenage...
99,Symptoms of acute myeloid leukaemia may inclu...,Acute Myeloid Leukaemia (AML)


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [10]:
# Learn the disease-name -> integer-id mapping, then store the ids next to the names.
# The fitted encoder is kept because prediction later maps ids back to names.
label_encoder = LabelEncoder().fit(df['Disease'])
df['EncodedDisease'] = label_encoder.transform(df['Disease'])

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the table preview shows about 100 rows with distinct disease
# names. If each disease occurs only once, the held-out rows belong to classes
# the model never trains on -- confirm before trusting any test-set metric.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Load the WordPiece tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
# Tokenize the symptom text of each split into PyTorch tensors. Over-long inputs
# are truncated and each call pads its own batch to a common length.
train_encodings = tokenizer(
    train_df["Symptoms"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)
test_encodings = tokenizer(
    test_df["Symptoms"].tolist(),
    padding=True,
    truncation=True,
    return_tensors="pt",
)


In [14]:
# Bundle token ids, attention masks and class ids into one dataset, then serve
# it in shuffled mini-batches of 4 examples.
train_dataset = TensorDataset(
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
    torch.tensor(train_df["EncodedDisease"].tolist()),
)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)


In [15]:
# Pre-trained BERT encoder plus a classification head with one output per
# distinct disease name in the table.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=df["Disease"].nunique(),
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Loss function and optimizer for fine-tuning.
# `criterion` is created here, but the training loop in this notebook reads the
# loss from the model output (it passes `labels` to the model) instead.
criterion = CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=2e-5)

In [17]:
# Fine-tune the classifier for a fixed number of passes over the training data.
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # loss summed over the batches of this epoch

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Each batch follows the TensorDataset field order.
        input_ids, attention_mask, labels = batch[0], batch[1], batch[2]

        optimizer.zero_grad()
        # Passing `labels` makes the model return the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the sum over batches, not a per-batch average.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}")

Epoch 1/50: 100%|██████████| 20/20 [00:56<00:00,  2.83s/it]


Epoch 1/50, Loss: 94.02436065673828


Epoch 2/50: 100%|██████████| 20/20 [00:47<00:00,  2.35s/it]


Epoch 2/50, Loss: 92.68683862686157


Epoch 3/50: 100%|██████████| 20/20 [00:45<00:00,  2.29s/it]


Epoch 3/50, Loss: 89.3231029510498


Epoch 4/50: 100%|██████████| 20/20 [00:46<00:00,  2.31s/it]


Epoch 4/50, Loss: 86.81838655471802


Epoch 5/50: 100%|██████████| 20/20 [00:52<00:00,  2.63s/it]


Epoch 5/50, Loss: 85.690669298172


Epoch 6/50: 100%|██████████| 20/20 [00:51<00:00,  2.58s/it]


Epoch 6/50, Loss: 85.38630437850952


Epoch 7/50: 100%|██████████| 20/20 [00:55<00:00,  2.78s/it]


Epoch 7/50, Loss: 82.74191117286682


Epoch 8/50: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


Epoch 8/50, Loss: 80.06026983261108


Epoch 9/50: 100%|██████████| 20/20 [00:49<00:00,  2.45s/it]


Epoch 9/50, Loss: 80.17901921272278


Epoch 10/50: 100%|██████████| 20/20 [00:46<00:00,  2.34s/it]


Epoch 10/50, Loss: 78.65707492828369


Epoch 11/50: 100%|██████████| 20/20 [00:45<00:00,  2.29s/it]


Epoch 11/50, Loss: 75.60424566268921


Epoch 12/50: 100%|██████████| 20/20 [00:47<00:00,  2.35s/it]


Epoch 12/50, Loss: 73.78282833099365


Epoch 13/50: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


Epoch 13/50, Loss: 72.53190970420837


Epoch 14/50: 100%|██████████| 20/20 [00:46<00:00,  2.31s/it]


Epoch 14/50, Loss: 70.27666759490967


Epoch 15/50: 100%|██████████| 20/20 [00:46<00:00,  2.34s/it]


Epoch 15/50, Loss: 68.64903497695923


Epoch 16/50: 100%|██████████| 20/20 [00:45<00:00,  2.25s/it]


Epoch 16/50, Loss: 65.72424602508545


Epoch 17/50: 100%|██████████| 20/20 [00:46<00:00,  2.31s/it]


Epoch 17/50, Loss: 65.4462537765503


Epoch 18/50: 100%|██████████| 20/20 [00:46<00:00,  2.34s/it]


Epoch 18/50, Loss: 63.22692799568176


Epoch 19/50: 100%|██████████| 20/20 [00:50<00:00,  2.52s/it]


Epoch 19/50, Loss: 62.3571891784668


Epoch 20/50: 100%|██████████| 20/20 [00:44<00:00,  2.24s/it]


Epoch 20/50, Loss: 59.68550443649292


Epoch 21/50: 100%|██████████| 20/20 [00:46<00:00,  2.30s/it]


Epoch 21/50, Loss: 58.27664375305176


Epoch 22/50: 100%|██████████| 20/20 [00:44<00:00,  2.23s/it]


Epoch 22/50, Loss: 56.01662468910217


Epoch 23/50: 100%|██████████| 20/20 [00:45<00:00,  2.28s/it]


Epoch 23/50, Loss: 54.896607637405396


Epoch 24/50: 100%|██████████| 20/20 [00:44<00:00,  2.24s/it]


Epoch 24/50, Loss: 53.17771053314209


Epoch 25/50: 100%|██████████| 20/20 [00:44<00:00,  2.22s/it]


Epoch 25/50, Loss: 52.098512172698975


Epoch 26/50: 100%|██████████| 20/20 [00:45<00:00,  2.29s/it]


Epoch 26/50, Loss: 50.15094327926636


Epoch 27/50: 100%|██████████| 20/20 [00:45<00:00,  2.30s/it]


Epoch 27/50, Loss: 49.29343628883362


Epoch 28/50: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


Epoch 28/50, Loss: 47.62171804904938


Epoch 29/50: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


Epoch 29/50, Loss: 45.944549322128296


Epoch 30/50: 100%|██████████| 20/20 [00:45<00:00,  2.28s/it]


Epoch 30/50, Loss: 44.0393682718277


Epoch 31/50: 100%|██████████| 20/20 [00:44<00:00,  2.21s/it]


Epoch 31/50, Loss: 43.08841371536255


Epoch 32/50: 100%|██████████| 20/20 [00:44<00:00,  2.21s/it]


Epoch 32/50, Loss: 41.31469225883484


Epoch 33/50: 100%|██████████| 20/20 [00:46<00:00,  2.32s/it]


Epoch 33/50, Loss: 39.96432602405548


Epoch 34/50: 100%|██████████| 20/20 [00:45<00:00,  2.28s/it]


Epoch 34/50, Loss: 38.24849331378937


Epoch 35/50: 100%|██████████| 20/20 [00:44<00:00,  2.23s/it]


Epoch 35/50, Loss: 37.53143322467804


Epoch 36/50: 100%|██████████| 20/20 [00:46<00:00,  2.32s/it]


Epoch 36/50, Loss: 36.325223445892334


Epoch 37/50: 100%|██████████| 20/20 [00:44<00:00,  2.25s/it]


Epoch 37/50, Loss: 34.97196638584137


Epoch 38/50: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


Epoch 38/50, Loss: 33.216177463531494


Epoch 39/50: 100%|██████████| 20/20 [00:46<00:00,  2.34s/it]


Epoch 39/50, Loss: 32.680559396743774


Epoch 40/50: 100%|██████████| 20/20 [00:45<00:00,  2.26s/it]


Epoch 40/50, Loss: 31.42897069454193


Epoch 41/50: 100%|██████████| 20/20 [00:45<00:00,  2.26s/it]


Epoch 41/50, Loss: 29.892828583717346


Epoch 42/50: 100%|██████████| 20/20 [00:46<00:00,  2.34s/it]


Epoch 42/50, Loss: 29.424392104148865


Epoch 43/50: 100%|██████████| 20/20 [00:44<00:00,  2.23s/it]


Epoch 43/50, Loss: 28.241414546966553


Epoch 44/50: 100%|██████████| 20/20 [00:46<00:00,  2.31s/it]


Epoch 44/50, Loss: 27.34312617778778


Epoch 45/50: 100%|██████████| 20/20 [00:45<00:00,  2.27s/it]


Epoch 45/50, Loss: 26.244281947612762


Epoch 46/50: 100%|██████████| 20/20 [00:44<00:00,  2.22s/it]


Epoch 46/50, Loss: 25.539703905582428


Epoch 47/50: 100%|██████████| 20/20 [00:47<00:00,  2.36s/it]


Epoch 47/50, Loss: 23.830095052719116


Epoch 48/50: 100%|██████████| 20/20 [00:45<00:00,  2.28s/it]


Epoch 48/50, Loss: 23.29699218273163


Epoch 49/50: 100%|██████████| 20/20 [00:44<00:00,  2.23s/it]


Epoch 49/50, Loss: 22.776617348194122


Epoch 50/50: 100%|██████████| 20/20 [00:46<00:00,  2.32s/it]

Epoch 50/50, Loss: 22.028889894485474





In [18]:
# Write the fine-tuned model to a local directory so it can be reloaded later.
# Only the model is saved here; the fitted label encoder used to decode
# predictions is not persisted by this cell.
model.save_pretrained(save_directory="fine_tuned_bert_disease_model")

In [19]:
### prediction ###
import torch

# Reload the fine-tuned model from the directory it was saved to.
# Variant without the explicit label count, kept for reference:
# model = BertForSequenceClassification.from_pretrained("fine_tuned_bert_disease_model")
model = BertForSequenceClassification.from_pretrained(
    "fine_tuned_bert_disease_model",
    num_labels=df["Disease"].nunique(),
)


In [20]:
# Encode a placeholder symptom string with the base tokenizer.
user_input_symptoms = "Your input symptoms here"  # Replace with the user's symptoms
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
input_encoding = tokenizer(
    user_input_symptoms,
    padding=True,
    truncation=True,
    return_tensors="pt",
)


In [30]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Restore the fine-tuned classifier and the base tokenizer it was trained with.
model = BertForSequenceClassification.from_pretrained("fine_tuned_bert_disease_model")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Ask for the symptoms and encode them as a single-example batch.
user_input_symptoms = input("Enter the symptoms: ")
input_encoding = tokenizer(user_input_symptoms, truncation=True, padding=True, return_tensors="pt")

# Inference only: evaluation mode, no gradient tracking.
model.eval()
with torch.no_grad():
    output = model(**input_encoding)
# Index of the highest-scoring class for the single input.
predicted_label = torch.argmax(output.logits).item()

# Map the class id back to the disease name. This relies on `label_encoder`
# from the label-encoding cell still being in memory.
predicted_disease = label_encoder.inverse_transform([predicted_label])[0]

print(f"Predicted Disease: {predicted_disease}")


Enter the symptoms: mood swings, difficulty sleeping
Predicted Disease: Bipolar disorder


In [31]:
# Display the disease name produced by the prediction cell.
predicted_disease

'Bipolar disorder'

In [32]:
## getting the remedy
# Read remedies from text file and create a mapping
def read_remedies(filename):
    """Parse the symptoms/disease/remedy text file into a lookup table.

    The file is scanned line by line for lines starting with "Symptoms:",
    "Disease:" and "Remedy:". An entry is stored when a "Remedy:" line is
    reached and all three fields collected so far are non-empty. If a disease
    name occurs more than once, the last complete entry wins.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping each disease name to {"Symptoms": str, "Remedy": str}.
    """
    disease_remedies = {}
    # "utf-8-sig" decodes the file as UTF-8 regardless of the platform default
    # and drops a leading byte-order mark. Without it the mark stays glued to
    # the first line, so a first-line "Symptoms:" prefix is not recognised.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        symptoms = None
        disease = None
        remedy = None
        for line in file:
            line = line.strip()
            if line.startswith("Symptoms:"):
                symptoms = line[len("Symptoms:"):].strip()
            elif line.startswith("Disease:"):
                disease = line[len("Disease:"):].strip()
            elif line.startswith("Remedy:"):
                remedy = line[len("Remedy:"):].strip()
                if symptoms and disease and remedy:
                    disease_remedies[disease] = {
                        "Symptoms": symptoms,
                        "Remedy": remedy
                    }
                    # Reset the collected fields for the next entry.
                    symptoms = None
                    disease = None
                    remedy = None
    return disease_remedies

# Example function to get remedy for predicted disease
def get_remedy(predict_disease, disease_remedies):
    """Return the remedy text stored for ``predict_disease``.

    ``disease_remedies`` maps disease names to dicts that have a "Remedy" key.
    When the disease has no entry, the string "Remedy not found" is returned.
    """
    if predict_disease in disease_remedies:
        return disease_remedies[predict_disease]["Remedy"]
    return "Remedy not found"

# Build the disease -> remedy lookup from the same text file used for training.
filename = '/content/draft-1  (3).txt'  # Replace with the path to your text file
disease_remedies = read_remedies(filename)

# Look up and show the remedy for the disease predicted by the model.
predict_disease = predicted_disease
remedy = get_remedy(predict_disease, disease_remedies)
print("Remedy for", predict_disease, "is:", remedy)


Remedy for Bipolar disorder is: Establishing a regular sleep schedule and engaging in daily exercise can help stabilize mood. Mindfulness-based practices like meditation or deep breathing exercises may also provide relief. Avoiding alcohol and caffeine is important, as they can trigger mood swings.


In [33]:
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.3.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m

In [35]:
### translator
from googletrans import Translator

# Read remedies from text file and create a mapping
def read_remedies(filename):
    """Parse the symptoms/disease/remedy text file into a lookup table.

    This repeats the definition from the earlier remedy-lookup cell and
    replaces it when this cell runs.

    The file is scanned line by line for lines starting with "Symptoms:",
    "Disease:" and "Remedy:". An entry is stored when a "Remedy:" line is
    reached and all three fields collected so far are non-empty. If a disease
    name occurs more than once, the last complete entry wins.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping each disease name to {"Symptoms": str, "Remedy": str}.
    """
    disease_remedies = {}
    # "utf-8-sig" decodes the file as UTF-8 regardless of the platform default
    # and drops a leading byte-order mark. Without it the mark stays glued to
    # the first line, so a first-line "Symptoms:" prefix is not recognised.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        symptoms = None
        disease = None
        remedy = None
        for line in file:
            line = line.strip()
            if line.startswith("Symptoms:"):
                symptoms = line[len("Symptoms:"):].strip()
            elif line.startswith("Disease:"):
                disease = line[len("Disease:"):].strip()
            elif line.startswith("Remedy:"):
                remedy = line[len("Remedy:"):].strip()
                if symptoms and disease and remedy:
                    disease_remedies[disease] = {
                        "Symptoms": symptoms,
                        "Remedy": remedy
                    }
                    # Reset the collected fields for the next entry.
                    symptoms = None
                    disease = None
                    remedy = None
    return disease_remedies

# Example function to get remedy for predicted disease
def get_remedy(predicted_disease, disease_remedies):
    """Return the remedy text stored for ``predicted_disease``.

    ``disease_remedies`` maps disease names to dicts that have a "Remedy" key.
    When the disease has no entry, the string "Remedy not found" is returned.
    """
    if predicted_disease in disease_remedies:
        return disease_remedies[predicted_disease]["Remedy"]
    return "Remedy not found"

# Translate text to Spanish
def translate_to_spanish(text):
    """Translate ``text`` from English to Spanish with googletrans.

    A new ``Translator`` is created for every call; the translated string is
    returned.
    """
    return Translator().translate(text, src='en', dest='es').text

# Build the disease -> remedy lookup from the text file.
filename = '/content/draft-1  (3).txt'  # Replace with the path to your text file
disease_remedies = read_remedies(filename)

# `predicted_disease` comes from the prediction cell; uncomment to try a fixed value.
# predicted_disease = "Fever"

# Remedy text for the predicted disease.
remedy = get_remedy(predicted_disease, disease_remedies)

# The English values are the untranslated originals; the Spanish ones are
# translated, disease name first and remedy second.
translated_disease_en, translated_remedy_en = predicted_disease, remedy
translated_disease_es = translate_to_spanish(predicted_disease)
translated_remedy_es = translate_to_spanish(remedy)

# Show the result in both languages.
print("Remedy for", translated_disease_en, "in English is:", translated_remedy_en)
print("Remedy for", translated_disease_es, "in Spanish is:", translated_remedy_es)


Remedy for Bipolar disorder in English is: Establishing a regular sleep schedule and engaging in daily exercise can help stabilize mood. Mindfulness-based practices like meditation or deep breathing exercises may also provide relief. Avoiding alcohol and caffeine is important, as they can trigger mood swings.
Remedy for Trastorno bipolar in Spanish is: Establecer un horario de sueño regular y participar en el ejercicio diario puede ayudar a estabilizar el estado de ánimo.Las prácticas basadas en la atención plena como la meditación o los ejercicios de respiración profunda también pueden proporcionar alivio.Evitar el alcohol y la cafeína es importante, ya que pueden provocar cambios de humor.


In [None]:
# Leftover cell that was never executed; it only prints an empty line.
print()

In [36]:
# # Assuming you already have a test_df DataFrame and tokenizer loaded

# # Tokenize the test data
# test_encodings = tokenizer(list(test_df["Symptoms"]), truncation=True, padding=True, return_tensors="pt")

# # Create DataLoader for the test set
# test_dataset = TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"],
#                               torch.tensor(list(test_df["EncodedDisease"])))
# test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# # Evaluate the model on the test set
# model.eval()
# total_correct = 0
# total_samples = 0

# with torch.no_grad():
#     for batch in tqdm(test_dataloader, desc="Evaluating"):
#         outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
#         predicted_labels = torch.argmax(outputs.logits, dim=1)
#         total_correct += (predicted_labels == batch[2]).sum().item()
#         total_samples += len(batch[2])

# # Calculate accuracy
# accuracy = total_correct / total_samples
# print(f"Accuracy on the test set: {accuracy * 100:.2f}%")
