Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

Preprocessing

In [None]:
# Path of the raw conversation dataset (columns shown below: `conversations`, `disease`).
file_path = 'medical_conversations.csv'

# Read the CSV into the notebook-wide `df` variable.
df = pd.read_csv(file_path)

# Preview the first rows as plain text.
print(df.head())

                                       conversations  disease
0  User: I’ve been sneezing a lot today and my no...  allergy
1  User: I’ve developed a rash after eating some ...  allergy
2  User: My eyes are swollen and itchy, and I can...  allergy
3  User: I’ve been getting headaches and a stuffy...  allergy
4  User: Every time I eat nuts, my mouth itches. ...  allergy


In [None]:
# Reload the raw conversations CSV into `df` before stripping the bot turns.
df = pd.read_csv("medical_conversations.csv")

def extract_user_statements(text):
    """Keep only the user's turns from a ``</s>``-separated conversation.

    Parameters
    ----------
    text : str
        Conversation whose turns are separated by ``</s>`` and where the
        user's turns start with ``User:``.

    Returns
    -------
    str
        The user's turns, without the leading ``User:`` tag, joined by
        ``" </s> "``. Returns ``""`` when there is no user turn or when
        ``text`` is not a string (e.g. a NaN read from an empty CSV field).
    """
    if not isinstance(text, str):
        return ""

    speaker_tag = "User:"
    user_responses = []

    # Split on the bare separator and strip each turn so that missing or
    # extra whitespace around "</s>" does not hide a turn.
    for exchange in text.split("</s>"):
        exchange = exchange.strip()
        if exchange.startswith(speaker_tag):
            # Drop only the leading tag; a literal "User: " later in the
            # sentence is part of what the user said and must be kept.
            user_responses.append(exchange[len(speaker_tag):].strip())

    return " </s> ".join(user_responses)

# Build the `User` column by applying extract_user_statements to every conversation.
df["User"] = df["conversations"].apply(extract_user_statements)

# Keep only the user text and the disease label; `df` is rebound to this two-column frame.
df = df[["User", "disease"]]

# Persist the result without the index column.
df.to_csv("user_only_medical_conversations.csv", index=False)

print("Bot responses removed. Cleaned dataset saved as 'user_only_medical_conversations.csv'.")


Bot responses removed. Cleaned dataset saved as 'user_only_medical_conversations.csv'.


In [None]:
# Path of the tabular symptom / patient-profile dataset.
file_path = 'Disease_symptom_and_patient_profile_dataset.csv'

# Note: this rebinds the notebook-wide `df` to the second dataset.
df = pd.read_csv(file_path)

# Preview the first rows as plain text.
print(df.head())

       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  


In [None]:
def check_missing_values(file_path):
    """Report, on stdout, which columns of a CSV file contain missing values.

    The file at ``file_path`` is read with ``pd.read_csv``. If reading fails,
    the error is printed and the function returns without raising. Otherwise
    either the per-column null counts (only columns with at least one null)
    or the message "No missing values found." is printed.

    Returns ``None`` in every case.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading file: {e}")
        return

    null_counts = frame.isnull().sum()
    affected = null_counts[null_counts > 0]

    # Guard clause for the clean case; the report follows otherwise.
    if affected.empty:
        print("No missing values found.")
        return

    print("Columns with missing values:")
    print(affected)


file_path = 'user_only_medical_conversations.csv'
check_missing_values(file_path)

No missing values found.


In [None]:
# Run the same missing-value report on the symptom / patient-profile dataset.
file_path = 'Disease_symptom_and_patient_profile_dataset.csv'
check_missing_values(file_path)

No missing values found.


In [None]:
def check_duplicates(file_path):
    """Print how many rows of the CSV at ``file_path`` repeat an earlier row.

    The count comes from ``DataFrame.duplicated()``, i.e. rows that are an
    exact repeat (across all columns) of a row that appeared before them.
    Unlike ``check_missing_values``, a read error is not caught here and
    propagates to the caller.
    """
    frame = pd.read_csv(file_path)
    duplicate_count = frame.duplicated().sum()

    print(f"Number of duplicate rows: {duplicate_count}")


In [None]:
# Count exact duplicate rows in the user-only conversation dataset.
file_path = 'user_only_medical_conversations.csv'
check_duplicates(file_path)


Number of duplicate rows: 0


In [None]:
# Count exact duplicate rows in the symptom / patient-profile dataset.
file_path = 'Disease_symptom_and_patient_profile_dataset.csv'
check_duplicates(file_path)

Number of duplicate rows: 49


In [None]:
def remove_duplicates(file_path, output_path):
    """Copy a CSV file to ``output_path`` with exact duplicate rows dropped.

    The first occurrence of each row is kept. The output is written without
    an index column, and a confirmation message naming ``output_path`` is
    printed. Returns ``None``.
    """
    unique_rows = pd.read_csv(file_path).drop_duplicates()
    unique_rows.to_csv(output_path, index=False)

    print(f"Duplicates removed. Cleaned dataset saved as '{output_path}'.")

In [None]:
# Write a de-duplicated copy of the patient-profile dataset to 'cleaned_Disease_data.csv'.
remove_duplicates("Disease_symptom_and_patient_profile_dataset.csv", "cleaned_Disease_data.csv")


Duplicates removed. Cleaned dataset saved as 'cleaned_Disease_data.csv'.


In [None]:
# Re-count duplicates on the cleaned file to confirm none remain.
file_path = 'cleaned_Disease_data.csv'
check_duplicates(file_path)

Number of duplicate rows: 0


In [None]:
# Load the de-duplicated patient-profile dataset into `df2`.
df2 = pd.read_csv("cleaned_Disease_data.csv")

def generate_conversation(row, speaker_prefix="User: "):
    """Render one patient-profile record as a single user statement.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(axis=1)``)
        Must provide the keys "Fever", "Cough", "Fatigue",
        "Difficulty Breathing", "Age", "Gender", "Blood Pressure",
        "Cholesterol Level" and "Outcome Variable". A symptom counts as
        present only when its value is exactly "Yes".
    speaker_prefix : str, default "User: "
        Text placed in front of the statement. The default reproduces the
        original output; pass ``""`` to get the bare statement.
        NOTE(review): ``extract_user_statements`` strips the "User:" tag from
        the conversation dataset, so the two sources differ on this prefix
        once combined - confirm which form the model should see.

    Returns
    -------
    str
        ``"<prefix>A <age>-year-old <gender> with <symptoms>. Blood pressure:
        ..., Cholesterol level: .... Diagnosis result: ...."``
    """
    symptom_columns = ["Fever", "Cough", "Fatigue", "Difficulty Breathing"]
    symptoms = [col.lower() for col in symptom_columns if row[col] == "Yes"]

    # Without this fallback a record with no "Yes" symptom rendered as the
    # broken phrase "... with ." (empty join).
    symptom_text = ", ".join(symptoms) if symptoms else "no reported symptoms"

    profile = f"A {row['Age']}-year-old {row['Gender'].lower()} with {symptom_text}."
    medical_info = f"Blood pressure: {row['Blood Pressure']}, Cholesterol level: {row['Cholesterol Level']}."
    outcome = f"Diagnosis result: {row['Outcome Variable']}."

    return f"{speaker_prefix}{profile} {medical_info} {outcome}"

# Turn every tabular record into a text statement, one row at a time.
df2["User"] = df2.apply(generate_conversation, axis=1)

# Keep the text and the label, renaming `Disease` to `disease` so the column
# names match the conversation dataset's (`User`, `disease`).
df2_transformed = df2[["User", "Disease"]].rename(columns={"Disease": "disease"})

df2_transformed.to_csv("transformed_medical_data.csv", index=False)

print("Transformed dataset saved as 'transformed_medical_data.csv'.")


Transformed dataset saved as 'transformed_medical_data.csv'.


In [None]:
# Reload both intermediate datasets from disk.
df1 = pd.read_csv("user_only_medical_conversations.csv")
df2 = pd.read_csv("transformed_medical_data.csv")

# Select the two shared columns explicitly so both frames line up for the concat.
df1_transformed = df1[['User', 'disease']]
df2_transformed = df2[['User', 'disease']]

# Stack the conversation rows on top of the tabular-derived rows with a fresh 0..n-1 index.
combined_df = pd.concat([df1_transformed, df2_transformed], ignore_index=True)

combined_df.to_csv("combined_medical_data.csv", index=False)

print("Combined dataset saved as 'combined_medical_data.csv'.")


Combined dataset saved as 'combined_medical_data.csv'.


In [None]:
import pandas as pd
from transformers import AutoTokenizer

# Load the combined (conversation + tabular-derived) dataset.
df = pd.read_csv('combined_medical_data.csv')

print(df.head())

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize each `User` text on its own, padded/truncated to 128 tokens.
# NOTE(review): each `tokens` cell holds the tokenizer's return object, so the
# to_csv call below can only write its string form. The later visible cells
# read just `User` and `disease` from this file - confirm this column is needed.
df['tokens'] = df['User'].apply(lambda x: tokenizer(x, padding="max_length", truncation=True, max_length=128))

print(df[['User', 'tokens']].head())

df.to_csv('tokenized_combined_dataset.csv', index=False)


                                                User  disease
0  I’ve been sneezing a lot today and my nose fee...  allergy
1  I’ve developed a rash after eating some strawb...  allergy
2  My eyes are swollen and itchy, and I can’t sto...  allergy
3  I’ve been getting headaches and a stuffy nose ...  allergy
4  Every time I eat nuts, my mouth itches. </s> N...  allergy


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

                                                User  \
0  I’ve been sneezing a lot today and my nose fee...   
1  I’ve developed a rash after eating some strawb...   
2  My eyes are swollen and itchy, and I can’t sto...   
3  I’ve been getting headaches and a stuffy nose ...   
4  Every time I eat nuts, my mouth itches. </s> N...   

                                        tokens  
0  [input_ids, token_type_ids, attention_mask]  
1  [input_ids, token_type_ids, attention_mask]  
2  [input_ids, token_type_ids, attention_mask]  
3  [input_ids, token_type_ids, attention_mask]  
4  [input_ids, token_type_ids, attention_mask]  


In [None]:
# Reload the dataset written by the tokenization cell.
df = pd.read_csv('tokenized_combined_dataset.csv')

# Lower-case contraction -> expansion table. It is applied while the
# apostrophes are still present, i.e. before punctuation is stripped.
_CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "i'm": "i am",
    "he's": "he is",
    "she's": "she is",
    "they're": "they are",
    "i've": "i have",
    "it's": "it is",
    "doesn't": "does not",
    "didn't": "did not",
    "haven't": "have not",
    "hasn't": "has not",
    "couldn't": "could not",
}

# One alternation over all keys, longest first, bounded by word boundaries so
# that e.g. "he's" is not matched inside "she's".
_CONTRACTION_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True))
    + r")\b"
)


def normalize_text(text):
    """Lower-case, expand contractions, strip punctuation, drop stop words, lemmatize.

    Steps, in order:
      1. lower-case and map typographic apostrophes to ``'``;
      2. replace the ``</s>`` turn separator with a space;
      3. expand the contractions listed in ``_CONTRACTIONS``;
      4. remove every character that is neither a word character nor whitespace;
      5. tokenize, drop English stop words, lemmatize each remaining token.

    The original removed punctuation *before* the contraction lookup, so keys
    such as "can't" could never match and the text ended up with tokens like
    "cant", "im", "ive".

    NOTE(review): ``word_tokenize``, ``stopwords`` and ``WordNetLemmatizer``
    are not imported anywhere in the visible notebook. They look like NLTK
    names and must be imported (with the NLTK data they need available)
    before this function is called - confirm.

    Parameters
    ----------
    text : str
        Raw user text. Any non-string value (e.g. NaN) yields ``""``.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # The dataset uses curly apostrophes; normalise them so the table matches.
    text = text.replace("\u2019", "'").replace("\u2018", "'")
    # Drop the turn separator explicitly instead of leaving a stray "s" token
    # behind after punctuation removal.
    text = text.replace("</s>", " ")

    text = _CONTRACTION_RE.sub(lambda match: _CONTRACTIONS[match.group(0)], text)

    text = re.sub(r'[^\w\s]', '', text)

    words = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

# Add a normalized copy of every `User` text; the original column is kept.
df['normalized_conversations'] = df['User'].apply(normalize_text)

# Show raw and normalized text side by side for a quick sanity check.
print(df[['User', 'normalized_conversations']].head())

df.to_csv('normalized_tokenized_cleaned_combined_dataset.csv', index=False)


                                                User  \
0  I’ve been sneezing a lot today and my nose fee...   
1  I’ve developed a rash after eating some strawb...   
2  My eyes are swollen and itchy, and I can’t sto...   
3  I’ve been getting headaches and a stuffy nose ...   
4  Every time I eat nuts, my mouth itches. </s> N...   

                            normalized_conversations  
0  ive sneezing lot today nose feel congested im ...  
1  ive developed rash eating strawberry yes time ...  
2  eye swollen itchy cant stop sneezing yes usual...  
3  ive getting headache stuffy nose day think all...  
4              every time eat nut mouth itch itching  


In [None]:
!pip install datasets

import torch
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# Load the prepared dataset. Only the raw user text and the disease label are
# used for fine-tuning; the `normalized_conversations` column is not read here.
df = pd.read_csv("normalized_tokenized_cleaned_combined_dataset.csv")
# .copy() so the column assignment below writes to an independent frame.
df = df[["User", "disease"]].copy()

# Map disease names to integer class ids.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["disease"])
label_classes = list(label_encoder.classes_)

# Store the id <-> name mapping in the model config so the saved checkpoint is
# self-describing and inference does not depend on this notebook's globals.
id2label = {index: str(name) for index, name in enumerate(label_classes)}
label2id = {name: index for index, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(example):
    """Tokenize a batch of `User` texts, padded/truncated to 128 tokens."""
    return tokenizer(example["User"], padding="max_length", truncation=True, max_length=128)

dataset = Dataset.from_pandas(df[["User", "label"]])
tokenized = dataset.map(tokenize, batched=True)
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Fixed seed so the 85/15 train/test split is reproducible.
split = tokenized.train_test_split(test_size=0.15, seed=42)
train_dataset = split["train"]
test_dataset = split["test"]

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_classes),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="./bert-medical-chatbot-full",
    evaluation_strategy="epoch",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer)
)

trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so the directory
# can be reloaded with from_pretrained().
model.save_pretrained("./bert-medical-chatbot-full")
tokenizer.save_pretrained("./bert-medical-chatbot-full")



Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
!pip install gradio

import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Reload the fine-tuned model and tokenizer from the directory the training cell saved to.
model_path = "./bert-medical-chatbot-full"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Switch to inference mode.
model.eval()

# `label_classes` is defined in the training cell, so this cell only works
# when that cell has already run in the same kernel.
labels = label_classes

def classify_symptoms(symptoms):
    """Predict a disease label for a free-text symptom description.

    Parameters
    ----------
    symptoms : str
        Text entered by the user.

    Returns
    -------
    str
        ``"You might be suffering from <label>."`` for the highest-scoring
        class, or a short prompt when the input is empty or whitespace only.
    """
    # An empty box would otherwise still produce a confident-looking "diagnosis".
    if not symptoms or not symptoms.strip():
        return "Please describe your symptoms."

    inputs = tokenizer(symptoms, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Another cell passes this same `model` object to a Trainer, which may move
    # it to a GPU. Send the inputs to wherever the weights currently live so
    # the forward pass cannot fail on a device mismatch.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    label = labels[prediction]
    return f"You might be suffering from {label}."

# Text-in / text-out UI around classify_symptoms.
# NOTE(review): share=True presumably asks Gradio for a publicly reachable
# link - confirm that exposing this demo outside the notebook is intended.
gr.Interface(
    fn=classify_symptoms,
    inputs="text",
    outputs="text",
    title="🩺 BERT Medical Diagnosis Chatbot",
    description="Enter your symptoms and receive a possible diagnosis from a fine-tuned BERT model."
).launch(share=True)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer

def compute_metrics(pred):
    """Turn a prediction object into accuracy / precision / recall / F1.

    ``pred`` must expose ``label_ids`` (true class ids) and ``predictions``
    (per-class scores; the arg-max over the last axis is the predicted class).
    Precision, recall and F1 are weighted averages across classes, with
    undefined values reported as 0.

    Returns a dict with the keys "accuracy", "precision", "recall", "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    return dict(accuracy=acc, precision=precision, recall=recall, f1=f1)

# Evaluation-only Trainer: no train_dataset is passed. `training_args`,
# `test_dataset` and `DataCollatorWithPadding` come from the training cell;
# `model` and `tokenizer` are whatever the most recently run cell bound them to.
eval_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

# Run evaluation on the held-out split; returns a dict of metric name -> value.
metrics = eval_trainer.evaluate()

print(" Evaluation Metrics:")
# The `.4f` format assumes every reported metric value is numeric.
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")