In [None]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download the NLTK data packages this notebook relies on; 'wordnet' and
# 'stopwords' back the lemmatizer and stop-word list created just below.
# NOTE(review): no visible cell calls an NLTK tokenizer (preprocess_text
# splits with str.split), so 'punkt' looks unused here — confirm before removing.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Built once at module level and shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test in preprocess_text is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatized tokens.

    Steps, in order:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the result is lower-cased and split on whitespace;
      3. tokens present in the module-level ``stop_words`` set are dropped;
      4. each remaining token is passed through ``lemmatizer.lemmatize``.

    Args:
        text: the string to normalise.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()

    kept_tokens = []
    for token in letters_only.split():
        # The stop-word check runs on the raw lower-cased token, before lemmatizing.
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from datasets import load_dataset

# Load the first 200 rows of the train split from the Hugging Face Hub.
# NOTE(review): the output saved with this cell is a DatasetNotFoundError for
# this dataset id, so every later cell that reads `dataset` cannot run until
# the id (or access to it) is corrected — confirm the intended Hub dataset.
dataset = load_dataset("abachaa/medquad", split="train[:200]")
# Print one record to inspect its fields; a later cell reads "question" and "answer".
print(dataset[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetNotFoundError: Dataset 'abachaa/medquad' doesn't exist on the Hub or cannot be accessed.

In [None]:
# Build the retrieval corpus: for each record, join the question and the answer
# with a single space and normalise the combined text with preprocess_text.
medical_corpus = [
    preprocess_text(record["question"] + " " + record["answer"])
    for record in dataset
]

print("Total medical entries:", len(medical_corpus))


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# NOTE(review): the output saved with this cell is ModuleNotFoundError for
# 'faiss' — the package must be installed in the kernel environment before
# this cell (and the chatbot cells that use `index`) can run.

# Embed every preprocessed corpus entry with the sentence-transformer model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(medical_corpus)

# Size the index from the embedding width (second axis of `embeddings`).
dimension = embeddings.shape[1]
# Flat L2 index. medical_chatbot uses the position returned by index.search
# directly as an index into medical_corpus, which relies on the vectors being
# added here in corpus order.
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

print("Medical vectors stored:", index.ntotal)


ModuleNotFoundError: No module named 'faiss'

In [None]:
def medical_chatbot(query):
    """Return the stored corpus entry closest to ``query``.

    The query is normalised with ``preprocess_text``, embedded with the
    module-level ``model``, and looked up in the module-level ``index`` for
    its single nearest neighbour.

    Args:
        query: the user's question as a string.

    Returns:
        The matching element of ``medical_corpus``. Note that the corpus holds
        text that has already been through ``preprocess_text``, so the reply
        is the normalised entry rather than the original wording.
    """
    query_vector = np.array(model.encode([preprocess_text(query)]))
    # Only the neighbour position is needed; the distance is discarded.
    _, neighbour_ids = index.search(query_vector, 1)
    best_position = neighbour_ids[0][0]
    return medical_corpus[best_position]


In [None]:
# Smoke-test the retrieval pipeline with one sample question.
query = "What are symptoms of diabetes?"
response = medical_chatbot(query)

# `response` is the stored corpus entry, i.e. text already passed through preprocess_text.
print("User:", query)
print("Bot:", response)


In [None]:
import spacy

# Load the spaCy pipeline used by extract_medical_entities.
# NOTE(review): "en_core_sci_sm" is presumably the scispaCy small model, which is
# installed separately from spaCy — confirm it is available in the environment.
nlp = spacy.load("en_core_sci_sm")

def extract_medical_entities(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and list its entities.

    Args:
        text: the string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in the
        parsed document's ``ents``, in the order the pipeline reports them.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found


In [None]:
# Example: print the (text, label) entity pairs found in one sample sentence.
entities = extract_medical_entities("Diabetes causes high blood sugar levels")
print(entities)
