Data Preprocessing

In [None]:
import json
import re
from google.colab import drive

# Make Google Drive visible inside the Colab filesystem.
drive.mount('/content/drive')

# Location of the raw dataset on the mounted drive.
json_path = '/content/drive/My Drive/Colab Notebooks/Docs/Vasant_Lad_DataSet.json'

# Read the file and parse the whole JSON document into memory.
with open(json_path, mode="r", encoding="utf-8") as f:
    raw_data = json.loads(f.read())

# The complete set of dosha names; detect_dosha_type compares against it
# to recognise the "Tridoshic" case.
ALL_DOSHAS = {"Vata", "Pitta", "Kapha"}

def normalize_text(text):
    """Return ``text`` trimmed, lower-cased and with whitespace runs collapsed.

    Leading/trailing whitespace is removed first, then the text is
    lower-cased, and finally every run of whitespace characters is replaced
    by a single space.
    """
    trimmed_lower = text.strip().lower()
    return re.sub(r'\s+', ' ', trimmed_lower)

def detect_dosha_type(doshas):
    """Classify a dosha combination.

    Args:
        doshas: An iterable of dosha names, a single name given as a bare
            string, or ``None``. Names are compared case-insensitively
            (via ``str.title``) after stripping surrounding whitespace.
            Empty and non-string items are ignored.

    Returns:
        ``"Tridoshic"`` when the names equal ``ALL_DOSHAS``, the single name
        when only one distinct dosha is present, the sorted names joined
        with ``"-"`` when several are present, and ``"Unknown"`` when none
        are.
    """
    if doshas is None:
        doshas = []
    elif isinstance(doshas, str):
        # A bare string would otherwise be iterated character by character.
        doshas = [doshas]

    unique_doshas = {
        d.strip().title()
        for d in doshas
        if isinstance(d, str) and d.strip()
    }

    if not unique_doshas:
        return "Unknown"
    if unique_doshas == ALL_DOSHAS:
        return "Tridoshic"
    if len(unique_doshas) == 1:
        return next(iter(unique_doshas))
    return "-".join(sorted(unique_doshas))

def flatten_symptoms(symptoms_field):
    """Flatten a possibly nested symptoms field into a flat list of strings.

    Args:
        symptoms_field: A string, a list/tuple, a dict, or any nesting of
            these. Dict keys are ignored; only values are traversed.

    Returns:
        A new list with every string found, in traversal order. Values that
        are neither strings nor containers (``None``, numbers, ...) are
        skipped, so the result is empty when no string is present.
    """
    flat = []

    def collect(node):
        if isinstance(node, str):
            flat.append(node)
        elif isinstance(node, dict):
            # Group names (keys) are labels, not symptoms; keep the values only.
            for value in node.values():
                collect(value)
        elif isinstance(node, (list, tuple)):
            for item in node:
                collect(item)
        # Anything else carries no symptom text and is dropped.

    collect(symptoms_field)
    return flat

def flatten_remedies(remedies):
    """Flatten nested remedy instructions into normalized strings.

    Nested dict keys are joined with spaces into a path that prefixes each
    leaf as ``"<path>: <leaf>"``; leaves without a path are emitted as-is.
    Every string is passed through ``normalize_text``.

    Args:
        remedies: A dict, list or string of remedy data (arbitrarily
            nested). Any other type yields an empty result.

    Returns:
        The distinct flattened strings in first-seen order.
    """
    flattened = []

    def recurse(node, path=""):
        if isinstance(node, dict):
            for k, v in node.items():
                recurse(v, f"{path} {k}".strip())
        elif isinstance(node, list):
            for item in node:
                recurse(item, path)
        elif node is not None:
            # A null leaf would otherwise become the junk string "<path>: none".
            text = f"{path}: {node}" if path else f"{node}"
            flattened.append(normalize_text(text))

    if isinstance(remedies, (dict, list, str)):
        recurse(remedies)

    # dict.fromkeys removes duplicates while keeping first-seen order;
    # list(set(...)) would make the order vary from run to run.
    return list(dict.fromkeys(flattened))

# Preprocessing: turn every raw entry into a flat, normalized record.
cleaned_data = []

for entry in raw_data:
    # `or` also covers keys that are present but null in the JSON, which
    # the .get() default alone does not.
    disease = str(entry.get("Disease") or "Unknown").strip().title()

    symptoms = flatten_symptoms(entry.get("Symptoms") or [])
    symptoms_cleaned = [normalize_text(s) for s in symptoms if isinstance(s, str)]
    # Whitespace-only symptoms normalize to "" and carry no information.
    symptoms_cleaned = [s for s in symptoms_cleaned if s]

    # flatten_remedies already passes every string through normalize_text,
    # so no second normalization pass is needed here.
    remedies_cleaned = flatten_remedies(entry.get("Remedies") or {})

    doshas = entry.get("Primary Dosha") or []
    if isinstance(doshas, str):
        # A single dosha given as a bare string; wrap it so it is not
        # iterated character by character.
        doshas = [doshas]
    # Drop empty/non-string items so "primary_dosha" and "dosha_type" are
    # derived from the same values.
    doshas = [d for d in doshas if isinstance(d, str) and d]
    dosha_type = detect_dosha_type(doshas)

    cleaned_entry = {
        "disease": disease,
        "symptoms": symptoms_cleaned,
        "primary_dosha": [d.title() for d in doshas],
        "dosha_type": dosha_type,
        "remedies": remedies_cleaned,
    }

    cleaned_data.append(cleaned_entry)

# Save cleaned version
with open("cleaned_ayurveda_data.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print(f"Processed {len(cleaned_data)} entries successfully.")


Mounted at /content/drive
Processed 112 entries successfully.


NLP pipeline: creating SentenceTransformer embeddings for the entire dataset

In [None]:
pip install spacy nltk sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import nltk
# Fetch NLTK's "punkt_tab" resource.
# NOTE(review): presumably the tokenizer data sent_tokenize needs in recent
# NLTK releases — confirm against the installed version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.35.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.35.0-py3-none-any.whl.metadata (1.5 k

In [None]:
import json
import spacy
import nltk
import chromadb
import uuid

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer

# Fetch NLTK's "punkt" tokenizer data.
nltk.download("punkt")

# Load NLP models
# NOTE(review): `nlp` is not referenced again in this cell; a later cell
# rebinds it with spacy.blank("en") before it is used.
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and fast

# Load dataset
# The in-memory `cleaned_data` from the preprocessing cell is reused, so that
# cell must have been run first. The commented-out alternative below reads
# from disk, but its file name differs from the one the preprocessing cell
# writes ("cleaned_ayurveda_data.json").
# with open("ayurveda_dataset.json", "r", encoding="utf-8") as f:
#    data = json.load(f)
data = cleaned_data

# Helper: Flatten symptom or remedy fields
def flatten_text(item):
    """Collect every string found in a nested dict/list structure.

    Dict values and list items are traversed recursively (dict keys are
    ignored). A string yields a one-element list; any other type yields an
    empty list.
    """
    if isinstance(item, dict):
        children = item.values()
    elif isinstance(item, list):
        children = item
    elif isinstance(item, str):
        return [item]
    else:
        return []

    return [text for child in children for text in flatten_text(child)]

# NLP Pipeline
def process_ayurveda_data(data):
    """Sentence-split and embed the symptoms and remedies of every entry.

    Args:
        data: Iterable of dict-like entries read via the keys "disease",
            "primary_dosha", "symptoms" and "remedies".

    Returns:
        One dict per entry with the keys "Disease", "Dosha", "Symptoms",
        "Symptom Embeddings", "Remedies" and "Remedy Embeddings". The
        embeddings are whatever ``embedder.encode`` returns for the
        corresponding sentence list.
    """

    def split_and_embed(fragments):
        # The fragments are joined with single spaces *before* sentence
        # splitting, so the resulting sentences follow the punctuation in
        # the joined text rather than the boundaries of the list items.
        sentences = sent_tokenize(" ".join(fragments))
        return sentences, embedder.encode(sentences)

    records = []
    for entry in data:
        symptom_sentences, symptom_embeddings = split_and_embed(
            entry.get("symptoms", [])
        )
        remedy_sentences, remedy_embeddings = split_and_embed(
            entry.get("remedies", {})
        )

        record = {
            "Disease": entry.get("disease", "Unknown"),
            "Dosha": entry.get("primary_dosha", []),
            "Symptoms": symptom_sentences,
            "Symptom Embeddings": symptom_embeddings,
            "Remedies": remedy_sentences,
            "Remedy Embeddings": remedy_embeddings,
        }
        records.append(record)

    return records

results = process_ayurveda_data(data)

# Preview Output
for r in results[:5]:
    print("Disease:", r["Disease"])
    print("Dosha:", r["Dosha"])
    print("Symptoms:", r["Symptoms"][:3])  # Preview first 3
    print("Remedies:", r["Remedies"][:3])

client = chromadb.Client()
collection = client.get_or_create_collection(name="ayurveda_symptoms")

for rec_idx, entry in enumerate(results):
    symptom_sentences = entry["Symptoms"]
    if not symptom_sentences:
        # Nothing to index for this entry (the original per-sentence loop
        # also added nothing in this case).
        continue

    # Symptom sentences and remedy sentences have no positional
    # correspondence, so every symptom document carries ALL remedies of its
    # disease instead of only the remedy at the same index.
    metadata = {
        "disease": entry["Disease"],
        "dosha": ", ".join(entry["Dosha"]),  # list of doshas -> string
        "remedy": " | ".join(entry["Remedies"]),
    }

    # One call per entry. upsert (instead of add) overwrites records that
    # already exist under the same id, so the cell can be re-run safely.
    collection.upsert(
        ids=[f"{rec_idx}-{sym_idx}" for sym_idx in range(len(symptom_sentences))],
        embeddings=[vec.tolist() for vec in entry["Symptom Embeddings"]],
        metadatas=[dict(metadata) for _ in symptom_sentences],
        documents=list(symptom_sentences),
    )

# (Optional) To keep the index on disk, create the client with
# chromadb.PersistentClient(path=...) instead of chromadb.Client().
# NOTE(review): confirm against the installed Chroma version.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Disease: Vata Imbalance
Dosha: ['Vata']
Symptoms: ['fearfulness, nervousness, anxiety scattered or racing thoughts insomnia or interrupted sleep tremors, spasms, twitches restlessness or fidgety limbs dry skin, hair or lips cold hands and feet irregular digestion (bloating, gas, constipation) variable appetite']
Remedies: ['herbal_support: shatavari: ½ tsp powder in warm milk or water to nourish and lubricate diet: warm fluids: sip warm water or herbal teas (ginger, licorice) throughout the day diet: warm, cooked foods: soups, stews, porridges, well-cooked grains yoga_pranayama: mulabandha (root lock) and pelvic floor exercises to stabilize vata herbal_support: brahmi: ¼–½ tsp powder or tincture at bedtime for mental calm and sleep lifestyle: self-massage (abhyanga) with warm sesame oil each morning diet: vata-pacifying tastes: sweet, sour, and salty; include ghee yoga_pranayama: breathing: slow ujjayi or nadi shodhana (alternate-nostril) pranayama diet: digestive spices: ginger, cinna

Generate Training Data

In [None]:
# 1 Install Required Libraries
!pip install spacy



Symptoms Training Dataset 1

In [None]:
#  2 Convert Your Data to spaCy Format
import json
import random
import spacy
from spacy.training.example import Example

def build_symptom_example(symptoms, prefix="The patient has ", separator="; ", suffix="."):
    """Build one spaCy-style NER example from a list of symptom strings.

    The sentence is ``prefix + separator.join(symptoms) + suffix``. Entity
    offsets are computed while the sentence is assembled, so every symptom
    is labelled at its own position. Searching the finished sentence
    instead would return the first match, which for a repeated symptom or
    one that is a substring of an earlier symptom is the wrong position.

    Args:
        symptoms: Symptom strings in the order they should appear.
        prefix: Text placed before the first symptom.
        separator: Text placed between consecutive symptoms.
        suffix: Text placed after the last symptom.

    Returns:
        ``(text, {"entities": [(start, end, "SYMPTOM"), ...]})``.
    """
    pieces = [prefix]
    entities = []
    cursor = len(prefix)

    for position, symptom in enumerate(symptoms):
        if position:
            pieces.append(separator)
            cursor += len(separator)
        end = cursor + len(symptom)
        if end > cursor:
            # Zero-length spans cannot be labelled, so empty symptoms are skipped.
            entities.append((cursor, end, "SYMPTOM"))
        pieces.append(symptom)
        cursor = end

    pieces.append(suffix)
    return "".join(pieces), {"entities": entities}


# Load cleaned dataset
with open("cleaned_ayurveda_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Prepare training data: one sentence per entry listing all of its symptoms
training_data = [build_symptom_example(entry['symptoms']) for entry in data]

# Save the training examples
with open("training_data.json", "w", encoding="utf-8") as f:
    json.dump(training_data, f, indent=2, ensure_ascii=False)


bloating


Symptoms Training Dataset 2

In [None]:
# 1. Collect every distinct symptom string across all entries
unique_symptoms = set().union(
    *(entry.get("symptoms", []) for entry in data)
)

# 2. (Optional) Sort into a list
unique_symptoms_list = list(unique_symptoms)
unique_symptoms_list.sort()

print(f"Found {len(unique_symptoms_list)} unique symptoms:")
for symptom in unique_symptoms_list:
    print("–", symptom)

import random
import json

# Template sentence patterns
# Each template contains exactly one "{}" placeholder, which generate_ner_data
# fills with a symptom string via str.format. The placeholder sits at the
# start, middle or end of the sentence depending on the template.
templates = [
    "The patient has {}.",
    "The patient is suffering from {}.",
    "Symptoms include {}.",
    "She has been experiencing {} lately.",
    "{} has been reported by the user.",
    "Signs of {} were observed.",
    "{} occurred after consuming certain foods.",
    "There is persistent {} in the body.",
    "He complains of {}.",
    "Doctors noted {} during examination.",
    "{} seems to worsen at night.",
    "I often experience {} during stressful days.",
    "There are signs of {} under the skin.",
    "He suffers from {} on a regular basis."
]

# Generate NER-style training data
def generate_ner_data(symptoms, templates, n_per_symptom=10, seed=None):
    """Create templated sentences with SYMPTOM entity annotations.

    Args:
        symptoms: Iterable of symptom strings.
        templates: Sequence of ``str.format`` templates with one positional
            placeholder for the symptom.
        n_per_symptom: Number of sentences generated per symptom; templates
            are drawn at random with replacement.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the output is reproducible; when ``None`` the
            module-level ``random`` state is used.

    Returns:
        A list of ``(sentence, {"entities": [(start, end, "SYMPTOM")]})``
        tuples. The entity list is empty when the template has no
        placeholder or the symptom is an empty string.
    """
    rng = random if seed is None else random.Random(seed)

    # Locate the placeholder once per template by formatting it with a
    # marker character. Searching the finished sentence for the symptom
    # would instead return the first match, which can lie in the template
    # text before the placeholder (e.g. "he" inside "The patient has {}.").
    marker = "\x00"
    prepared = [
        (template, template.format(marker).find(marker))
        for template in templates
    ]

    ner_data = []
    for symptom in symptoms:
        for _ in range(n_per_symptom):
            template, start = rng.choice(prepared)
            sentence = template.format(symptom)
            if start < 0 or not symptom:
                entities = []
            else:
                entities = [(start, start + len(symptom), "SYMPTOM")]
            ner_data.append((sentence, {"entities": entities}))

    return ner_data

# Generate training examples
# NOTE: this rebinds `training_data`, replacing the list built in the
# "Symptoms Training Dataset 1" cell; the training cell uses whichever of
# the two cells ran last.
training_data = generate_ner_data(unique_symptoms_list, templates, n_per_symptom=8)

# Save as JSON. UTF-8 with ensure_ascii=False matches the other JSON files
# this notebook writes and keeps non-ASCII characters readable in the file.
with open("augmented_symptom_ner_data.json", "w", encoding="utf-8") as f:
    json.dump(training_data, f, indent=2, ensure_ascii=False)

print("✅ Augmented training data saved as 'augmented_symptom_ner_data.json'")

Found 477 unique symptoms:
– abdominal bloating, gastric discomfort, colic
– abdominal cramping
– abdominal discomfort
– abdominal discomfort and distension
– abdominal distension and discomfort
– academic or work burnout
– accumulation of fatty deposits under skin
– acid indigestion
– acid reflux
– acidity in an empty stomach
– acidity, heartburn, sour belching
– aggravated by strenuous exercise
– aging
– alcohol, drugs (e.g. lsd, marijuana, cocaine), or medications
– allergies or food sensitivities
– alternating constipation and diarrhea
– ama (toxins) in gi tract
– anger, irritability, frustration, criticism
– anxiety
– anxiety, irritability, restlessness
– appetite changes (loss or overeating) and weight fluctuation
– associated pain or itching
– associated with congestion, cold, cough, allergies
– associated with nausea, irritability, burning eyes
– asthma
– baby refuses or lacks interest in feeding
– back muscle strain or pull
– bad breath
– bleeding during brushing
– bleeding fr

In [None]:
# 3 Train NER Model with spaCy
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding

nlp = spacy.blank("en")  # Start with blank model
ner = nlp.add_pipe("ner")

# Add SYMPTOM label
ner.add_label("SYMPTOM")

# Training loop
# nlp.initialize() is the spaCy v3 name for the deprecated begin_training().
optimizer = nlp.initialize()
for i in range(10):  # epochs
    random.shuffle(training_data)
    losses = {}
    batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        # Update once per mini-batch. Updating example by example inside the
        # batch loop makes the batch size schedule above have no effect.
        examples = [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in batch
        ]
        nlp.update(examples, sgd=optimizer, drop=0.3, losses=losses)
    print(f"Iteration {i+1}, Losses: {losses}")

# Save model
nlp.to_disk("symptom_ner_model")




Iteration 1, Losses: {'ner': np.float32(1243.7311)}
Iteration 2, Losses: {'ner': np.float32(184.02867)}
Iteration 3, Losses: {'ner': np.float32(168.2316)}
Iteration 4, Losses: {'ner': np.float32(73.885895)}
Iteration 5, Losses: {'ner': np.float32(47.55122)}
Iteration 6, Losses: {'ner': np.float32(73.77411)}
Iteration 7, Losses: {'ner': np.float32(86.2469)}
Iteration 8, Losses: {'ner': np.float32(29.15987)}
Iteration 9, Losses: {'ner': np.float32(45.559048)}
Iteration 10, Losses: {'ner': np.float32(57.815395)}


In [None]:
# 4. Test NER
# Reload the pipeline from "symptom_ner_model", the directory the training
# cell saved to. This rebinds `nlp`, which the following cell uses.
nlp = spacy.load("symptom_ner_model")
doc = nlp("The patient has stomachache.")
# Print the text and label of every entity the model found.
for ent in doc.ents:
    print(ent.text, ent.label_)


stomachache SYMPTOM


Remedy Classification Model

In [None]:
user_input = "The patient has headache and stomachache."

# Step 1: run the trained NER pipeline on the user's sentence
doc = nlp(user_input)

# Step 2: keep only the spans labelled as symptoms
extracted_symptoms = [ent.text for ent in doc.ents if ent.label_ == "SYMPTOM"]
print("Extracted Symptoms:", extracted_symptoms)

if not extracted_symptoms:
    print("No symptoms detected.")
else:
    # Step 3: embed the extracted symptoms as one comma-separated query string
    query_embedding = embedder.encode(", ".join(extracted_symptoms)).tolist()

    # NOTE: this rebinds `results`, which earlier held the records returned
    # by process_ayurveda_data.
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=3
    )

    # A single query embedding was sent, so index 0 holds its matches.
    matches = zip(results['documents'][0], results['metadatas'][0])
    for rank, (matched_text, meta) in enumerate(matches, start=1):
        print(f"\nMatch #{rank}")
        print("Matched Symptoms:", matched_text)
        print("Disease:", meta['disease'])
        print("Dosha:", meta['dosha'])
        # The remedy is a plain string; printing it directly avoids the
        # quotes and \uXXXX escapes that json.dumps adds.
        print("Remedies:", meta['remedy'])

Extracted Symptoms: ['headache and stomachache']

Match #1
Matched Symptoms: headache mental dullness inability to focus nausea dizziness burning sensation in stomach loss of appetite
Disease: Hangover (Excess Pitta From Alcohol Consumption)
Dosha: Pitta
Remedies: "herbal formula ingredients jatamamsi: 3 parts nasya therapy description: use bhringaraj oil or brahmi ghee nasya treatment to calm the mind and pitta-related symptoms."

Match #2
Matched Symptoms: runny nose cough congestion headache body aches fever chills loss of appetite poor digestion
Disease: Colds And Flu
Dosha: Vata, Kapha
Remedies: "other recommendations: engage only in mild exercise\u2014gentle yoga asanas such as sun salutation, shoulder stand (\u22641 min), headstand (\u22641 min), and forward bend."

Match #3
Matched Symptoms: frequent liquid stools abdominal discomfort and distension flatulence headache bad breath
Disease: Diarrhea
Dosha: Pitta
Remedies: "hydration recipe: mix 1 tsp sugar, 1 tsp lime juice, pinc