In [1]:
!pip install numpy sentence-transformers bertopic hdbscan nltk scann
import nltk
nltk.download('punkt')
import nltk
nltk.download('punkt_tab')
!pip install sentence-transformers bertopic hdbscan umap-learn scann nltk datasets
!pip install gensim

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting scann
  Downloading scann-1.4.0-cp311-cp311-manylinux_2_27_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transform

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m619.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━

# Topic Generation

In [4]:
# === IMPORTS & SETUP ===
import os
import random
import numpy as np
import torch
import nltk
import logging
import re

from collections import defaultdict, Counter
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from umap import UMAP
import scann

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.metrics import silhouette_score, precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity

# === SEED FIXING ===
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.manual_seed_all(SEED)
torch.use_deterministic_algorithms(True)
nltk.download("punkt")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# === CLEANING & CONTEXT EXTRACTION (IMPROVED) ===
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()


def extract_entity_contexts(chunks, entities_per_chunk, use_multi_sentence=True):
    entity_context_pairs = []
    for idx, ents in enumerate(entities_per_chunk):
        chunk = clean_text(chunks[idx])
        sentences = sent_tokenize(chunk)
        for ent in ents:
            ent_lower = ent.lower()
            matched = False
            for i, sent in enumerate(sentences):
                if ent_lower in sent:
                    context = " ".join(sentences[max(0, i - 1): i + 2]) if use_multi_sentence else sent.strip()
                    enriched = f"The concept '{ent_lower}' appears in the following context: {context}"
                    entity_context_pairs.append((ent_lower, enriched.strip()))
                    matched = True
                    break
            if not matched:
                fallback = f"The concept '{ent_lower}' appears in the following context: {chunk}"
                entity_context_pairs.append((ent_lower, fallback.strip()))
    return entity_context_pairs


# === TOPIC SEARCHER CLASS (WITH DEDUPLICATION, NOISE FILTERING) ===
class AllergyTopicSearcher:
    def __init__(self, chunks, entities_per_chunk, umap_params, hdbscan_params,
                 model_name="pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"):
        self.chunks = chunks
        self.entities_per_chunk = entities_per_chunk
        self.embedding_model = SentenceTransformer(model_name)

        self.umap_params = umap_params
        self.hdbscan_params = hdbscan_params

        self.topic_model = None
        self.topic_metadata = []
        self.topic_embeddings = None
        self.searcher = None

        self._prepare()

    def _prepare(self):
        entity_context_pairs = extract_entity_contexts(self.chunks, self.entities_per_chunk)
        contextual_texts = [ctx for _, ctx in entity_context_pairs]
        contextual_embeddings = self.embedding_model.encode(contextual_texts, normalize_embeddings=False)

        umap_model = UMAP(**self.umap_params, random_state=SEED)
        hdbscan_model = HDBSCAN(**self.hdbscan_params, prediction_data=True)

        self.topic_model = BERTopic(
            embedding_model=self.embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            representation_model=KeyBERTInspired(),
            calculate_probabilities=True,
            verbose=False,
        )

        topics, _ = self.topic_model.fit_transform(contextual_texts, embeddings=contextual_embeddings)

        topic_to_contexts = defaultdict(list)
        topic_to_entities = defaultdict(set)
        topic_to_embeddings = defaultdict(list)

        for i, topic in enumerate(topics):
            if topic == -1:
                continue  # Skip noisy topics
            ent, ctx = entity_context_pairs[i]
            topic_to_contexts[topic].append(ctx)
            topic_to_entities[topic].add(ent)
            topic_to_embeddings[topic].append(contextual_embeddings[i])

        topic_embeddings = []
        topic_metadata = []

        for topic_id in topic_to_contexts:
            emb = topic_to_embeddings[topic_id]
            centroid = np.mean(emb, axis=0)
            centroid /= np.linalg.norm(centroid) + 1e-10
            topic_embeddings.append(centroid)
            topic_metadata.append({
                "topic_id": topic_id,
                "entities": list(topic_to_entities[topic_id]),
                "sentences": topic_to_contexts[topic_id],
                "sentence_embeddings": np.array(emb)
            })

        # === OPTIONAL: Merge semantically similar topics (cosine sim > 0.95)
        deduped_metadata = []
        used = set()

        for i, emb_i in enumerate(topic_embeddings):
            if i in used:
                continue
            group = [i]
            sim_scores = cosine_similarity([emb_i], topic_embeddings)[0]
            for j in range(i + 1, len(sim_scores)):
                if sim_scores[j] > 0.95:
                    group.append(j)
                    used.add(j)

            merged = {
                "topic_id": i,
                "sentences": [],
                "entities": [],
                "sentence_embeddings": []
            }
            for g in group:
                merged["sentences"] += topic_metadata[g]["sentences"]
                merged["entities"] += topic_metadata[g]["entities"]
                merged["sentence_embeddings"] += list(topic_metadata[g]["sentence_embeddings"])

            merged["sentence_embeddings"] = np.array(merged["sentence_embeddings"])
            merged["entities"] = list(set(merged["entities"]))
            deduped_metadata.append(merged)

        self.topic_metadata = deduped_metadata
        self.topic_embeddings = np.array([
            np.mean(m["sentence_embeddings"], axis=0) /
            (np.linalg.norm(np.mean(m["sentence_embeddings"], axis=0)) + 1e-10)
            for m in deduped_metadata
        ])

        self.searcher = (
            scann.scann_ops_pybind.builder(self.topic_embeddings, 3, "dot_product")
            .tree(num_leaves=10, num_leaves_to_search=5, training_sample_size=len(self.topic_embeddings))
            .score_brute_force()
            .reorder(5)
            .build()
        )

    import re

    def search(self, query, top_k_topics=3, top_k_sents=3):
        query_emb = self.embedding_model.encode([query], normalize_embeddings=True)[0]
        neighbors, scores = self.searcher.search(query_emb, final_num_neighbors=top_k_topics)

        results = []
        prefix_pattern = r"^the concept '.*?' appears in (the following )?context:\s*"

        for i, idx in enumerate(neighbors):
            meta = self.topic_metadata[idx]
            topic_score = float(scores[i])

            # Deduplicate sentences
            seen = set()
            cleaned_sentences = []
            cleaned_embeddings = []

            for sent, emb in zip(meta["sentences"], meta["sentence_embeddings"]):
            # Apply regex to remove beginning prefix
                cleaned = re.sub(prefix_pattern, "", sent, flags=re.IGNORECASE).strip()

        # No duplicates
                if cleaned not in seen:
                    seen.add(cleaned)
                    cleaned_sentences.append(cleaned)
                    cleaned_embeddings.append(emb)

            if not cleaned_sentences:
                continue

            emb_array = np.array(cleaned_embeddings)
            sims = np.dot(emb_array / np.linalg.norm(emb_array, axis=1, keepdims=True), query_emb)
            top_ids = sims.argsort()[::-1][:top_k_sents]

            top_sents = [(cleaned_sentences[j], float(sims[j])) for j in top_ids]
            results.append({
            "topic_id": meta["topic_id"],
            "topic_score": topic_score,
            "entities": meta["entities"],
            "sentences": top_sents,
            })

        return results




# === METRICS ===
def compute_bertopic_coherence(topic_model, topic_metadata, topk=15):
    topics = [topic_model.get_topic(meta["topic_id"])[:topk] for meta in topic_metadata]
    word_lists = [[word for word, _ in topic] for topic in topics]

    texts = []
    for meta in topic_metadata:
        for s in meta["sentences"]:
            tokens = clean_text(s).split()
            texts.append(tokens)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    cm = CoherenceModel(
        topics=word_lists,
        texts=texts,
        dictionary=dictionary,
        coherence="c_v"
    )
    return cm.get_coherence()


def compute_topic_diversity(topic_model, topic_metadata, topk=10):
    topic_words = [topic_model.get_topic(meta["topic_id"])[:topk] for meta in topic_metadata]
    unique_words = set(word for topic in topic_words for word, _ in topic)
    return len(unique_words) / (len(topic_words) * topk)


def compute_silhouette_score_custom(topic_metadata):
    all_embeddings = []
    all_labels = []

    for meta in topic_metadata:
        emb = meta["sentence_embeddings"]
        if len(emb) < 2:  # skip small clusters
            continue
        all_embeddings.extend(emb)
        all_labels.extend([meta["topic_id"]] * len(emb))

    if len(all_embeddings) < 3:
        return None

    all_embeddings = np.vstack(all_embeddings)
    return silhouette_score(all_embeddings, all_labels, metric="cosine")


# === DATASET & INITIALIZATION ===
allergy_dataset = {
  "chunks": [
  "Mr. James H., a 79-year-old male with a long-standing history of cardiovascular and metabolic diseases, was brought to the emergency department due to acute confusion and generalized weakness.",
  "According to his daughter, he had wandered outside disoriented and was unable to identify family members or recall events from the previous day.",
  "He has known medical conditions including hypertension, heart failure with reduced ejection fraction, insulin-dependent diabetes mellitus, stage 4 chronic kidney disease, and major depressive disorder.",
  "His medication regimen includes daily doses of lisinopril, furosemide, carvedilol, insulin glargine, sertraline, and donepezil.",
  "In the past 24 hours, the patient experienced decreased appetite, an episode of vomiting, and two instances of urinary incontinence.",
  "Vital signs upon arrival included a blood pressure of 98/56 mmHg, heart rate of 112 beats per minute (irregularly irregular), respiratory rate of 20, oxygen saturation of 93% on room air, and a temperature of 37.6°C.",
  "Physical examination revealed dry mucous membranes, poor skin turgor, moderate lower limb pitting edema, and delayed capillary refill.",
  "Auscultation of the lungs revealed bilateral basal crackles, and cardiac exam confirmed an irregularly irregular heartbeat without murmurs.",
  "Neurological examination showed fluctuating attention span but no signs of focal deficits or lateralizing neurological signs.",
  "Initial lab studies demonstrated an elevated blood glucose of 421 mg/dL, serum sodium of 129 mmol/L, potassium at 5.7 mmol/L, and creatinine at 2.9 mg/dL.",
  "Serum BUN was elevated at 59 mg/dL and the patient’s anion gap was calculated to be 19, consistent with an anion-gap metabolic acidosis.",
  "Urinalysis revealed glucosuria and ketonuria without signs of infection, and serum ketones were modestly elevated.",
  "His HbA1c on record from two months ago was 8.1%, confirming chronic poor glycemic control.",
  "An ECG showed atrial fibrillation with rapid ventricular response but no acute ischemic changes.",
  "Chest radiograph revealed cardiomegaly and pulmonary vascular congestion with mild bilateral pleural effusions.",
  "CT head without contrast was negative for acute infarct, hemorrhage, or mass effect, but showed chronic microvascular changes.",
  "Given the presentation, he was admitted to the medical ward for acute hyperosmolar hyperglycemic state (HHS) and acute on chronic kidney injury.",
  "A diagnosis of acute delirium, likely secondary to metabolic derangements, volume depletion, and possible infection, was made.",
  "He was started on intravenous normal saline, correctional insulin, and telemetry monitoring.",
  "Furosemide was temporarily held due to volume depletion, and electrolytes were repleted cautiously under nephrology guidance.",
  "Blood cultures, urine cultures, and chest x-ray were obtained to rule out infection as a potential delirium trigger.",
  "Empiric antibiotics (ceftriaxone and azithromycin) were initiated pending culture data due to concern for possible aspiration pneumonia.",
  "On day two, the patient’s mental status began to improve with the resolution of hyperglycemia and normalization of serum osmolarity.",
  "Repeat labs showed trending down of BUN and creatinine, with sodium rising to 134 and potassium corrected to 4.5 mmol/L.",
  "He remained in atrial fibrillation and required continuation of beta-blocker therapy to manage ventricular rate.",
  "Apixaban was continued upon nephrology clearance given acceptable bleeding risk and stable renal function.",
  "He was evaluated by geriatrics for worsening cognitive decline and safety evaluation related to home discharge.",
  "PT/OT performed a bedside mobility assessment showing weakness, unsteadiness, and need for moderate assistance with transfers.",
  "Case management consulted social work regarding home safety, fall prevention, and caregiver support.",
  "His hospital stay was complicated by mild hypoglycemia on hospital day 3, prompting insulin dose adjustments.",
  "Nutritional support was consulted to optimize diabetic-friendly, renal-adjusted diet appropriate for age and mobility.",
  "His depression management was reviewed with psychiatry, and sertraline was continued at 100 mg/day with no suggestion for dose change.",
  "A Montreal Cognitive Assessment (MoCA) was done revealing a score of 19/30, indicating significant mild cognitive impairment.",
  "Audiology was recommended due to hearing difficulty interfering with care discussions.",
  "Oral exam noted poor dentition; dental evaluation was recommended for follow-up to address suspected pain and poor appetite.",
  "After 6 days, the patient was clinically improved, mentally oriented, and ambulatory with the help of physical therapy.",
  "Cardiac and renal parameters stabilized sufficiently to permit safe discharge planning.",
  "The final hospital diagnosis included hyperosmolar hyperglycemic state, volume depletion, acute-on-chronic kidney injury, atrial fibrillation with RVR, and acute delirium.",
  "He was discharged on a simplified diabetic regimen including basal insulin and correctional sliding scale doses only.",
  "Apixaban, carvedilol, donepezil, and sertraline were continued with no changes.",
  "Discharge medication reconciliation included temporary hold of furosemide with plan for outpatient reassessment after fluid status recovery.",
  "Caregiver role was assumed by daughter who had durable power of attorney and assisted with all home-based needs.",
  "Written instructions and red flags for hyperglycemia, dizziness, and recurrent confusion were provided.",
  "A follow-up with his primary care physician, nephrologist, and endocrinologist were scheduled within one and two weeks respectively.",
  "Home health nursing was arranged to provide medication support and glucose monitoring.",
  "Nutritionist and physical therapy were ordered for continued improvement in diet and mobility.",
  "Advanced care planning was briefly discussed including code status, proxy, and end-of-life preferences.",
  "He is currently listed as full code but family is open to further discussion at next provider visit.",
  "Patient was grateful for hospital care and expressed motivation to remain active and well at home.",
  "The overall prognosis remains guarded due to progressive cognitive decline and limited renal reserve.",
  "Close monitoring for new signs of decompensation or medication nonadherence was advised.",
  "Pulmonology follow-up was discussed due to prior mild restrictive spirometry suggestive of early interstitial lung disease.",
  "Family history reveals mother died of complications from dementia and father from ischemic stroke.",
  "No reported use of tobacco, alcohol, or recreational drugs throughout his life.",
  "Lives in a single-story home with grab bars and minimal clutter, although risks for falls still persist.",
  "Wears eyeglasses but rarely uses his hearing aids, sometimes leading to miscommunication or withdrawal.",
  "History of previous admission 1 year ago for pneumonia requiring IV antibiotics and 6-day hospitalization.",
  "Documentation from that admission revealed transient delirium and impaired oral intake similar to current episode.",
  "Goals-of-care conversations were initiated during this admission but deferred for primary care setting follow-up.",
  "Social isolation remains a concern, especially since his wife passed away 3 years ago.",
  "Patient receives Meals on Wheels but misses many meal deliveries due to lack of reliable caregiver at times.",
  "Transportation to medical appointments is provided by his daughter, who balances full-time work responsibilities.",
  "No current enrollment in adult day health programs; options discussed with case management on discharge.",
  "Insurance covers home nursing and outpatient labs but does not cover custodial care.",
  "Patient was educated about Medicare Advantage benefits and encouraged to review covered services with the plan coordinator.",
  "He was also reminded of the importance of daily glucose checks and hydration in summer months.",
  "Foot exam demonstrated mild calluses and intact sensation; he denies new ulcers or foot injuries.",
  "Vaccination status confirmed: received influenza and COVID vaccines last fall, but is due for pneumococcal booster.",
  "Dentition issues may be contributing to decreased intake; dental clinic referral was sent through EHR.",
  "Assistive device for walking was provided (four-point cane) after physical therapy evaluation.",
  "Contact dermatitis on legs due to prolonged pressure and incontinence was treated with barrier cream.",
  "Skin care and bathing guidance were reviewed with family nursing staff prior to discharge.",
  "Patient verbalized understanding of all discharge instructions with support from daughter.",
  "Hospital team closed chart after discussing active problems list, response to therapy, and continued plan.",
  "Patient left the hospital in a wheelchair, accompanied by family, and appeared in good spirits.",
  "The full discharge plan was documented and faxed to his primary provider for continuity of care.",
  "Medication reconciliation showed no potential drug interactions or allergy mismatches.",
  "He was warned against use of NSAIDs due to underlying CKD and risk of acute worsening.",
  "Hydration goals of at least 1.5 liters per day were set; urination logs and symptom review were encouraged.",
  "Emergency instructions included what to do in case of unresponsiveness, low blood glucose, sudden confusion, or chest pain.",
  "Digital blood glucose monitor was reviewed at bedside; daughter demonstrated appropriate calibration and use.",
  "All prescriptions were sent electronically to their local pharmacy located eight blocks from their home.",
  "Patient prefers morning appointments due to increased alertness and energy early in the day.",
  "A follow-up MoCA test was recommended in 3–6 months to assess cognitive trajectory.",
  "Updated advance directives were placed in the chart and a copy was given to the daughter.",
  "Fall prevention strategies were emphasized including appropriate lighting, footwear, and scheduled ambulation.",
  "Use of automatic pill organizers was encouraged to improve adherence across complex medication schedules.",
  "Daily weights will be tracked at home to monitor for unexpected fluid retention or heart failure.",
  "Serum creatinine will be rechecked in one week given borderline rise during admission.",
  "A nephrology note was sent to alert about potential need for long-term planning if GFR continues to decline.",
  "Patient qualifies for shared savings Medicare model and was assigned a care coordinator temporarily.",
  "Patient support group information was handed out, including resources for caregivers.",
  "He is open to exploring telehealth check-ins for medication titration and early symptom triage.",
  "Daughter confirmed she has portal access to review labs and visit summaries on his behalf.",
  "Patient and daughter expressed appreciation for the hospital care coordination team.",
  "Case closed with summary of diagnosis, medications, specialists involved, and plan for 30-day transitional care.",
  "Status post discharge: stable, safe for home, alert and oriented with supervision."
]

,
"entities":[
  ["confusion", "weakness", "cardiovascular", "metabolic"],
  ["disorientation", "memory"],
  ["hypertension", "failure", "diabetes", "kidney", "depression"],
  ["lisinopril", "furosemide", "carvedilol", "insulin", "sertraline", "donepezil"],
  ["appetite", "vomiting", "incontinence"],
  ["pressure", "rate", "rhythm", "respiration", "saturation", "temperature"],
  ["mucosa", "turgor", "edema", "refill"],
  ["crackles", "heartbeat", "murmurs"],
  ["attention", "deficits"],
  ["glucose", "sodium", "potassium", "creatinine"],
  ["bun", "acidosis"],
  ["glucosuria", "ketonuria", "ketones"],
  ["hba1c", "control"],
  ["ecg", "fibrillation", "response", "ischemia"],
  ["cardiomegaly", "congestion", "effusions"],
  ["infarct", "hemorrhage", "microvascular"],
  ["hyperglycemia", "injury"],
  ["delirium", "derangements", "infection"],
  ["saline", "insulin", "telemetry"],
  ["furosemide", "depletion", "electrolytes"],
  ["cultures", "infection"],
  ["antibiotics", "ceftriaxone", "azithromycin", "pneumonia"],
  ["status", "hyperglycemia", "osmolarity"],
  ["bun", "creatinine", "sodium", "potassium"],
  ["fibrillation", "rate", "blocker"],
  ["apixaban", "function", "bleeding"],
  ["geriatrics", "cognition"],
  ["pt", "ot", "mobility", "weakness", "transfers"],
  ["safety", "falls"],
  ["hypoglycemia", "insulin"],
  ["nutrition", "diet"],
  ["depression", "psychiatry", "sertraline"],
  ["moca", "impairment"],
  ["audiology", "hearing"],
  ["dentition", "pain"],
  ["therapy", "ambulation"],
  ["parameters"],
  ["hyperglycemia", "depletion", "injury", "fibrillation", "delirium"],
  ["regimen", "insulin"],
  ["apixaban", "carvedilol", "donepezil", "sertraline"],
  ["reconciliation", "furosemide"],
  ["power"],
  ["hyperglycemia", "dizziness", "confusion"],
  ["nephrologist", "endocrinologist"],
  ["nursing", "glucose"],
  ["nutritionist", "therapy"],
  ["planning", "status", "proxy"],
  ["code"],
  ["prognosis", "cognition", "reserve"],
  ["monitoring", "decompensation", "adherence"],
  ["pulmonology", "spirometry", "disease"],
  ["dementia", "stroke"],
  ["tobacco", "alcohol", "drugs"],
  ["falls"],
  ["hearing"],
  ["pneumonia", "antibiotics"],
  ["delirium"],
  ["conversations"],
  ["isolation"],
  ["meals"],
  ["transportation"],
  ["enrollment"],
  ["insurance", "nursing", "labs"],
  ["medicare"],
  ["glucose", "hydration"],
  ["exam", "calluses", "ulcers"],
  ["vaccination", "influenza", "covid", "booster"],
  ["dentition", "referral"],
  ["cane"],
  ["dermatitis", "cream"],
  ["skin"],
  ["instructions"],
  ["problems", "therapy", "plan"],
  ["wheelchair"],
  ["continuity"],
  ["reconciliation", "interactions", "allergies"],
  ["nsaids"],
  ["hydration", "urination", "symptoms"],
  ["instructions", "glucose", "confusion", "pain"],
  ["monitor", "calibration"],
  ["prescriptions", "pharmacy"],
  ["appointments", "alertness", "energy"],
  ["moca"],
  ["directives"],
  ["prevention", "lighting", "footwear", "ambulation"],
  ["organizer", "adherence"],
  ["weight", "retention", "failure"],
  ["creatinine"],
  ["nephrology", "gfr"],
  ["medicare", "coordinator"],
  ["group", "caregivers"],
  ["telehealth", "titration", "triage"],
  ["portal", "labs", "summaries"],
  ["coordination"],
  ["diagnosis", "medications", "specialists", "care"],
  ["discharge", "supervision"]
]



}

best_umap = {"n_neighbors": 5, "n_components": 5, "min_dist":0.25,  "metric": "cosine"}
best_hdbscan = {"min_cluster_size": 2, "min_samples": 1, "metric": "euclidean"}

searcher = AllergyTopicSearcher(
    chunks=allergy_dataset["chunks"],
    entities_per_chunk=allergy_dataset["entities"],
    umap_params=best_umap,
    hdbscan_params=best_hdbscan,
    model_name="pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
)
print("✅ Model ready for querying.")
# print("\n=== 🧠 Generated Topics and Entities ===")
# for meta in searcher.topic_metadata:
#     topic_id = meta["topic_id"]
#     entities = ", ".join(meta["entities"])
#     print(f"🔹 Topic ID: {topic_id} — Entities: {entities}")

# === METRICS ===
coherence = compute_bertopic_coherence(searcher.topic_model, searcher.topic_metadata, topk=15)
diversity = compute_topic_diversity(searcher.topic_model, searcher.topic_metadata, topk=10)
sil_score = compute_silhouette_score_custom(searcher.topic_metadata)

print("\n=== Topic Quality Metrics ===")
print(f"🧪 Coherence Score (c_v): {coherence:.4f}")
print(f"🌈 Topic Diversity: {diversity:.4f}")
if sil_score is not None:
    print(f"📐 Silhouette Score: {sil_score:.4f}")
else:
    print("📐 Silhouette Score: Not applicable.")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model ready for querying.

=== Topic Quality Metrics ===
🧪 Coherence Score (c_v): 0.5826
🌈 Topic Diversity: 0.4672
📐 Silhouette Score: 0.5386


# Topic labeling

# Data read from the big query


In [3]:
import numpy as np
import pandas as pd
from google.cloud import bigquery
from sklearn.metrics.pairwise import cosine_similarity


def normalize_vectors(vectors: np.ndarray) -> np.ndarray:

    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Avoid division by zero by replacing zeros with small epsilon
    norms = np.where(norms == 0, 1e-10, norms)
    return vectors / norms


# === 1. Load CUI definitions and embeddings from BigQuery ===
project_id = "YOUR_PROJECT_ID"
dataset_id = "YOUR_DATASET"
table_id = "YOUR_TABLE_NAME"
table_ref = f"`{project_id}.{dataset_id}.{table_id}`"

client = bigquery.Client(project=project_id)
query = f"SELECT cui, definition, embedding FROM {table_ref}"
df_cui = client.query(query).to_dataframe()

# Stack all embeddings into np.array of shape (num_concepts, embedding_dim)
cui_embeddings = np.vstack(df_cui['embedding'].values).astype(float)

# Normalize CUI embeddings here explicitly
cui_embeddings = normalize_vectors(cui_embeddings)

cui_defs = df_cui['definition'].tolist()

top_k = 3  # Number of top candidate labels for each topic

rows = []

# === 2. LOOP OVER TOPICS from your AllergyTopicSearcher instance ===
# Assumes searcher.topic_metadata elements with keys:
# 'topic_id', 'entities', 'sentence_embeddings' (np.array, shape=(num_entities, embedding_dim))
for meta in searcher.topic_metadata:
    topic_id = meta["topic_id"]
    entities = meta["entities"]

    # Normalize all sentence embeddings before averaging
    sentence_embeddings_normalized = normalize_vectors(meta["sentence_embeddings"])

    # Compute average topic embedding and normalize again
    topic_emb = np.mean(sentence_embeddings_normalized, axis=0)
    topic_emb /= (np.linalg.norm(topic_emb) + 1e-10)

    # Compute cosine similarity: topic embedding vs all CUI embeddings (dot product since normalized)
    topic_term_sims = np.dot(cui_embeddings, topic_emb)

    # Clip similarity scores to [-1, 1]
    topic_term_sims = np.clip(topic_term_sims, -1.0, 1.0)

    # Select top-k candidate label indices sorted by similarity descending
    top_indices = topic_term_sims.argsort()[::-1][:top_k]

    candidate_embeddings = cui_embeddings[top_indices]
    candidate_defs = [cui_defs[i] for i in top_indices]
    candidate_similarities = topic_term_sims[top_indices]

    # Compute pairwise cosine similarity among top-k candidate labels
    pairwise_sims = cosine_similarity(candidate_embeddings)
    off_diag_mask = ~np.eye(pairwise_sims.shape[0], dtype=bool)
    off_diagonal_sims = pairwise_sims[off_diag_mask]
    topic_label_similarity = round(float(np.mean(off_diagonal_sims)), 2) if off_diagonal_sims.size > 0 else None

    # Assign each entity to the closest candidate label based on similarity
    entity_assignments = []
    entity_scores = []

    if entities and len(meta["sentence_embeddings"]) == len(entities):
        # Normalize entity embeddings as well
        entity_embeddings_normalized = normalize_vectors(meta["sentence_embeddings"])

        # Compute similarity matrix (entities x candidate labels)
        sims_entities_candidates = np.dot(entity_embeddings_normalized, candidate_embeddings.T)

        # Clip similarity scores
        sims_entities_candidates = np.clip(sims_entities_candidates, -1.0, 1.0)

        for i, ent in enumerate(entities):
            best_idx = sims_entities_candidates[i].argmax()
            best_score = float(sims_entities_candidates[i][best_idx])
            assigned_label = candidate_defs[best_idx]
            entity_assignments.append({
                "entity": ent,
                "assigned_label_definition": assigned_label,
                "sim_score": round(best_score, 2)
            })
            entity_scores.append(best_score)

        entity_label_score = round(float(np.mean(entity_scores)), 2) if entity_scores else None
    else:
        entity_label_score = None
        entity_assignments = []

    # Prepare topic-level label list with definition and similarity
    topic_level_labels = [(candidate_defs[i], round(float(candidate_similarities[i]), 2)) for i in range(len(top_indices))]

    rows.append({
        "topic_id": topic_id,
        "entities": entities,
        "topic_level_labels": topic_level_labels,  # list of (definition, similarity)
        "topic_Avg_label_similarity": topic_label_similarity,
        "entity_level_assignments": entity_assignments,  # list of dicts {entity, assigned_label_definition, sim_score}
        "entity_Avg_label_score": entity_label_score
    })

# === 3. Create pandas DataFrame and display ===
df = pd.DataFrame(rows)
pd.set_option('display.max_colwidth', None)

df
