In [None]:
# Approach 1: retrieval without an LLM (DocAI extraction + embeddings only)

In [None]:
# Google Cloud Document AI client
!pip install google-cloud-documentai

# Google Cloud BigQuery (for reading CUI vectors)
!pip install pandas-gbq google-cloud-bigquery

# Numpy
!pip install numpy

# Google GenAI SDK (for Vertex AI embeddings)
!pip install google-genai

# Optional but useful: pandas (for dataframe handling)
!pip install pandas


In [None]:
# ==============================================
# Complete Code: DocAI Extraction + Context Pyramid + CUI Graph
# ==============================================
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional, Any
import numpy as np
import re
from datetime import datetime
import os
import json

# ========== DocAI imports ==========
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1 import DocumentProcessorServiceClient

# ========== Your CONFIG ==========
PROJECT_ID = "YOUR_GCP_PROJECT"
# NOTE(review): LOCATION is used both in the Document AI processor path and as
# GOOGLE_CLOUD_LOCATION for the google-genai client below. Confirm that one
# value is valid for both services (Document AI uses multi-regions such as
# "us"/"eu"; Vertex AI presumably expects a region or "global") - TODO verify.
LOCATION = "us"  # or your DocAI location ("us", "eu", "global", etc)
BQ_CUI_TABLE = "your_project.your_dataset.cui_embeddings"
DOCAI_PROCESSOR_ID = "your-docai-processor-id"

# Set environment variables for Vertex AI embeddings.
# They are assigned before genai.Client() is constructed further down.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"

# ========== Vertex AI embeddings ==========
from google import genai
from google.genai.types import EmbedContentConfig
# Module-level client shared by gemini_embed().
_client = genai.Client()

def gemini_embed(texts: List[str], *, task_type: str="RETRIEVAL_DOCUMENT") -> np.ndarray:
    """Embed strings with the Vertex AI ``gemini-embedding-001`` model.

    Args:
        texts: Strings to embed; all of them are sent in one ``embed_content`` call.
        task_type: Task type forwarded through ``EmbedContentConfig``.

    Returns:
        A float32 array built from one embedding per returned result.
    """
    # NOTE(review): every text goes out in a single request; confirm the
    # per-request input limit of the chosen model before embedding large documents.
    response = _client.models.embed_content(
        model="gemini-embedding-001",
        contents=texts,
        config=EmbedContentConfig(task_type=task_type),
    )
    rows = [embedding.values for embedding in response.embeddings]
    return np.array(rows, dtype=np.float32)

# ========== BigQuery CUI vectors ==========
import pandas_gbq
def load_cui_vectors_bq(table_fqn: str = BQ_CUI_TABLE) -> Dict[str, np.ndarray]:
    """Read the ``cui`` and ``embedding`` columns of a BigQuery table into a dict.

    Args:
        table_fqn: Fully qualified table name interpolated into the SELECT statement.

    Returns:
        Mapping from each ``cui`` value to its embedding as a float32 array.
    """
    sql = f"SELECT cui, embedding FROM `{table_fqn}`"
    frame = pandas_gbq.read_gbq(sql, project_id=PROJECT_ID)
    vectors: Dict[str, np.ndarray] = {}
    for cui, embedding in zip(frame["cui"], frame["embedding"]):
        vectors[cui] = np.array(embedding, dtype=np.float32)
    return vectors

# Loaded once when the cell runs; consulted by attach_cuis_and_context_embeddings().
CUI_VECTORS = load_cui_vectors_bq(BQ_CUI_TABLE)

# ========== UMLS linker stub ==========
def umls_link(text: str) -> List[Dict[str, Any]]:
    """Placeholder UMLS linker; swap in a real one that yields ``{cui, score}`` dicts.

    The stub links nothing and always returns an empty list.
    """
    no_candidates: List[Dict[str, Any]] = []
    return no_candidates

# ========== DocAI Extraction ==========
def extract_docai_document(file_path: str, mime_type: Optional[str] = None) -> dict:
    """Process a local file with Google Cloud Document AI and return it as a dict.

    Args:
        file_path: Path to a local file (PDF, TIFF, etc.).
        mime_type: MIME type sent with the file. When omitted it is guessed from
            the file extension, falling back to "application/pdf".

    Returns:
        dict: The Document AI ``Document`` converted to JSON-style (camelCase) keys.
    """
    import mimetypes

    # Processors outside the default location are reached through a
    # location-specific endpoint, so derive it from LOCATION.
    client = DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{LOCATION}-documentai.googleapis.com"}
    )

    # Full resource name of the processor
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{DOCAI_PROCESSOR_ID}"

    if mime_type is None:
        mime_type = mimetypes.guess_type(file_path)[0] or "application/pdf"

    # Read the file into memory
    with open(file_path, "rb") as f:
        file_content = f.read()

    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=file_content, mime_type=mime_type),
    )
    result = client.process_document(request=request)

    # proto-plus exposes to_json on the message class, not on instances, so
    # result.document.to_json() would raise AttributeError.
    return json.loads(documentai.Document.to_json(result.document))

# ========== Date regex and normalization ==========
DATE_REGEX = re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})(?:[ ,;]+(\d{1,2}[:.]\d{2}))?\b")

# Month-first formats tried in order, after separators are normalised to "/".
_DATE_FORMATS = ("%m/%d/%Y", "%m/%d/%y")

def norm_date(s: str) -> Optional[str]:
    """Return the first valid month-first date found in ``s`` as "YYYY-MM-DD".

    Every DATE_REGEX match is tried in order, so an unparseable candidate
    (e.g. "99/99/2022") no longer hides a valid date later in the text.
    "-" and "/" separators are treated alike, including when mixed.

    Args:
        s: Free text that may contain a date; ``None`` or "" is allowed.

    Returns:
        The ISO date string, or ``None`` when no candidate parses.
    """
    if not s:
        return None
    for match in DATE_REGEX.finditer(s):
        raw = match.group(1).replace("-", "/")
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
            except ValueError:
                # Wrong format or impossible calendar date; try the next option.
                continue
    return None

# ========== Data classes ==========
@dataclass
class Section:
    """A titled run of body text found on one page by parse_docai_to_pyramid."""
    section_id: str  # "sec:<n>", where n is the number of sections created so far
    page: int  # 0-based index of the page in the parsed pages list
    title: str  # heading text, stored upper-cased
    text_snippet: str  # body lines joined with spaces, capped at 1000 characters
    emb: Optional[np.ndarray] = None  # set by attach_cuis_and_context_embeddings
    cui: Optional[str] = None  # best-scoring CUI from umls_link, when it returns any

@dataclass
class Entity:
    """A form-field or date mention extracted from one page by parse_docai_to_pyramid."""
    entity_id: str  # "<prefix>:<page>:<n>" with prefix "kv", "date" or "datei"
    page: int  # 0-based index of the page in the parsed pages list
    text: str  # "name: value" for form fields, or the full line that contains a date
    kind: str  # "kv" or "date", as assigned by the parser
    section_title: Optional[str]  # upper-cased enclosing heading; None for form-field entities
    value_norm_date: Optional[str]  # norm_date() result: "YYYY-MM-DD" or None
    emb: Optional[np.ndarray] = None  # set by attach_cuis_and_context_embeddings
    cui: Optional[str] = None  # best-scoring CUI from umls_link, when it returns any

# ========== Parse DocAI response into pyramid ==========
def parse_docai_to_pyramid(docai: Dict):
    """Turn a Document AI JSON dict into sections, entities and context strings.

    Both the raw Document JSON (top-level ``pages`` / ``text``) and the wrapped
    ``{"text": ..., "document": {"pages": [...]}}`` shape are accepted. Text is
    taken from ``textAnchor.content`` when present; otherwise it is rebuilt from
    the anchor's ``textSegments`` offsets into the document text, because raw
    Document AI output does not inline ``content`` for lines.

    Heading heuristic: a (stripped) line that ends with ":" or is all upper-case
    and at most 40 characters starts a new section. Lines that appear before the
    first heading of a page are ignored.

    Args:
        docai: Parsed Document AI JSON.

    Returns:
        ``(sections, entities, doc_ctx, page_ctx)``. ``doc_ctx`` is a "[DOC] ..."
        string of the unique headings; ``page_ctx`` maps each 0-based page index
        to a "[PAGE] ..." string of that page's headings.
    """
    docai = docai or {}
    wrapped = docai.get("document") or {}
    pages = docai.get("pages") or wrapped.get("pages") or []
    full_text = docai.get("text") or wrapped.get("text") or ""

    def anchored_text(node: Optional[Dict]) -> str:
        """Return the stripped text of a layout-like node."""
        node = node or {}
        anchor = node.get("textAnchor") or {}
        content = anchor.get("content") or node.get("content") or ""
        if not content and full_text:
            # JSON encodes int64 offsets as strings and may omit a zero start.
            content = "".join(
                full_text[int(seg.get("startIndex") or 0):int(seg.get("endIndex") or 0)]
                for seg in anchor.get("textSegments") or []
            )
        return content.strip()

    sections: List[Section] = []
    entities: List[Entity] = []
    headers_by_page: Dict[int, List[str]] = {}

    for p_idx, page in enumerate(pages):
        page = page or {}
        page_headers = headers_by_page.setdefault(p_idx, [])

        # Form fields become "kv" entities, plus a "date" entity when they hold a date.
        for ff in page.get("formFields") or []:
            name = anchored_text(ff.get("fieldName"))
            value = anchored_text(ff.get("fieldValue"))
            kv_text = f"{name}: {value}".strip(": ")
            kv_date = norm_date(kv_text)
            entities.append(Entity(
                entity_id=f"kv:{p_idx}:{len(entities)}",
                page=p_idx, text=kv_text, kind="kv",
                section_title=None, value_norm_date=kv_date
            ))
            if DATE_REGEX.search(kv_text):
                entities.append(Entity(
                    entity_id=f"date:{p_idx}:{len(entities)}",
                    page=p_idx, text=kv_text, kind="date",
                    section_title=None, value_norm_date=kv_date
                ))

        # Lines -> headings/sections. Sections never span pages.
        current_title: Optional[str] = None
        buf: List[str] = []

        def flush() -> None:
            # Emit the open section if it has both a title and body lines.
            if current_title and buf:
                sections.append(Section(
                    section_id=f"sec:{len(sections)}",
                    page=p_idx, title=current_title.upper(),
                    text_snippet=" ".join(buf)[:1000]
                ))
            buf.clear()

        for ln in page.get("lines") or []:
            t = anchored_text(ln.get("layout"))
            if not t:
                continue
            if t.endswith(":") or (t.isupper() and len(t) <= 40):
                flush()
                current_title = t.strip(":").strip()
                page_headers.append(current_title)
            elif current_title:
                buf.append(t)
                # One "date" entity per date mention on the line.
                for m in DATE_REGEX.finditer(t):
                    entities.append(Entity(
                        entity_id=f"datei:{p_idx}:{len(entities)}",
                        page=p_idx, text=t, kind="date",
                        section_title=current_title.upper(),
                        value_norm_date=norm_date(m.group(0))
                    ))
        flush()

    # Deterministic context strings (no LLM); dict.fromkeys de-duplicates in order.
    doc_headers = list(dict.fromkeys(h for lst in headers_by_page.values() for h in lst))
    doc_ctx = "[DOC] " + " | ".join(doc_headers)[:800]
    page_ctx = {p: "[PAGE] " + " | ".join(hs)[:400] for p, hs in headers_by_page.items()}
    return sections, entities, doc_ctx, page_ctx

# ========== Attach CUIs and context embeddings ==========
def attach_cuis_and_context_embeddings(sections: List[Section], entities: List[Entity],
                                       doc_ctx: str, page_ctx: Dict[int, str]) -> None:
    """Fill ``cui`` and ``emb`` on every section and entity, in place.

    Each object is passed through ``umls_link``; the best-scoring candidate's
    CUI is stored. If that CUI has a vector in ``CUI_VECTORS`` it becomes the
    embedding. Everything else is embedded in one ``gemini_embed`` call from a
    prompt that combines the document context, page context and the object's
    own text.
    """
    # (object, prompt) pairs still needing a model embedding, in visit order.
    pending: List[Tuple[Any, str]] = []

    for sec in sections:
        candidates = umls_link(f"{sec.title}: {sec.text_snippet}")
        if candidates:
            sec.cui = max(candidates, key=lambda c: c.get("score", 0.0))["cui"]
        if sec.cui and sec.cui in CUI_VECTORS:
            sec.emb = CUI_VECTORS[sec.cui]
            continue
        prompt = (
            f"{doc_ctx}\n{page_ctx.get(sec.page,'')}\n"
            f"[SECTION] {sec.title}\n[TEXT] {sec.text_snippet[:600]}"
        )
        pending.append((sec, prompt))

    for ent in entities:
        candidates = umls_link(ent.text)
        if candidates:
            ent.cui = max(candidates, key=lambda c: c.get("score", 0.0))["cui"]
        if ent.cui and ent.cui in CUI_VECTORS:
            ent.emb = CUI_VECTORS[ent.cui]
            continue
        prompt = (
            f"{doc_ctx}\n{page_ctx.get(ent.page,'')}\n"
            f"[SECTION] {ent.section_title or 'NA'}\n[ENTITY:{ent.kind}] {ent.text[:400]}"
        )
        pending.append((ent, prompt))

    if pending:
        vectors = gemini_embed([prompt for _, prompt in pending])
        for (target, _), vector in zip(pending, vectors):
            target.emb = vector

# ========== Similarity and scoring ==========
def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of two vectors; 1e-8 is added to each norm to avoid dividing by zero."""
    norm_a = float(np.linalg.norm(a) + 1e-8)
    norm_b = float(np.linalg.norm(b) + 1e-8)
    similarity = np.dot(a, b) / (norm_a * norm_b)
    return float(similarity)

def page_has_init_same_day(entities: List[Entity], day_iso: str, page: int) -> bool:
    """Report whether ``page`` has a date entity for ``day_iso`` whose text mentions "init".

    Only entities with ``kind == "date"`` on the given page and with a matching
    normalised date are considered. The text test is a case-insensitive
    substring check for "init", which also covers "initialization".
    """
    return any(
        "init" in (ent.text or "").lower()
        for ent in entities
        if ent.page == page and ent.kind == "date" and ent.value_norm_date == day_iso
    )

def same_day_score(day_iso: Optional[str], section: Section, entities_all: List[Entity]) -> float:
    """Score how strongly ``section`` is tied to the calendar day ``day_iso``.

    Returns 1.0 when the section text itself contains that date, 0.6 when the
    section's page carries an "init" date entity for that day, and 0.0
    otherwise (including when ``day_iso`` is empty).
    """
    if not day_iso:
        return 0.0
    snippet = section.text_snippet or ""
    if any(norm_date(hit.group(0)) == day_iso for hit in DATE_REGEX.finditer(snippet)):
        return 1.0
    if page_has_init_same_day(entities_all, day_iso, section.page):
        return 0.6
    return 0.0

def header_semantics(title: str) -> float:
    """Map a section title to a fixed prior weight based on keywords it contains.

    Titles mentioning a procedure/operation score 1.0, history or
    clinical-summary titles score 0.8, and anything else scores 0.3.
    Matching is case-insensitive; a missing title is treated as "".
    """
    lowered = (title or "").lower()
    # Tiers are checked in order, so the higher weight wins when both match.
    tiers = (
        (1.0, ("procedure", "operation")),
        (0.8, ("brief history", "clinical summary", "history")),
    )
    for weight, keywords in tiers:
        if any(keyword in lowered for keyword in keywords):
            return weight
    return 0.3

def association_score(date_ent: Entity, section: Section, day_iso: str, entities_all: List[Entity]) -> Tuple[float, Dict[str,float]]:
    """Combine four signals into one date-to-section association score.

    The signals are embedding similarity (``S_local``), same-day evidence
    (``T_same``), the header prior (``H_sem``) and a same-page bonus
    (``P_page``), weighted 0.35 / 0.30 / 0.25 / 0.10.

    Returns:
        ``(score, components)`` where ``components`` holds the unweighted signals.
    """
    have_both_embeddings = date_ent.emb is not None and section.emb is not None
    # NOTE(review): P_page peaks at 0.2 before its 0.10 weight is applied, so it
    # adds at most 0.02 to the score - confirm that this is intended.
    parts = {
        "S_local": cosine(date_ent.emb, section.emb) if have_both_embeddings else 0.0,
        "T_same": same_day_score(day_iso, section, entities_all),
        "H_sem": header_semantics(section.title),
        "P_page": 0.2 if date_ent.page == section.page else 0.0,
    }
    total = (0.35*parts["S_local"]) + (0.30*parts["T_same"]) + (0.25*parts["H_sem"]) + (0.10*parts["P_page"])
    return float(total), parts

# ========== Build model context ==========
def build_model_context(query: str, day_iso: Optional[str], sections: List[Section], entities: List[Entity],
                        top_n: int = 4) -> Tuple[str, List[Section], Dict[str,float]]:
    """Pick up to ``top_n`` sections for a query and render them as one context string.

    Sections are ranked by cosine similarity to the query embedding. When a
    section's page has date entities for ``day_iso``, the section is kept only
    if its best ``association_score`` exceeds 0.5; sections with no such
    entities are kept on similarity rank alone.

    Args:
        query: Natural-language question.
        day_iso: Target day as "YYYY-MM-DD"; ``None`` disables date association.
        sections: Sections with embeddings already attached.
        entities: All extracted entities.
        top_n: Maximum number of sections to return.

    Returns:
        ``(context_str, selected_sections, score_breakdown)`` where
        ``score_breakdown`` maps a section id to its best association score.
    """
    embedded = [s for s in sections if s.emb is not None]
    if not embedded or top_n <= 0:
        # Nothing can be ranked or selected; skip the embedding request.
        return "", [], {}

    # The query side of retrieval uses the query task type, not the document one.
    # NOTE(review): sections embedded from CUI_VECTORS must have the same
    # dimensionality as the model embeddings for cosine() to work - confirm.
    query_emb = gemini_embed([query], task_type="RETRIEVAL_QUERY")[0]
    ranked_secs = sorted(
        ((cosine(query_emb, s.emb), s) for s in embedded),
        key=lambda pair: pair[0], reverse=True
    )

    selected_secs: List[Section] = []
    score_breakdown: Dict[str, float] = {}

    for _, s in ranked_secs:
        if len(selected_secs) >= top_n:
            break
        # Without a target day there is nothing to associate. This also stops
        # entities whose date failed to normalise (None) from matching a None day.
        date_ents = [
            e for e in entities
            if e.page == s.page and e.kind == "date" and e.value_norm_date == day_iso
        ] if day_iso else []
        if not date_ents:
            selected_secs.append(s)
            continue
        best_score = max(association_score(de, s, day_iso, entities)[0] for de in date_ents)
        score_breakdown[s.section_id] = best_score
        if best_score > 0.5:
            selected_secs.append(s)

    # Build final context string
    context_str = "\n---\n".join(f"[SECTION] {s.title}\n{s.text_snippet[:700]}" for s in selected_secs)
    return context_str, selected_secs, score_breakdown

# ========== Query the pyramid ==========
def query_context_cui(query: str, day_iso: Optional[str], sections: List[Section], entities: List[Entity],
                      top_n: int = 4):
    """Build the context for a query and print the selected sections (demo output only)."""
    context_str, selected_secs, _ = build_model_context(query, day_iso, sections, entities, top_n)
    # For demo, just print results; replace with your LLM prompt as needed
    header = [f"Query: {query} | Date: {day_iso}\n", "Selected context sections:"]
    bullets = [
        f" - {sec.title} (Page {sec.page}): {sec.text_snippet[:150]}..."
        for sec in selected_secs
    ]
    print("\n".join(header + bullets))
    print("\nFull context passed to LLM:\n", context_str)

# ========== MAIN ==========
def main():
    """Run the no-LLM demo end to end on a local PDF and print the selected context."""
    # Step 1: Document AI extraction
    pdf_path = "your_document.pdf"  # Replace with your PDF path
    print(f"Extracting document via DocAI: {pdf_path} ...")
    docai_json = extract_docai_document(pdf_path)

    # Step 2: sections / entities / context strings
    print("Parsing DocAI response to sections/entities ...")
    sections, entities, doc_ctx, page_ctx = parse_docai_to_pyramid(docai_json)

    # Step 3: CUIs and embeddings, attached in place
    print("Attaching CUIs and embeddings ...")
    attach_cuis_and_context_embeddings(sections, entities, doc_ctx, page_ctx)

    # Step 4: example query with its pre-normalised date
    example_query = "What procedure was done on 02/21/2022?"
    example_day = "2022-02-21"
    print("\nQuerying context pyramid ...")
    query_context_cui(example_query, example_day, sections, entities)

if __name__ == "__main__":
    main()


In [None]:
# Use the code below if the Document AI output has already been generated
!pip install google-cloud-documentai
!pip install google-cloud-aiplatform
# or if you want the latest genai package
!pip install google-genai

In [None]:
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional, Any
import numpy as np
import re
from datetime import datetime
import os

# ----------------------------
# CONFIG
# ----------------------------
PROJECT_ID = "YOUR_GCP_PROJECT"
LOCATION = "global"
BQ_CUI_TABLE = "your_project.your_dataset.cui_embeddings"

# These are assigned before genai.Client() is constructed below.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"

from google import genai
# google.genai.types has no TextGenerationConfig, so importing it fails the
# whole cell. GenerateContentConfig is the generation-settings type taken by
# client.models.generate_content.
from google.genai.types import EmbedContentConfig, GenerateContentConfig

import pandas_gbq

# Module-level client shared by the embedding and generation helpers.
_client = genai.Client()

# ----------------------------
# Utility functions & classes
# ----------------------------

DATE_REGEX = re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})(?:[ ,;]+(\d{1,2}[:.]\d{2}))?\b")

# Month-first formats tried in order, after separators are normalised to "/".
_DATE_FORMATS = ("%m/%d/%Y", "%m/%d/%y")

def norm_date(s: str) -> Optional[str]:
    """Return the first valid month-first date found in ``s`` as "YYYY-MM-DD".

    Every DATE_REGEX match is tried in order, so an unparseable candidate
    (e.g. "99/99/2022") no longer hides a valid date later in the text.
    "-" and "/" separators are treated alike, including when mixed.

    Args:
        s: Free text that may contain a date; ``None`` or "" is allowed.

    Returns:
        The ISO date string, or ``None`` when no candidate parses.
    """
    if not s:
        return None
    for match in DATE_REGEX.finditer(s):
        raw = match.group(1).replace("-", "/")
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
            except ValueError:
                # Wrong format or impossible calendar date; try the next option.
                continue
    return None

@dataclass
class Section:
    """A titled run of body text found on one page by parse_docai_to_pyramid."""
    section_id: str  # "sec:<n>", where n is the number of sections created so far
    page: int  # 0-based index of the page in the parsed pages list
    title: str  # heading text, stored upper-cased
    text_snippet: str  # body lines joined with spaces, capped at 1000 characters
    emb: Optional[np.ndarray] = None  # set by attach_cuis_and_context_embeddings
    cui: Optional[str] = None  # best-scoring CUI from umls_link, when it returns any

@dataclass
class Entity:
    """A form-field or date mention extracted from one page by parse_docai_to_pyramid."""
    entity_id: str  # "<prefix>:<page>:<n>" with prefix "kv", "date" or "datei"
    page: int  # 0-based index of the page in the parsed pages list
    text: str  # "name: value" for form fields, or the full line that contains a date
    kind: str  # "kv" or "date", as assigned by the parser
    section_title: Optional[str]  # upper-cased enclosing heading; None for form-field entities
    value_norm_date: Optional[str]  # norm_date() result: "YYYY-MM-DD" or None
    emb: Optional[np.ndarray] = None  # set by attach_cuis_and_context_embeddings
    cui: Optional[str] = None  # best-scoring CUI from umls_link, when it returns any

# ----------------------------
# Load CUI vectors from BigQuery
# ----------------------------
def load_cui_vectors_bq(table_fqn: str = BQ_CUI_TABLE) -> Dict[str, np.ndarray]:
    """Read the ``cui`` and ``embedding`` columns of a BigQuery table into a dict.

    Args:
        table_fqn: Fully qualified table name interpolated into the SELECT statement.

    Returns:
        Mapping from each ``cui`` value to its embedding as a float32 array.
    """
    sql = f"SELECT cui, embedding FROM `{table_fqn}`"
    frame = pandas_gbq.read_gbq(sql, project_id=PROJECT_ID)
    vectors: Dict[str, np.ndarray] = {}
    for cui, embedding in zip(frame["cui"], frame["embedding"]):
        vectors[cui] = np.array(embedding, dtype=np.float32)
    return vectors

# Loaded once when the cell runs; consulted by attach_cuis_and_context_embeddings().
CUI_VECTORS = load_cui_vectors_bq(BQ_CUI_TABLE)

# ----------------------------
# Embeddings - Gemini embedding API
# ----------------------------
def gemini_embed(texts: List[str], *, task_type: str="RETRIEVAL_DOCUMENT") -> np.ndarray:
    """Embed strings with the Vertex AI ``gemini-embedding-001`` model.

    Args:
        texts: Strings to embed; all of them are sent in one ``embed_content`` call.
        task_type: Task type forwarded through ``EmbedContentConfig``.

    Returns:
        A float32 array built from one embedding per returned result.
    """
    # NOTE(review): every text goes out in a single request; confirm the
    # per-request input limit of the chosen model before embedding large documents.
    response = _client.models.embed_content(
        model="gemini-embedding-001",
        contents=texts,
        config=EmbedContentConfig(task_type=task_type),
    )
    rows = [embedding.values for embedding in response.embeddings]
    return np.array(rows, dtype=np.float32)

# ----------------------------
# Stub: UMLS linker - Replace with your own
# ----------------------------
def umls_link(text: str) -> List[Dict[str, Any]]:
    """Placeholder linker from text to CUIs; intended to yield ``{cui, score}`` dicts.

    Nothing is implemented yet, so every call returns an empty list.
    """
    # Implement your UMLS/CUI linking here.
    no_candidates: List[Dict[str, Any]] = []
    return no_candidates

# ----------------------------
# Parse DocAI response into sections and entities
# ----------------------------
def parse_docai_to_pyramid(docai: Dict):
    """Turn a Document AI JSON dict into sections, entities and context strings.

    Both the wrapped ``{"text": ..., "document": {"pages": [...]}}`` shape and
    the raw Document JSON (top-level ``pages`` / ``text``) are accepted. Text is
    taken from ``textAnchor.content`` when present; otherwise it is rebuilt from
    the anchor's ``textSegments`` offsets into the document text, because raw
    Document AI output does not inline ``content`` for lines.

    Heading heuristic: a (stripped) line that ends with ":" or is all upper-case
    and at most 40 characters starts a new section. Lines that appear before the
    first heading of a page are ignored.

    Args:
        docai: Parsed Document AI JSON.

    Returns:
        ``(sections, entities, doc_ctx, page_ctx)``. ``doc_ctx`` is a "[DOC] ..."
        string of the unique headings; ``page_ctx`` maps each 0-based page index
        to a "[PAGE] ..." string of that page's headings.
    """
    docai = docai or {}
    wrapped = docai.get("document") or {}
    pages = wrapped.get("pages") or docai.get("pages") or []
    full_text = docai.get("text") or wrapped.get("text") or ""

    def anchored_text(node: Optional[Dict]) -> str:
        """Return the stripped text of a layout-like node."""
        node = node or {}
        anchor = node.get("textAnchor") or {}
        content = anchor.get("content") or node.get("content") or ""
        if not content and full_text:
            # JSON encodes int64 offsets as strings and may omit a zero start.
            content = "".join(
                full_text[int(seg.get("startIndex") or 0):int(seg.get("endIndex") or 0)]
                for seg in anchor.get("textSegments") or []
            )
        return content.strip()

    sections: List[Section] = []
    entities: List[Entity] = []
    headers_by_page: Dict[int, List[str]] = {}

    for p_idx, page in enumerate(pages):
        page = page or {}
        page_headers = headers_by_page.setdefault(p_idx, [])

        # Key-value pairs become "kv" entities, plus a "date" entity when they hold a date.
        for ff in page.get("formFields") or []:
            name = anchored_text(ff.get("fieldName"))
            value = anchored_text(ff.get("fieldValue"))
            kv_text = f"{name}: {value}".strip(": ")
            kv_date = norm_date(kv_text)
            entities.append(Entity(
                entity_id=f"kv:{p_idx}:{len(entities)}",
                page=p_idx, text=kv_text, kind="kv",
                section_title=None, value_norm_date=kv_date
            ))
            if DATE_REGEX.search(kv_text):
                entities.append(Entity(
                    entity_id=f"date:{p_idx}:{len(entities)}",
                    page=p_idx, text=kv_text, kind="date",
                    section_title=None, value_norm_date=kv_date
                ))

        # Lines -> headings/sections. Sections never span pages.
        current_title: Optional[str] = None
        buf: List[str] = []

        def flush() -> None:
            # Emit the open section if it has both a title and body lines.
            if current_title and buf:
                sections.append(Section(
                    section_id=f"sec:{len(sections)}",
                    page=p_idx, title=current_title.upper(),
                    text_snippet=" ".join(buf)[:1000]
                ))
            buf.clear()

        for ln in page.get("lines") or []:
            t = anchored_text(ln.get("layout"))
            if not t:
                continue
            if t.endswith(":") or (t.isupper() and len(t) <= 40):
                flush()
                current_title = t.strip(":").strip()
                page_headers.append(current_title)
            elif current_title:
                buf.append(t)
                # One "date" entity per date mention on the line.
                for m in DATE_REGEX.finditer(t):
                    entities.append(Entity(
                        entity_id=f"datei:{p_idx}:{len(entities)}",
                        page=p_idx, text=t, kind="date",
                        section_title=current_title.upper(),
                        value_norm_date=norm_date(m.group(0))
                    ))
        flush()

    # Deterministic context strings (no LLM); dict.fromkeys de-duplicates in order.
    doc_headers = list(dict.fromkeys(h for lst in headers_by_page.values() for h in lst))
    doc_ctx = "[DOC] " + " | ".join(doc_headers)[:800]
    page_ctx = {p: "[PAGE] " + " | ".join(hs)[:400] for p, hs in headers_by_page.items()}
    return sections, entities, doc_ctx, page_ctx

# ----------------------------
# Attach CUIs and embeddings
# ----------------------------
def attach_cuis_and_context_embeddings(sections: List[Section], entities: List[Entity],
                                       doc_ctx: str, page_ctx: Dict[int, str]) -> None:
    """Fill ``cui`` and ``emb`` on every section and entity, in place.

    Each object is passed through ``umls_link``; the best-scoring candidate's
    CUI is stored. If that CUI has a vector in ``CUI_VECTORS`` it becomes the
    embedding. Everything else is embedded in one ``gemini_embed`` call from a
    prompt that combines the document context, page context and the object's
    own text.
    """
    # (object, prompt) pairs still needing a model embedding, in visit order.
    pending: List[Tuple[Any, str]] = []

    for sec in sections:
        candidates = umls_link(f"{sec.title}: {sec.text_snippet}")
        if candidates:
            sec.cui = max(candidates, key=lambda c: c.get("score", 0.0))["cui"]
        if sec.cui and sec.cui in CUI_VECTORS:
            sec.emb = CUI_VECTORS[sec.cui]
            continue
        prompt = (
            f"{doc_ctx}\n{page_ctx.get(sec.page,'')}\n"
            f"[SECTION] {sec.title}\n[TEXT] {sec.text_snippet[:600]}"
        )
        pending.append((sec, prompt))

    for ent in entities:
        candidates = umls_link(ent.text)
        if candidates:
            ent.cui = max(candidates, key=lambda c: c.get("score", 0.0))["cui"]
        if ent.cui and ent.cui in CUI_VECTORS:
            ent.emb = CUI_VECTORS[ent.cui]
            continue
        prompt = (
            f"{doc_ctx}\n{page_ctx.get(ent.page,'')}\n"
            f"[SECTION] {ent.section_title or 'NA'}\n[ENTITY:{ent.kind}] {ent.text[:400]}"
        )
        pending.append((ent, prompt))

    if pending:
        vectors = gemini_embed([prompt for _, prompt in pending])
        for (target, _), vector in zip(pending, vectors):
            target.emb = vector

# ----------------------------
# Cosine similarity
# ----------------------------
def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of two vectors; 1e-8 is added to each norm to avoid dividing by zero."""
    norm_a = float(np.linalg.norm(a) + 1e-8)
    norm_b = float(np.linalg.norm(b) + 1e-8)
    similarity = np.dot(a, b) / (norm_a * norm_b)
    return float(similarity)

# ----------------------------
# Build section similarity graph (in-memory)
# ----------------------------
def build_similarity_graph(sections: List[Section]) -> Dict[Tuple[str, str], float]:
    """Link every pair of embedded sections whose cosine similarity exceeds 0.75.

    Sections without an embedding are ignored. Each qualifying pair is stored
    in both directions.

    Returns:
        Mapping from ``(section_id, section_id)`` to the pair's similarity.
    """
    threshold = 0.75  # similarity threshold, tune as needed
    embedded = [sec for sec in sections if sec.emb is not None]
    edges: Dict[Tuple[str, str], float] = {}

    for pos, left in enumerate(embedded):
        for right in embedded[pos + 1:]:
            sim = cosine(left.emb, right.emb)
            if sim > threshold:
                edges[(left.section_id, right.section_id)] = sim
                edges[(right.section_id, left.section_id)] = sim
    return edges

# ----------------------------
# Query embedding and search
# ----------------------------
def query_similar_sections(query: str, sections: List[Section], top_k=5) -> List[Tuple[Section, float]]:
    """Return the ``top_k`` sections most similar to ``query``, best first.

    Args:
        query: Natural-language query text.
        sections: Candidate sections; those without an embedding are ignored.
        top_k: Maximum number of results; values <= 0 yield an empty list.

    Returns:
        ``(section, cosine_similarity)`` pairs sorted by descending similarity.
    """
    candidates = [s for s in sections if s.emb is not None]
    if not candidates or top_k <= 0:
        # Nothing to rank, so skip the embedding request.
        return []
    # The query side of retrieval uses the query task type; the sections were
    # embedded with the document task type.
    q_emb = gemini_embed([query], task_type="RETRIEVAL_QUERY")[0]
    scored = [(s, cosine(q_emb, s.emb)) for s in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_k]

# ----------------------------
# Generate answer with LLM
# ----------------------------
def generate_answer_llm(query: str, relevant_sections: List[Section],
                        model: str = "gemini-2.5-flash", max_output_tokens: int = 256) -> str:
    """Answer ``query`` from the given sections with a Gemini model on Vertex AI.

    Args:
        query: The user question.
        relevant_sections: Sections whose title and text form the prompt context.
        model: Model id passed to ``generate_content``.
            NOTE(review): confirm this id is available in your project/location.
        max_output_tokens: Upper bound on generated tokens; may need raising
            for models that spend output tokens on reasoning.

    Returns:
        The stripped answer text, or "" when the response carries no text.
    """
    # Imported here so the function does not depend on the cell's import list.
    from google.genai.types import GenerateContentConfig

    context_text = "\n---\n".join(f"[{s.title}]\n{s.text_snippet}" for s in relevant_sections)
    prompt = f"Using the following context from the document, answer the query:\n\nContext:\n{context_text}\n\nQuery:\n{query}\n\nAnswer:"

    # The google-genai client generates text through models.generate_content;
    # it has no predict() method.
    resp = _client.models.generate_content(
        model=model,
        contents=prompt,
        config=GenerateContentConfig(max_output_tokens=max_output_tokens, temperature=0.0),
    )
    return (resp.text or "").strip()

# ----------------------------
# Example main flow
# ----------------------------
def main(docai_response: Dict, user_query: str) -> str:
    """Parse a DocAI response, embed it, retrieve context for the query and return the LLM answer."""
    sections, entities, doc_ctx, page_ctx = parse_docai_to_pyramid(docai_response)
    attach_cuis_and_context_embeddings(sections, entities, doc_ctx, page_ctx)

    # Optional, for exploration only: the graph is built but not used for the answer.
    _similarity_graph = build_similarity_graph(sections)

    top_sections = [section for section, _ in query_similar_sections(user_query, sections)]
    return generate_answer_llm(user_query, top_sections)

# ----------------------------
# If you want to test:
# ----------------------------
docai_resp = ...  # Load your DocAI JSON output here (e.g. with json.load)
query = "diagnosis of diabetes"
# The Ellipsis placeholder is not a parsed document, so only run the pipeline
# once it has been replaced with real data.
if docai_resp is not Ellipsis:
    print(main(docai_resp, query))


In [None]:
# Use this code for the complete pipeline, including the LLM

In [None]:
from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional, Any
import numpy as np
import re
from datetime import datetime
import os
import json

# Google imports
from google.cloud import documentai_v1 as documentai
from google import genai
# google.genai.types has no TextGenerationConfig, so importing it fails the
# whole cell. GenerateContentConfig is the generation-settings type taken by
# client.models.generate_content.
from google.genai.types import EmbedContentConfig, GenerateContentConfig
import pandas_gbq

# ----------------------------
# CONFIG (fill in your own)
# ----------------------------
PROJECT_ID = "YOUR_GCP_PROJECT"
# NOTE(review): LOCATION feeds both the Document AI processor path and
# GOOGLE_CLOUD_LOCATION for the google-genai client; confirm one value is valid
# for both services.
LOCATION = "us"  # or your processor location
PROCESSOR_ID = "YOUR_PROCESSOR_ID"
GCS_INPUT_URI = "gs://your-bucket/your-file.pdf"
BQ_CUI_TABLE = "your_project.your_dataset.cui_embeddings"

# These are assigned before genai.Client() is constructed below.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"

# Clients
# Processors outside the default location are reached through a
# location-specific endpoint, so derive it from LOCATION.
docai_client = documentai.DocumentProcessorServiceClient(
    client_options={"api_endpoint": f"{LOCATION}-documentai.googleapis.com"}
)
genai_client = genai.Client()

# ----------------------------
# 1. DocAI Extraction (from first script)
# ----------------------------

def get_text(text_anchor, document_text) -> str:
    """Concatenate the slices of ``document_text`` addressed by a text anchor.

    Each segment's ``start_index`` / ``end_index`` is treated as 0 when falsy.
    Returns "" when the anchor is missing or has no segments; otherwise the
    joined text with surrounding whitespace stripped.
    """
    if not text_anchor or not text_anchor.text_segments:
        return ""
    pieces = [
        document_text[(segment.start_index or 0):(segment.end_index or 0)]
        for segment in text_anchor.text_segments
    ]
    return "".join(pieces).strip()

def run_docai_extraction() -> Dict:
    """Process ``GCS_INPUT_URI`` with Document AI and return a simplified dict.

    Returns:
        ``{"text": <full text>, "document": {"pages": [...]}}`` where each page
        has ``pageNumber``, ``formFields`` and ``lines``, with the text of every
        field and line already resolved into ``textAnchor.content``.
    """
    name = f"projects/{PROJECT_ID}/locations/{LOCATION}/processors/{PROCESSOR_ID}"
    # Online processing takes a Cloud Storage object through
    # ProcessRequest.gcs_document; input_documents is a BatchProcessRequest
    # field and is rejected here.
    gcs_document = documentai.GcsDocument(gcs_uri=GCS_INPUT_URI, mime_type="application/pdf")
    request = documentai.ProcessRequest(name=name, gcs_document=gcs_document)
    document = docai_client.process_document(request=request).document

    pages_out = []
    for page in document.pages:
        form_fields = []
        for field in page.form_fields:
            field_name = get_text(field.field_name.text_anchor, document.text)
            field_value = get_text(field.field_value.text_anchor, document.text)
            form_fields.append({
                "fieldName": {"textAnchor": {"content": field_name}},
                "fieldValue": {"textAnchor": {"content": field_value}},
            })

        # lines (for parser)
        lines = []
        for line in page.lines:
            content = get_text(line.layout.text_anchor, document.text)
            lines.append({
                "layout": {"textAnchor": {"content": content}},
                "text": content
            })

        pages_out.append({
            "pageNumber": page.page_number,
            "formFields": form_fields,
            "lines": lines,
        })

    return {"text": document.text, "document": {"pages": pages_out}}

# ----------------------------
# 2. Everything else from second script (unchanged)
# ----------------------------

DATE_REGEX = re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})(?:[ ,;]+(\d{1,2}[:.]\d{2}))?\b")

# Month-first formats tried in order, after separators are normalised to "/".
_DATE_FORMATS = ("%m/%d/%Y", "%m/%d/%y")

def norm_date(s: str) -> Optional[str]:
    """Return the first valid month-first date found in ``s`` as "YYYY-MM-DD".

    Every DATE_REGEX match is tried in order, so an unparseable candidate
    (e.g. "99/99/2022") no longer hides a valid date later in the text.
    "-" and "/" separators are treated alike, including when mixed.

    Args:
        s: Free text that may contain a date; ``None`` or "" is allowed.

    Returns:
        The ISO date string, or ``None`` when no candidate parses.
    """
    if not s:
        return None
    for match in DATE_REGEX.finditer(s):
        raw = match.group(1).replace("-", "/")
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
            except ValueError:
                # Wrong format or impossible calendar date; try the next option.
                continue
    return None

@dataclass
class Section:
    """A titled run of body text found on one page by parse_docai_to_pyramid."""
    section_id: str  # "sec:<n>", where n is the number of sections created so far
    page: int  # 0-based index of the page in the parsed pages list
    title: str  # heading text, stored upper-cased
    text_snippet: str  # body lines joined with spaces, capped at 1000 characters
    emb: Optional[np.ndarray] = None  # set by attach_cuis_and_context_embeddings
    cui: Optional[str] = None  # best-scoring CUI from umls_link, when it returns any

@dataclass
class Entity:
    """A form-field or date mention extracted from one page by parse_docai_to_pyramid."""
    entity_id: str  # "<prefix>:<page>:<n>" with prefix "kv", "date" or "datei"
    page: int  # 0-based index of the page in the parsed pages list
    text: str  # "name: value" for form fields, or the full line that contains a date
    kind: str  # "kv" or "date", as assigned by the parser
    section_title: Optional[str]  # upper-cased enclosing heading; None for form-field entities
    value_norm_date: Optional[str]  # norm_date() result: "YYYY-MM-DD" or None
    emb: Optional[np.ndarray] = None  # set by attach_cuis_and_context_embeddings
    cui: Optional[str] = None  # best-scoring CUI from umls_link, when it returns any

def load_cui_vectors_bq(table_fqn: str = BQ_CUI_TABLE) -> Dict[str, np.ndarray]:
    """Read the ``cui`` and ``embedding`` columns of a BigQuery table into a dict.

    Args:
        table_fqn: Fully qualified table name interpolated into the SELECT statement.

    Returns:
        Mapping from each ``cui`` value to its embedding as a float32 array.
    """
    sql = f"SELECT cui, embedding FROM `{table_fqn}`"
    frame = pandas_gbq.read_gbq(sql, project_id=PROJECT_ID)
    vectors: Dict[str, np.ndarray] = {}
    for cui, embedding in zip(frame["cui"], frame["embedding"]):
        vectors[cui] = np.array(embedding, dtype=np.float32)
    return vectors

# Loaded once when the cell runs; consulted by attach_cuis_and_context_embeddings().
CUI_VECTORS = load_cui_vectors_bq(BQ_CUI_TABLE)

def gemini_embed(texts: List[str], *, task_type: str = "RETRIEVAL_DOCUMENT") -> np.ndarray:
    """Embed strings with the Vertex AI ``gemini-embedding-001`` model.

    Args:
        texts: Strings to embed; all of them are sent in one ``embed_content`` call.
        task_type: Task type forwarded through ``EmbedContentConfig``.

    Returns:
        A float32 array built from one embedding per returned result.
    """
    # NOTE(review): every text goes out in a single request; confirm the
    # per-request input limit of the chosen model before embedding large documents.
    response = genai_client.models.embed_content(
        model="gemini-embedding-001",
        contents=texts,
        config=EmbedContentConfig(task_type=task_type),
    )
    rows = [embedding.values for embedding in response.embeddings]
    return np.array(rows, dtype=np.float32)

def umls_link(text: str) -> List[Dict[str, Any]]:
    """Placeholder UMLS linker; implement your own linking here.

    The stub links nothing and always returns an empty list.
    """
    no_candidates: List[Dict[str, Any]] = []
    return no_candidates

def parse_docai_to_pyramid(docai: Dict):
    """Turn a Document AI JSON dict into sections, entities and context strings.

    Both the wrapped ``{"text": ..., "document": {"pages": [...]}}`` shape and
    the raw Document JSON (top-level ``pages`` / ``text``) are accepted. Text is
    taken from ``textAnchor.content`` when present; otherwise it is rebuilt from
    the anchor's ``textSegments`` offsets into the document text, because raw
    Document AI output does not inline ``content`` for lines.

    Heading heuristic: a (stripped) line that ends with ":" or is all upper-case
    and at most 40 characters starts a new section. Lines that appear before the
    first heading of a page are ignored.

    Args:
        docai: Parsed Document AI JSON.

    Returns:
        ``(sections, entities, doc_ctx, page_ctx)``. ``doc_ctx`` is a "[DOC] ..."
        string of the unique headings; ``page_ctx`` maps each 0-based page index
        to a "[PAGE] ..." string of that page's headings.
    """
    docai = docai or {}
    wrapped = docai.get("document") or {}
    pages = wrapped.get("pages") or docai.get("pages") or []
    full_text = docai.get("text") or wrapped.get("text") or ""

    def anchored_text(node: Optional[Dict]) -> str:
        """Return the stripped text of a layout-like node."""
        node = node or {}
        anchor = node.get("textAnchor") or {}
        content = anchor.get("content") or node.get("content") or ""
        if not content and full_text:
            # JSON encodes int64 offsets as strings and may omit a zero start.
            content = "".join(
                full_text[int(seg.get("startIndex") or 0):int(seg.get("endIndex") or 0)]
                for seg in anchor.get("textSegments") or []
            )
        return content.strip()

    sections: List[Section] = []
    entities: List[Entity] = []
    headers_by_page: Dict[int, List[str]] = {}

    for p_idx, page in enumerate(pages):
        page = page or {}
        page_headers = headers_by_page.setdefault(p_idx, [])

        # Form fields become "kv" entities, plus a "date" entity when they hold a date.
        for ff in page.get("formFields") or []:
            name = anchored_text(ff.get("fieldName"))
            value = anchored_text(ff.get("fieldValue"))
            kv_text = f"{name}: {value}".strip(": ")
            kv_date = norm_date(kv_text)
            entities.append(Entity(
                entity_id=f"kv:{p_idx}:{len(entities)}",
                page=p_idx, text=kv_text, kind="kv",
                section_title=None, value_norm_date=kv_date
            ))
            if DATE_REGEX.search(kv_text):
                entities.append(Entity(
                    entity_id=f"date:{p_idx}:{len(entities)}",
                    page=p_idx, text=kv_text, kind="date",
                    section_title=None, value_norm_date=kv_date
                ))

        # Lines -> headings/sections. Sections never span pages.
        current_title: Optional[str] = None
        buf: List[str] = []

        def flush() -> None:
            # Emit the open section if it has both a title and body lines.
            if current_title and buf:
                sections.append(Section(
                    section_id=f"sec:{len(sections)}",
                    page=p_idx, title=current_title.upper(),
                    text_snippet=" ".join(buf)[:1000]
                ))
            buf.clear()

        for ln in page.get("lines") or []:
            t = anchored_text(ln.get("layout"))
            if not t:
                continue
            if t.endswith(":") or (t.isupper() and len(t) <= 40):
                flush()
                current_title = t.strip(":").strip()
                page_headers.append(current_title)
            elif current_title:
                buf.append(t)
                # One "date" entity per date mention on the line.
                for m in DATE_REGEX.finditer(t):
                    entities.append(Entity(
                        entity_id=f"datei:{p_idx}:{len(entities)}",
                        page=p_idx, text=t, kind="date",
                        section_title=current_title.upper(),
                        value_norm_date=norm_date(m.group(0))
                    ))
        flush()

    # Deterministic context strings (no LLM); dict.fromkeys de-duplicates in order.
    doc_headers = list(dict.fromkeys(h for lst in headers_by_page.values() for h in lst))
    doc_ctx = "[DOC] " + " | ".join(doc_headers)[:800]
    page_ctx = {p: "[PAGE] " + " | ".join(hs)[:400] for p, hs in headers_by_page.items()}
    return sections, entities, doc_ctx, page_ctx

def attach_cuis_and_context_embeddings(sections: List[Section], entities: List[Entity],
                                       doc_ctx: str, page_ctx: Dict[int, str]) -> None:
    """Attach a CUI and an embedding vector to every section and entity, in place.

    For each item, its text is sent to ``umls_link``. When candidates are
    returned, the ``"cui"`` of the highest-scoring one (missing scores count
    as 0.0) is stored on the item. If the item's CUI is a key of
    ``CUI_VECTORS``, that vector becomes the item's ``emb``. Every other item
    gets a context prompt (document context, page context, section title and
    a truncated copy of its text), and all such prompts are embedded with a
    single ``gemini_embed`` call.

    Args:
        sections: Section objects to annotate (``cui`` / ``emb`` are set).
        entities: Entity objects to annotate (``cui`` / ``emb`` are set).
        doc_ctx: Document-level context string placed first in each prompt.
        page_ctx: Page number -> page-level context string; pages without an
            entry contribute an empty line.

    Returns:
        None. The objects in ``sections`` and ``entities`` are mutated.
    """
    def top_cui(candidates):
        # Highest-scoring link candidate wins; a missing score counts as 0.0.
        return max(candidates, key=lambda c: c.get("score", 0.0))["cui"]

    # Parallel lists: prompt i is embedded for target i.
    fallback_prompts = []
    fallback_targets = []

    for sec in sections:
        linked = umls_link(f"{sec.title}: {sec.text_snippet}")
        if linked:
            sec.cui = top_cui(linked)
        if sec.cui and sec.cui in CUI_VECTORS:
            sec.emb = CUI_VECTORS[sec.cui]
            continue
        # No precomputed CUI vector: queue a context-aware prompt instead.
        fallback_prompts.append(
            f"{doc_ctx}\n{page_ctx.get(sec.page,'')}\n[SECTION] {sec.title}\n[TEXT] {sec.text_snippet[:600]}"
        )
        fallback_targets.append(sec)

    for ent in entities:
        linked = umls_link(ent.text)
        if linked:
            ent.cui = top_cui(linked)
        if ent.cui and ent.cui in CUI_VECTORS:
            ent.emb = CUI_VECTORS[ent.cui]
            continue
        fallback_prompts.append(
            f"{doc_ctx}\n{page_ctx.get(ent.page,'')}\n[SECTION] {ent.section_title or 'NA'}\n[ENTITY:{ent.kind}] {ent.text[:400]}"
        )
        fallback_targets.append(ent)

    if fallback_prompts:
        vectors = gemini_embed(fallback_prompts)
        for target, vector in zip(fallback_targets, vectors):
            target.emb = vector

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors as a Python float.

    A small epsilon (1e-8) is added to each norm so that a zero vector
    yields a similarity of 0.0 instead of a division by zero.
    """
    eps = 1e-8
    norm_a = float(np.linalg.norm(a) + eps)
    norm_b = float(np.linalg.norm(b) + eps)
    denominator = norm_a * norm_b
    return float(np.dot(a, b) / denominator)

def build_similarity_graph(sections: List[Section], threshold: float = 0.75) -> Dict[Tuple[str, str], float]:
    """Build a symmetric similarity map between sections that have embeddings.

    Every unordered pair of sections with a non-None ``emb`` is compared with
    ``cosine``. Pairs whose similarity is strictly greater than ``threshold``
    are stored under both ``(a_id, b_id)`` and ``(b_id, a_id)``.

    Args:
        sections: Sections to compare; those with ``emb is None`` are ignored.
        threshold: Minimum (exclusive) cosine similarity for an edge.
            Defaults to 0.75.

    Returns:
        Dict mapping ``(section_id, section_id)`` pairs to their similarity.
    """
    embedded = [s for s in sections if s.emb is not None]
    graph: Dict[Tuple[str, str], float] = {}
    for i, a in enumerate(embedded):
        for b in embedded[i + 1:]:
            # Embeddings may come from two sources (precomputed CUI vectors or
            # Gemini) which need not share a dimension; np.dot would raise
            # ValueError on mismatched shapes, so such pairs are skipped.
            if np.shape(a.emb) != np.shape(b.emb):
                continue
            sim = cosine(a.emb, b.emb)
            if sim > threshold:
                graph[(a.section_id, b.section_id)] = sim
                graph[(b.section_id, a.section_id)] = sim
    return graph

def query_similar_sections(query: str, sections: List[Section], top_k=5) -> List[Tuple[Section, float]]:
    """Rank sections by cosine similarity to a free-text query.

    The query is embedded with ``gemini_embed`` using the ``RETRIEVAL_QUERY``
    task type (the stored section prompts are embedded with the default
    ``RETRIEVAL_DOCUMENT`` task type), then compared against every section
    that has an embedding of the same shape.

    Args:
        query: Natural-language question to search for.
        sections: Candidate sections; those with ``emb is None`` are ignored.
        top_k: Maximum number of results. ``None`` returns all scored
            sections; zero or a negative value returns an empty list.

    Returns:
        List of ``(section, similarity)`` tuples, highest similarity first.
    """
    candidates = [s for s in sections if s.emb is not None]
    # Nothing to score or nothing requested: avoid a needless embedding call.
    if not candidates or (top_k is not None and top_k <= 0):
        return []

    q_emb = gemini_embed([query], task_type="RETRIEVAL_QUERY")[0]
    q_shape = np.shape(q_emb)

    # Sections whose vector came from a source with a different dimension
    # cannot be compared to the query (np.dot would raise), so skip them.
    scored = [
        (s, cosine(q_emb, s.emb))
        for s in candidates
        if np.shape(s.emb) == q_shape
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_k]

def generate_answer_llm(query: str, relevant_sections: List[Section],
                        model: str = "gemini-2.0-flash-001") -> str:
    """Answer a query with a Gemini model, grounded in the given sections.

    The sections are rendered as ``[TITLE]\\ntext`` blocks separated by
    ``---`` lines and placed in a prompt together with the query. The prompt
    is sent through the module-level google-genai client (``_client``).

    Args:
        query: The user's question.
        relevant_sections: Sections whose ``title`` and ``text_snippet`` form
            the context. May be empty, in which case the context is blank.
        model: Name of the generative model to call.

    Returns:
        The generated answer text, or an empty string when the response
        carries no text.
    """
    # Local import: the module-level import only brings in EmbedContentConfig.
    from google.genai.types import GenerateContentConfig

    context_text = "\n---\n".join(f"[{s.title}]\n{s.text_snippet}" for s in relevant_sections)
    prompt = f"Using the following context from the document, answer the query:\n\nContext:\n{context_text}\n\nQuery:\n{query}\n\nAnswer:"
    # The google-genai client exposes generation via models.generate_content;
    # output length and temperature are passed through GenerateContentConfig.
    resp = _client.models.generate_content(
        model=model,
        contents=prompt,
        config=GenerateContentConfig(max_output_tokens=256, temperature=0.0),
    )
    # resp.text can be None when the response has no text parts.
    return resp.text or ""

# ----------------------------
# MAIN RUN
# ----------------------------

def main():
    """Run the full pipeline once and print an answer to a sample query.

    Stages, in order: DocAI extraction, parsing into sections/entities plus
    context strings, CUI/embedding attachment, similarity-graph construction,
    retrieval of the top 5 sections for the sample query, answer generation.
    """
    # Stage 1: raw document from DocAI.
    print("Running DocAI extraction...")
    extracted_doc = run_docai_extraction()

    # Stage 2: sections, entities and the document/page context strings.
    print("Parsing extracted document...")
    parsed = parse_docai_to_pyramid(extracted_doc)
    doc_sections, doc_entities, document_context, page_contexts = parsed

    # Stage 3: annotate sections and entities in place.
    print("Attaching embeddings and CUIs...")
    attach_cuis_and_context_embeddings(
        doc_sections, doc_entities, document_context, page_contexts
    )

    # Stage 4 (optional): pairwise section similarity; result is not used below.
    print("Building similarity graph...")
    similarity_edges = build_similarity_graph(doc_sections)

    # Stage 5: retrieve the best-matching sections and generate an answer.
    query = "What is the patient's diagnosis?"
    print(f"Querying: {query}")
    ranked = query_similar_sections(query, doc_sections, top_k=5)
    top_sections = [section for section, _score in ranked]
    answer = generate_answer_llm(query, top_sections)

    print("Answer:", answer)

# Entry point: run the pipeline only when this module is the top-level
# script, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
