In [None]:
#using SCANN

In [None]:
#Install dependency
!pip install google-cloud-documentai sentence-transformers scann numpy

In [None]:
# Including the paragraphs, form fields, tables and entities:
import re
import json
from google.cloud import documentai_v1 as documentai
from sentence_transformers import SentenceTransformer
import numpy as np
import scann

# === CONFIGURATION ===
project_id = "your-gcp-project-id"
location = "us"  # or 'eu'
processor_id = "your-processor-id"
gcs_input_uri = "gs://your-bucket/your-file.pdf"
output_json_path = "extracted_document.json"

# === SETUP CLIENT ===
# Document AI processors are regional. The client must talk to the endpoint
# that matches `location`; with the default endpoint a processor created in a
# non-'us' location (e.g. 'eu') cannot be reached.
client = documentai.DocumentProcessorServiceClient(
    client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
)
# Fully qualified resource name of the processor that will handle the request.
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

# === HELPER FUNCTION: get text from text_anchor ===
def get_text(text_anchor):
    """Return the text that ``text_anchor`` points at inside the document.

    Each segment of the anchor is used as a ``[start_index:end_index]`` slice
    of the module-level ``document.text``. The slices are concatenated in
    segment order and the result is stripped of surrounding whitespace.

    Args:
        text_anchor: Object exposing ``text_segments``; may be falsy.

    Returns:
        The referenced text, or ``""`` when the anchor is falsy or has no
        segments.
    """
    if not (text_anchor and text_anchor.text_segments):
        return ""
    # A falsy index (unset / 0) is treated as offset 0.
    pieces = [
        document.text[(seg.start_index or 0):(seg.end_index or 0)]
        for seg in text_anchor.text_segments
    ]
    return "".join(pieces).strip()

# === HELPER FUNCTION: auto detect headers and extract sections ===
def auto_extract_sections(page_text):
    """Split page text into ``{header: content}`` sections using heuristics.

    A line is treated as a header when it either

    * ends with a colon (the trailing colon is removed from the header), or
    * consists of 1-5 words that all start with an uppercase character.

    Every other line is appended to the section opened by the most recent
    header. Lines that appear before the first header are discarded.

    When the same header occurs more than once, the contents are joined with
    a newline instead of the later occurrence overwriting the earlier one,
    so no extracted text is lost.

    Args:
        page_text: Raw text of a single page.

    Returns:
        dict mapping each header to its stripped content (possibly ``""``).
    """
    header_pattern = re.compile(r".+:$")  # line ends with colon
    sections = {}
    current_header = None
    current_content_lines = []

    def save_section():
        # Flush the section that is currently being collected, if any.
        if not current_header:
            return
        content = "\n".join(current_content_lines).strip()
        previous = sections.get(current_header)
        if previous:
            # Repeated header: keep the earlier text and append the new text.
            content = f"{previous}\n{content}".strip()
        sections[current_header] = content

    for line in page_text.splitlines():
        stripped_line = line.strip()
        if header_pattern.match(stripped_line):
            header_text = stripped_line[:-1].strip()  # remove colon
        else:
            words = stripped_line.split()
            is_title = 0 < len(words) <= 5 and all(word[0].isupper() for word in words)
            header_text = stripped_line if is_title else None

        if header_text is not None:
            save_section()
            current_header = header_text
            current_content_lines = []
        elif current_header:
            current_content_lines.append(stripped_line)

    save_section()
    return sections

# === CONFIGURE REQUEST ===
# Synchronous (online) processing receives the Cloud Storage file through the
# `gcs_document` field of ProcessRequest. `input_documents` together with
# BatchDocumentsInputConfig belongs to BatchProcessRequest and is rejected by
# ProcessRequest as an unknown field.
gcs_document = documentai.GcsDocument(gcs_uri=gcs_input_uri, mime_type="application/pdf")

request = documentai.ProcessRequest(
    name=name,
    gcs_document=gcs_document,
)

# === PROCESS DOCUMENT ===
# Blocks until the processor returns; `document` is read by get_text() below.
result = client.process_document(request=request)
document = result.document

# === BUILD OUTPUT DATA ===
output_data = {
    "text": document.text,
    "pages": []
}

# Entities are stored on the document, not on individual pages. Group them by
# the page(s) they are anchored to so that each page lists only its own
# entities instead of every page receiving a copy of all of them.
entities_by_page = {}
for entity in getattr(document, "entities", []):
    entity_record = {
        "entity_type": getattr(entity, "type_", "") or "UNKNOWN",
        "text": get_text(entity.text_anchor),
    }
    # page_refs[].page is a 0-based index into document.pages. An entity
    # without any page reference is filed under the first page.
    page_indexes = {ref.page for ref in entity.page_anchor.page_refs} or {0}
    for page_index in sorted(page_indexes):
        entities_by_page.setdefault(page_index, []).append(dict(entity_record))

for page_index, page in enumerate(document.pages):
    page_text = get_text(page.layout.text_anchor)
    page_data = {
        "page_number": page.page_number,
        "form_fields": [],
        "tables": [],
        "entities": entities_by_page.get(page_index, []),
        "text_snippet": page_text,
        "sections": auto_extract_sections(page_text)
    }

    # Form fields
    for field in page.form_fields:
        page_data["form_fields"].append({
            "field_name": get_text(field.field_name.text_anchor),
            "field_value": get_text(field.field_value.text_anchor)
        })

    # Tables: header rows first, then body rows. The rows are unpacked into a
    # new list so this works even if the repeated-field containers do not
    # support the `+` operator.
    for table in page.tables:
        table_data = [
            [get_text(cell.layout.text_anchor) for cell in row.cells]
            for row in [*table.header_rows, *table.body_rows]
        ]
        page_data["tables"].append(table_data)

    output_data["pages"].append(page_data)

# === SAVE TO JSON FILE ===
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=4, ensure_ascii=False)

print(f"Extracted data saved to: {output_json_path}")

# === SEMANTIC SEARCH SETUP ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# `corpus[i]` is the metadata for the chunk embedded from `texts[i]`. The two
# lists must stay index-aligned because the index returns positions.
corpus = []
texts = []

for page in output_data.get("pages", []):
    # Sections (header + content)
    for header, content in page.get("sections", {}).items():
        combined_text = f"{header}: {content}"
        corpus.append({
            "type": "section",
            "page": page["page_number"],
            "header": header,
            "content": content
        })
        texts.append(combined_text)

    # Form fields
    for field in page.get("form_fields", []):
        combined_text = f"{field['field_name']}: {field['field_value']}"
        corpus.append({
            "type": "form_field",
            "page": page["page_number"],
            "field_name": field["field_name"],
            "field_value": field["field_value"]
        })
        texts.append(combined_text)

    # Tables (each row as a searchable text chunk)
    for table_idx, table in enumerate(page.get("tables", [])):
        for row_idx, row in enumerate(table):
            row_text = " | ".join(row)
            corpus.append({
                "type": "table_row",
                "page": page["page_number"],
                "table_index": table_idx,
                "row_index": row_idx,
                "text": row_text
            })
            texts.append(row_text)

    # Entities
    for entity in page.get("entities", []):
        entity_text = entity["text"]
        entity_type = entity["entity_type"]
        combined_text = f"{entity_type}: {entity_text}"
        corpus.append({
            "type": "entity",
            "page": page["page_number"],
            "entity_type": entity_type,
            "text": entity_text
        })
        texts.append(combined_text)

# Fail early with a readable message instead of an opaque index-build error.
if not texts:
    raise ValueError(
        "No searchable content was extracted from the document; "
        "cannot build a search index."
    )

# Generate embeddings for entire corpus (unit-normalised, so the dot product
# used below equals cosine similarity).
corpus_embeddings = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

# Build SCANN index
num_items = len(texts)
scann_builder = scann.scann_ops_pybind.builder(
    corpus_embeddings, min(10, num_items), "dot_product"
)
if num_items < 20000:
    # Small corpus (the normal case for a single document): partitioning into
    # a fixed 200 leaves is not viable with fewer points than leaves, and
    # exact brute-force scoring is the recommended configuration at this size.
    searcher = scann_builder.score_brute_force().build()
else:
    # Large corpus: partition into ~sqrt(N) leaves and search 10% of them,
    # keeping the original 10:1 leaves-to-searched ratio.
    num_leaves = int(np.sqrt(num_items))
    searcher = scann_builder.tree(
        num_leaves=num_leaves,
        num_leaves_to_search=max(1, num_leaves // 10),
        training_sample_size=min(num_items, 250000),
    ).score_ah(
        2, anisotropic_quantization_threshold=0.2
    ).reorder(100).build()

# === SEARCH FUNCTION ===
def semantic_search_scann(query, top_k=5):
    """Return the ``top_k`` corpus entries most similar to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level ``searcher``; each returned position is resolved to its
    metadata through the module-level ``corpus`` list.

    Args:
        query: Free-text search string.
        top_k: Number of neighbours requested from the index.

    Returns:
        List of ``{"score": float, "metadata": dict}`` in the order returned
        by the index.
    """
    query_vector = model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
    hit_ids, hit_scores = searcher.search(query_vector, final_num_neighbors=top_k)
    return [
        {"score": float(hit_score), "metadata": corpus[hit_id]}
        for hit_id, hit_score in zip(hit_ids, hit_scores)
    ]

# === INTERACTIVE SEARCH ===
def _print_scann_hit(md, score):
    """Print one search hit; the layout depends on the hit's ``type``."""
    hit_type = md["type"]
    if hit_type == "section":
        print(f"[Page {md['page']}] Section Header: {md['header']} (Score: {score:.3f})")
        print(f"Content:\n{md['content']}\n")
        return
    if hit_type == "form_field":
        print(f"[Page {md['page']}] Form Field: {md['field_name']} (Score: {score:.3f})")
        print(f"Value: {md['field_value']}\n")
        return
    if hit_type == "table_row":
        print(f"[Page {md['page']}] Table Row (Table {md['table_index']} Row {md['row_index']}) (Score: {score:.3f})")
        print(f"Text: {md['text']}\n")
        return
    if hit_type == "entity":
        print(f"[Page {md['page']}] Entity ({md['entity_type']}) (Score: {score:.3f})")
        print(f"Text: {md['text']}\n")
    # Any other type is silently skipped.


# Keep prompting until the user types 'exit' (case-insensitive).
while True:
    search_term = input("\nEnter your search query (or type 'exit' to quit): ").strip()

    if not search_term:
        print("Please enter a non-empty query.")
        continue
    if search_term.lower() == "exit":
        print("Exiting search.")
        break

    print(f"\nSemantic search results for query: '{search_term}':\n")

    results = semantic_search_scann(search_term, top_k=5)
    if not results:
        print("No results found.\n")
        continue

    for res in results:
        _print_scann_hit(res["metadata"], res["score"])


In [None]:
#Using FAISS instead of SCANN

In [None]:
#Install dependency
!pip install --upgrade google-cloud-documentai sentence-transformers faiss-cpu

In [None]:
import re
import json
import faiss
import numpy as np
from google.cloud import documentai_v1 as documentai
from sentence_transformers import SentenceTransformer

# === CONFIGURATION ===
project_id = "your-gcp-project-id"
location = "us"
processor_id = "your-processor-id"
gcs_input_uri = "gs://your-bucket/your-file.pdf"
output_json_path = "extracted_document.json"

# Document AI processors are regional. Point the client at the endpoint that
# matches `location` so changing the location above keeps working.
client = documentai.DocumentProcessorServiceClient(
    client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
)
# Fully qualified resource name of the processor that will handle the request.
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

def get_text(text_anchor):
    """Resolve ``text_anchor`` to the text it references in the document.

    Every segment is used as a ``[start_index:end_index]`` slice of the
    module-level ``document.text`` (a falsy index counts as 0). The slices
    are concatenated in order and the result is stripped.

    Returns ``""`` when the anchor is falsy or carries no segments.
    """
    if not text_anchor:
        return ""
    segments = text_anchor.text_segments
    if not segments:
        return ""

    chunks = []
    for seg in segments:
        begin = seg.start_index if seg.start_index else 0
        stop = seg.end_index if seg.end_index else 0
        chunks.append(document.text[begin:stop])
    return "".join(chunks).strip()

def auto_extract_sections(page_text):
    """Split page text into ``{header: content}`` sections using heuristics.

    A line counts as a header when it ends with a colon (the colon is
    dropped) or when it has 1-5 words that all begin with an uppercase
    character. Other lines are collected under the most recent header; lines
    before the first header are ignored.

    If a header appears more than once, its contents are concatenated with a
    newline rather than the last occurrence replacing the earlier text.

    Args:
        page_text: Raw text of a single page.

    Returns:
        dict mapping each header to its stripped content (possibly ``""``).
    """
    header_pattern = re.compile(r".+:$")
    sections = {}
    current_header = None
    current_content_lines = []

    def save_section():
        # Flush the section being collected, merging with a same-named one.
        if not current_header:
            return
        content = "\n".join(current_content_lines).strip()
        previous = sections.get(current_header)
        if previous:
            content = f"{previous}\n{content}".strip()
        sections[current_header] = content

    for line in page_text.splitlines():
        stripped_line = line.strip()
        if header_pattern.match(stripped_line):
            header_text = stripped_line[:-1].strip()
        else:
            words = stripped_line.split()
            is_title = 0 < len(words) <= 5 and all(word[0].isupper() for word in words)
            header_text = stripped_line if is_title else None

        if header_text is not None:
            save_section()
            current_header = header_text
            current_content_lines = []
        elif current_header:
            current_content_lines.append(stripped_line)

    save_section()
    return sections

# === PROCESS DOCUMENT ===
# Online processing takes the Cloud Storage file via ProcessRequest's
# `gcs_document` field. `input_documents` / BatchDocumentsInputConfig are
# only valid on BatchProcessRequest and are rejected here as an unknown field.
gcs_document = documentai.GcsDocument(gcs_uri=gcs_input_uri, mime_type="application/pdf")
request = documentai.ProcessRequest(name=name, gcs_document=gcs_document)

# Blocks until the processor returns; `document` is read by get_text().
result = client.process_document(request=request)
document = result.document

# === EXTRACT DATA ===
output_data = {"text": document.text, "pages": []}

# Entities belong to the document as a whole. Bucket them by the page(s) they
# reference so each page lists only its own entities instead of every page
# receiving a duplicate of all of them.
entities_by_page = {}
for entity in getattr(document, "entities", []):
    entity_record = {
        "entity_type": getattr(entity, "type_", "") or "UNKNOWN",
        "text": get_text(entity.text_anchor),
    }
    # page_refs[].page is a 0-based index into document.pages. Entities with
    # no page reference are filed under the first page.
    page_indexes = {ref.page for ref in entity.page_anchor.page_refs} or {0}
    for page_index in sorted(page_indexes):
        entities_by_page.setdefault(page_index, []).append(dict(entity_record))

for page_index, page in enumerate(document.pages):
    page_text = get_text(page.layout.text_anchor)
    page_data = {
        "page_number": page.page_number,
        "form_fields": [],
        "tables": [],
        "entities": entities_by_page.get(page_index, []),
        "text_snippet": page_text,
        "sections": auto_extract_sections(page_text),
    }

    for field in page.form_fields:
        field_name = get_text(field.field_name.text_anchor)
        field_value = get_text(field.field_value.text_anchor)
        page_data["form_fields"].append({"field_name": field_name, "field_value": field_value})

    # Header rows first, then body rows. Unpacking into a list avoids relying
    # on `+` being supported by the repeated-field containers.
    for table in page.tables:
        table_data = [
            [get_text(cell.layout.text_anchor) for cell in row.cells]
            for row in [*table.header_rows, *table.body_rows]
        ]
        page_data["tables"].append(table_data)

    output_data["pages"].append(page_data)

with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=4, ensure_ascii=False)

print(f"✅ Extracted data saved to: {output_json_path}")

# === SEMANTIC SEARCH SETUP (FAISS) ===
model = SentenceTransformer("all-MiniLM-L6-v2")
# `corpus[i]` is the metadata for the chunk embedded from `texts[i]`. The two
# lists must stay index-aligned because FAISS returns row positions.
corpus = []
texts = []

for page in output_data["pages"]:
    for header, content in page.get("sections", {}).items():
        combined_text = f"{header}: {content}"
        corpus.append({"type": "section", "page": page["page_number"], "header": header, "content": content})
        texts.append(combined_text)

    for field in page.get("form_fields", []):
        combined_text = f"{field['field_name']}: {field['field_value']}"
        corpus.append({
            "type": "form_field", "page": page["page_number"],
            "field_name": field["field_name"], "field_value": field["field_value"]
        })
        texts.append(combined_text)

    # Each table row becomes its own searchable chunk.
    for table_idx, table in enumerate(page.get("tables", [])):
        for row_idx, row in enumerate(table):
            row_text = " | ".join(row)
            corpus.append({
                "type": "table_row", "page": page["page_number"],
                "table_index": table_idx, "row_index": row_idx, "text": row_text
            })
            texts.append(row_text)

    for entity in page.get("entities", []):
        entity_text = entity["text"]
        entity_type = entity["entity_type"]
        combined_text = f"{entity_type}: {entity_text}"
        corpus.append({
            "type": "entity", "page": page["page_number"],
            "entity_type": entity_type, "text": entity_text
        })
        texts.append(combined_text)

# Without this guard an empty corpus fails below with an opaque IndexError
# when reading the embedding dimension.
if not texts:
    raise ValueError(
        "No searchable content was extracted from the document; "
        "cannot build a search index."
    )

# Unit-normalised embeddings make inner product equal to cosine similarity.
corpus_embeddings = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(corpus_embeddings)

# === SEMANTIC SEARCH FUNCTION ===
def semantic_search_faiss(query, top_k=5):
    """Return up to ``top_k`` corpus entries most similar to ``query``.

    The query is embedded with the module-level ``model`` and searched in the
    module-level FAISS ``index``; positions are resolved to metadata through
    the module-level ``corpus`` list.

    Args:
        query: Free-text search string.
        top_k: Maximum number of neighbours to request.

    Returns:
        List of ``{"score": float, "metadata": dict}`` in index order. It may
        hold fewer than ``top_k`` items when the index has fewer vectors.
    """
    query_embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, indices = index.search(query_embedding, top_k)
    results = []
    for idx, score in zip(indices[0], scores[0]):
        # FAISS pads missing neighbours with index -1. Without this check
        # corpus[-1] would return the last entry as a bogus hit.
        if idx < 0:
            continue
        results.append({"score": float(score), "metadata": corpus[idx]})
    return results

# === INTERACTIVE SEARCH ===
def _print_faiss_hit(md, score):
    """Print one hit: a summary line, a type-specific body, then a rule."""
    print(f"📄 Page {md['page']} | Type: {md['type']} | Score: {score:.3f}")
    hit_type = md["type"]
    if hit_type == "section":
        print(f"Header: {md['header']}\n{md['content']}")
    elif hit_type == "form_field":
        print(f"{md['field_name']}: {md['field_value']}")
    elif hit_type == "table_row":
        print(f"Table Row: {md['text']}")
    elif hit_type == "entity":
        print(f"Entity ({md['entity_type']}): {md['text']}")
    print("-" * 50)


# Keep prompting until the user types 'exit' (case-insensitive).
while True:
    query = input("\nEnter your search query (or type 'exit'): ").strip()

    if not query:
        print("Please enter a query.")
        continue
    if query.lower() == "exit":
        print("Exiting.")
        break

    results = semantic_search_faiss(query)
    print(f"\nSemantic search results for: '{query}'\n")
    for res in results:
        _print_faiss_hit(res["metadata"], res["score"])


In [None]:
#OCR based solution

In [None]:
# !pip install pytesseract pdf2image pillow sentence-transformers faiss-cpu
# !apt-get install -y poppler-utils

In [None]:
# import pytesseract
# from pdf2image import convert_from_path
# from PIL import Image
# import re
# import json
# from sentence_transformers import SentenceTransformer
# import numpy as np
# import faiss

# # === CONFIGURATION ===
# pdf_path = "test2.pdf"
# output_json_path = "ocr_extracted.json"
# # tesseract_cmd_path = None  # Example: "C:/Program Files/Tesseract-OCR/tesseract.exe"

# # if tesseract_cmd_path:
# #     pytesseract.pytesseract.tesseract_cmd = tesseract_cmd_path

# # === OCR + SECTION PARSING ===

# def extract_sections(text):
#     lines = text.split("\n")
#     sections = {}
#     current_header = None
#     buffer = []

#     def flush():
#         if current_header and buffer:
#             sections[current_header] = "\n".join(buffer).strip()

#     header_pattern = re.compile(r"^[A-Z][A-Za-z0-9\s]{0,40}:?$")

#     for line in lines:
#         line = line.strip()
#         if not line:
#             continue

#         if header_pattern.match(line) and len(line) < 50:
#             flush()
#             current_header = line.rstrip(":").strip()
#             buffer = []
#         else:
#             if current_header:
#                 buffer.append(line)

#     flush()
#     return sections

# def extract_form_fields(text):
#     form_fields = []
#     for line in text.splitlines():
#         if ":" in line and len(line) < 100:
#             parts = line.split(":", 1)
#             key = parts[0].strip()
#             val = parts[1].strip()
#             if key and val:
#                 form_fields.append({"field_name": key, "field_value": val})
#     return form_fields

# def extract_entities(text):
#     entities = []
#     date_pattern = r"\b(?:\d{1,2}[/-])?\d{1,2}[/-]\d{2,4}\b|\b\d{4}-\d{2}-\d{2}\b"
#     name_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b"

#     for match in re.findall(date_pattern, text):
#         entities.append({"entity_type": "Date", "text": match})

#     for match in re.findall(name_pattern, text):
#         entities.append({"entity_type": "Person", "text": match})

#     return entities

# def extract_table_lines(text):
#     tables = []
#     lines = [line.strip() for line in text.splitlines() if "|" in line and line.count("|") >= 2]
#     for line in lines:
#         cells = [cell.strip() for cell in line.split("|") if cell.strip()]
#         if cells:
#             tables.append(cells)
#     return tables

# # === OCR PDF ===
# pages = convert_from_path(pdf_path, dpi=300)
# output_data = {"pages": []}

# for i, image in enumerate(pages):
#     text = pytesseract.image_to_string(image, lang="eng")
#     sections = extract_sections(text)
#     form_fields = extract_form_fields(text)
#     entities = extract_entities(text)
#     tables = extract_table_lines(text)

#     output_data["pages"].append({
#         "page_number": i + 1,
#         "text": text,
#         "sections": sections,
#         "form_fields": form_fields,
#         "entities": entities,
#         "tables": tables
#     })

# # === SAVE TO JSON ===
# with open(output_json_path, "w", encoding="utf-8") as f:
#     json.dump(output_data, f, indent=2, ensure_ascii=False)

# print(f"✅ OCR and extraction complete. Saved to: {output_json_path}")

# # === SEMANTIC SEARCH INDEXING ===
# model = SentenceTransformer("all-MiniLM-L6-v2")

# corpus = []
# texts = []

# for page in output_data["pages"]:
#     # Sections
#     for header, content in page.get("sections", {}).items():
#         full = f"{header}: {content}"
#         corpus.append({
#             "type": "section",
#             "page": page["page_number"],
#             "header": header,
#             "content": content
#         })
#         texts.append(full)

#     # Form fields
#     for field in page.get("form_fields", []):
#         full = f"{field['field_name']}: {field['field_value']}"
#         corpus.append({
#             "type": "form_field",
#             "page": page["page_number"],
#             "field_name": field["field_name"],
#             "field_value": field["field_value"]
#         })
#         texts.append(full)

#     # Tables
#     for idx, row in enumerate(page.get("tables", [])):
#         row_text = " | ".join(row)
#         corpus.append({
#             "type": "table_row",
#             "page": page["page_number"],
#             "row_index": idx,
#             "text": row_text
#         })
#         texts.append(row_text)

#     # Entities
#     for entity in page.get("entities", []):
#         corpus.append({
#             "type": "entity",
#             "page": page["page_number"],
#             "entity_type": entity["entity_type"],
#             "text": entity["text"]
#         })
#         texts.append(entity["text"])

# # === BUILD FAISS INDEX ===
# embeddings = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
# embedding_dim = embeddings.shape[1]
# index = faiss.IndexFlatIP(embedding_dim)  # Using Inner Product for cosine similarity with normalized vectors
# index.add(embeddings)

# # === INTERACTIVE SEARCH ===
# while True:
#     query = input("\n🔍 Enter search query (or 'exit'): ").strip()
#     if query.lower() == "exit":
#         break
#     if not query:
#         print("Please enter a valid query.")
#         continue

#     query_embedding = model.encode(query, convert_to_numpy=True, normalize_embeddings=True).reshape(1, -1)
#     distances, neighbors = index.search(query_embedding, 5)  # Top 5 results

#     print(f"\n🔎 Top Results for: '{query}'\n")
#     for idx, dist in zip(neighbors[0], distances[0]):
#         entry = corpus[idx]
#         print(f"📄 Page {entry['page']} | Type: {entry['type']} | Score: {dist:.3f}")
#         if entry["type"] == "section":
#             print(f"Header: {entry['header']}\n{entry['content']}")
#         elif entry["type"] == "form_field":
#             print(f"{entry['field_name']}: {entry['field_value']}")
#         elif entry["type"] == "table_row":
#             print(f"Table Row: {entry['text']}")
#         elif entry["type"] == "entity":
#             print(f"Entity ({entry['entity_type']}): {entry['text']}")
#         print("-" * 50)


In [None]:
# # Expected JSON output that is used as input for the search:
# {
#     "text": "Full raw text from the PDF document...",
#     "pages": [
#         {
#             "page_number": 1,
#             "form_fields": [
#                 {
#                     "field_name": "Patient Name",
#                     "field_value": "John Doe"
#                 },
#                 {
#                     "field_name": "Admission Date",
#                     "field_value": "2023-08-20"
#                 }
#             ],
#             "tables": [
#                 [
#                     ["Test", "Result", "Normal Range"],
#                     ["Hemoglobin", "13.5", "13-17"]
#                 ]
#             ],
#             "text_snippet": "History:\nPatient admitted with chest pain.\nTreatment:\nAspirin given.\n...",
#             "sections": {
#                 "History": "Patient admitted with chest pain.",
#                 "Treatment": "Aspirin given.",
#                 "Medication": "None reported.",
#                 "Admission Date": "2023-08-20"
#             }
#         }
#     ]
# }

#search result:

# [
#   {
#     "page": 1,
#     "section_header": "Treatment",
#     "section_text": "Aspirin given."
#   }
# ]
