In [None]:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch
import json
import fitz  # PyMuPDF
from PIL import Image
import os

In [None]:
# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# float16 on GPU, float32 on CPU.
model_dtype = torch.float16 if device == "cuda" else torch.float32

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype=model_dtype,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")


In [None]:
# Prepare chat message
# Path of the single image referenced by the chat message built in the next cell.
# NOTE(review): absolute, machine-specific Windows path — change it before running elsewhere.
image_path = r'C:\Users\lee_jayyang\Data_Projects\RCS\artifacts\ingestion\att_output\MC 16052025.jpg'

In [None]:
# Single-turn chat request: one image plus the extraction instructions.
# The "return blank" rule is stated before the "like:" line so that the
# example JSON object directly follows its introduction in the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": (
                "Extract the following fields from this document image:\n"
                "- patient_name\n"
                "- nric\n"
                "- visit_date\n"
                "- visit_number\n\n"
                "Return a valid JSON object with exactly these keys in snake_case.\n"
                "Do not include explanations, markdown formatting, or text outside the JSON. "
                "Do not use ```json or ``` wrappers.\n"
                "Return blank if not sure, do not create synthetic data.\n"
                "Only respond with raw JSON, like:\n"
                "{\n"
                "  \"patient_name\": \"...\",\n"
                "  \"nric\": \"...\",\n"
                "  \"visit_date\": \"...\",\n"
                "  \"visit_number\": \"...\"\n"
                "}"
            )}
        ]
    }
]


In [None]:
# Process image + text
# Build the prompt string from the chat messages and collect the vision
# inputs that the messages reference.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)

# Move the tensors to the device the model was actually loaded on, instead of
# re-deriving a device string that could disagree with device_map="auto".
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt"
).to(model.device)

# Generate output
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=512)

# Decode only the new generation part: drop the prompt tokens from each sequence.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

# Try parsing to JSON; only a JSON syntax error falls back to the error record,
# so unrelated failures are not silently hidden.
try:
    extracted = json.loads(output_text.strip())
except json.JSONDecodeError:
    extracted = {"error": "Could not parse", "raw": output_text}

print(json.dumps(extracted, indent=2))


In [None]:
# Echo the parsed result as this cell's output.
extracted

In [None]:
# Step 1: Convert PDF to images
def convert_pdf_to_images(pdf_path, output_dir="pdf_pages", dpi=144, resize_width=960):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        output_dir: Directory the PNGs are written to (created if missing).
        dpi: Resolution passed to ``page.get_pixmap``.
        resize_width: Pages rendered wider than this are scaled down to this
            width, keeping the aspect ratio.

    Returns:
        List of ``<output_dir>/page_<n>.png`` paths, one per page, in page order.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []

    doc = fitz.open(pdf_path)
    try:
        for i in range(len(doc)):
            page = doc.load_page(i)
            pix = page.get_pixmap(dpi=dpi)
            img_path = os.path.join(output_dir, f"page_{i+1}.png")
            pix.save(img_path)

            # Resize to prevent memory blowup. The source file is closed before
            # it is overwritten so no open handle is left on the PNG.
            resized = None
            with Image.open(img_path) as image:
                if image.width > resize_width:
                    aspect_ratio = image.height / image.width
                    # Guard against a zero height for extremely wide pages.
                    new_height = max(1, int(resize_width * aspect_ratio))
                    resized = image.resize((resize_width, new_height))
            if resized is not None:
                resized.save(img_path)

            image_paths.append(img_path)
    finally:
        # Release the PDF even when rendering a page fails.
        doc.close()

    return image_paths

# Step 2: Extract fields from one image
def extract_fields_from_image(image_path, fields):
    """Ask the vision-language model to extract ``fields`` from one image.

    Uses the notebook-level ``processor`` and ``model``.

    Args:
        image_path: Path of the page image placed in the chat message.
        fields: Iterable of field names listed in the prompt.

    Returns:
        The value parsed from the model's JSON reply, or
        ``{"error": "parse_failed", "raw": <reply text>}`` when the reply is
        not valid JSON.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert insurance claim processor specializing in reading scanned claim forms "
                "and extracting structured data for digital processing. You understand insurance-specific "
                "terminology such as 'ward class', 'MC serial number', 'consultation', and 'ineligible amount'."
            )
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": (
                    "Extract the following fields from this insurance claim document image:\n"
                    + "\n".join(f"- {f}" for f in fields) +
                    "\n\nRespond with only a valid JSON object.\n"
                    "Do not include any explanation, markdown, or comments. Use snake_case keys exactly as listed above.\n"
                    "Leave missing fields as empty strings (\"\") or null values.\n"
                    "Output example:\n"
                    "{\n  \"patient_name\": \"...\",\n  \"visit_date\": \"...\",\n  ...\n}"
                )}
            ]
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    # Inputs must live on the same device as the model; a hard-coded "cpu"
    # breaks generation whenever device_map="auto" placed the model on a GPU.
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

    # Keep only the newly generated tokens (drop the prompt prefix).
    trimmed_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    output = processor.batch_decode(trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # Replies wrapped in a markdown code fence are unwrapped before parsing.
    candidate = output.strip()
    if candidate.startswith("```"):
        candidate = (
            candidate.removeprefix("```json").removeprefix("```").removesuffix("```").strip()
        )

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return {"error": "parse_failed", "raw": output.strip()}

# Step 3: Extract from full PDF
def extract_from_pdf(pdf_path, field_list):
    """Run field extraction on every page of a PDF and save the results.

    Args:
        pdf_path: Path of the PDF to process.
        field_list: Field names passed to ``extract_fields_from_image``.

    Returns:
        Dict mapping ``"page_<n>"`` to that page's extraction result. The same
        dict is written as JSON next to the PDF, with ``_extracted.json``
        replacing the PDF's extension.
    """
    pages = convert_pdf_to_images(pdf_path)
    full_result = {}

    try:
        for i, image_path in enumerate(pages, start=1):
            print(f"🔍 Processing page {i}...")
            full_result[f"page_{i}"] = extract_fields_from_image(image_path, field_list)
    finally:
        # Remove the temporary page images even when a page fails to process.
        for image_path in pages:
            if os.path.exists(image_path):
                os.remove(image_path)

    output_path = os.path.splitext(pdf_path)[0] + "_extracted.json"
    # Explicit encoding so the file can be read back as UTF-8 on any platform.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(full_result, f, indent=2)

    print(f"\n✅ Extraction complete. Output saved to:\n{output_path}")
    return full_result

# Fields requested from every page of the claim form.
fields = [
    "patient_name",
    "patient_nric",
    "clinic_name",
    "service_type",
    "visit_type",
    "visit_date",
    "mc_from_date",
    "mc_to_date",
    "mc",
    "mc_serial_number",
    "doctor",
    "diagnosis",
    "invoice_no",
    "invoice_date",
]

# Run the proof of concept on one claim form; the returned dict is the cell output.
extract_from_pdf(
    r"C:\Users\lee_jayyang\Data_Projects\RCS\artifacts\ingestion\att_output\Group Outpatient Medical Claim Form (1 May 2025) - Yenny.pdf",
    field_list=fields,
)


In [None]:
def _page_order(page_key):
    """Sort key: keys ending in ``_<number>`` come first in numeric order, the rest after."""
    text = str(page_key)
    suffix = text.rpartition("_")[2]
    if suffix.isdecimal():
        return (0, int(suffix), text)
    return (1, 0, text)


def _usable_page(page):
    """Return the page's field dict, recovering it from ``raw`` if needed, else None."""
    if not isinstance(page, dict):
        return None
    if "raw" not in page:
        return page

    # The page is a parse-failure record: try to recover JSON from the raw
    # reply, tolerating a surrounding markdown code fence.
    raw = page["raw"]
    if not isinstance(raw, str):
        return None
    cleaned = raw.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
    try:
        recovered = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    return recovered if isinstance(recovered, dict) else None


def merge_claim_fields(pages_dict):
    """Merge per-page extraction results into one flat claim dict.

    Pages are visited in page-number order (``page_2`` before ``page_10``).
    For each field, the first value that is not ``None``, ``""`` or ``"null"``
    is kept. Pages whose content cannot be used (non-dict, or a failure record
    whose ``raw`` text is not a JSON object) are skipped, and their
    ``error``/``raw`` placeholder keys are not copied into the result.

    Args:
        pages_dict: Mapping of page key to that page's result dict.

    Returns:
        Dict with one entry per field seen on any usable page; fields without
        a non-empty value are set to ``""``.
    """
    merged = {}
    seen_fields = {}  # dict used as an ordered set of every field encountered

    for page_key in sorted(pages_dict, key=_page_order):
        page = _usable_page(pages_dict[page_key])
        if page is None:
            continue  # skip page if even raw can't be parsed

        for k, v in page.items():
            seen_fields.setdefault(k, None)
            if not merged.get(k) and v not in [None, "", "null"]:
                merged[k] = v  # take first non-empty value

    # Fill in empty strings for any field that never had a usable value.
    for field in seen_fields:
        merged.setdefault(field, "")

    return merged


In [None]:
# Read a previously saved extraction result back from disk.
input_json_path = r'C:\Users\lee_jayyang\Data_Projects\RCS\artifacts\ingestion\att_output\Group Outpatient Medical Claim Form (1 May 2025) - Yenny_extracted.json'
with open(input_json_path, mode='r', encoding='utf-8') as file:
    json_list = json.loads(file.read())


In [None]:

# Collapse the loaded per-page results into a single flat claim record.
merged_claim = merge_claim_fields(json_list)

# Show the heading and the pretty-printed claim on separate lines.
print("🧾 Final merged claim:", json.dumps(merged_claim, indent=2), sep="\n")

