In [1]:
# ─── Imports ────────────────────────────────────────────────────────────────
import json, base64, os
from dotenv import load_dotenv
from pypdf import PdfReader
from google import genai
from google.genai import types

In [2]:
# ─── Setup ────────────────────────────────────────────────────────────────
# Gemini model identifier.
MODEL = "gemini-2.5-flash-preview-04-17"

# Load environment variables from .env.local so the API key is not written in the notebook.
load_dotenv(dotenv_path=".env.local")
# os.getenv returns None when the variable is unset; nothing in this cell checks for that.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Client constructed with the explicit key read above.
client = genai.Client(api_key=GEMINI_API_KEY)

In [3]:
# ─── PDF Parse ────────────────────────────────────────────────────────────────
def extract_fields_with_coords(pdf_path):
    """
    Collect every named annotation in a PDF together with its page and rectangle.

    Each page's ``/Annots`` array is scanned; an annotation is kept when it has
    both a non-empty ``/T`` (field name) and a non-empty ``/Rect`` entry.

    Args:
        pdf_path: Path (or anything ``PdfReader`` accepts) of the PDF to read.

    Returns:
        A list of dicts, in page order then annotation order, each with:
          - "field_id": the annotation's ``/T`` value
          - "page":     1-based page number
          - "coords":   ``[x1, y1, x2, y2]`` exactly as stored in ``/Rect``

    Raises:
        ValueError: if a ``/Rect`` entry does not contain exactly four values.
    """
    reader = PdfReader(pdf_path)
    fields = []
    for page_num, page in enumerate(reader.pages, start=1):
        # Use `in` + subscript instead of dict.get(): subscript access is what
        # dereferences indirect references, so an indirect /Annots array or
        # /Rect entry is handled the same as an inline one.
        if "/Annots" not in page:
            continue
        for annot in page["/Annots"]:
            obj = annot.get_object()
            if "/T" not in obj or "/Rect" not in obj:
                continue
            name = obj["/T"]
            rect = obj["/Rect"]
            if not name or not rect:
                # Same filter as before: empty names / empty rectangles are skipped.
                continue
            x1, y1, x2, y2 = rect
            fields.append({
                "field_id": name,
                "page":     page_num,
                "coords":   [x1, y1, x2, y2],
            })
    return fields

In [4]:
def group_widgets(fields: list[dict]) -> list[dict]:
    """
    Merge fields whose "field_id" has the same text before its first '['.

    Example: "USCitizen[0]" + "USCitizen[1]" → one entry with base "USCitizen".
    Note that "USCitizen_Y[0]" and "USCitizen_N[0]" have different bases
    ("USCitizen_Y" / "USCitizen_N") and are therefore NOT merged.

    Returns one entry per base, in order of first appearance:
      - a base with a single field yields that field dict unchanged;
      - a base with several fields yields
        {"field_id": <base>, "widgets": [<the original field dicts>]}.
    """
    by_base: dict = {}
    for field in fields:
        prefix = field["field_id"].split("[", 1)[0]   # text before the first '['
        if prefix not in by_base:
            by_base[prefix] = []
        by_base[prefix].append(field)

    # Singletons pass through untouched; larger buckets become a synthetic
    # group entry that carries its member widgets along.
    return [
        members[0] if len(members) == 1 else {"field_id": prefix, "widgets": members}
        for prefix, members in by_base.items()
    ]


In [5]:
from google import genai
import json, textwrap

# 1️⃣  System-prompt template ---------------------------------------------
# System prompt sent to the model, filled in with str.format() by match_fields().
# Placeholders: {immigration_doc_id} (form identifier) and {lang} (target language).
# Every literal brace of the JSON example is doubled ({{ / }}) so that str.format()
# emits a single brace instead of treating it as a placeholder.
# textwrap.dedent(...).strip() removes the common indentation and the
# leading/trailing blank lines introduced by the triple-quoted layout.
# NOTE(review): the text hard-codes "Employment Authorization" even though the
# form id is a placeholder — confirm the wording before reusing it for other forms.
QUESTION_SYSTEM_TMPL = textwrap.dedent("""
  You are an assistant that writes user-friendly questions for the {immigration_doc_id}
  Employment Authorization form.

  Target language: {lang}

  ## … rules omitted …

  ## Return value (minified JSON only)
  {{
    "questions": [
      {{
        "field_id": "base_id",
        "input_type": "text" | "radio_single" | "checkbox_multi",
        "options": [
          {{
            "option_id": "USCitizen_Y[0]",
            "label": "Yes",
            "coords": [260.161, 514.001, 270.161, 524.001]
          }},
          {{
            "option_id": "USCitizen_N[0]",
            "label": "No",
            "coords": [...]
          }}
        ],
        "question": "...",
        "explanation": "...",
        "cultural_notes": "...",
        "translation_notes": "...",
        "examples": [],
        "format_hint": "",
        "required": true
      }}
    ]
  }}
""").strip()

def match_fields(
    fields: list[dict],           # grouped field list, e.g. the output of group_widgets()
    instructions: str,            # full text of the instructions PDF
    form_id: str,                 # form identifier inserted into the system prompt, e.g. "I-765"
    target_lang: str = "English"  # e.g. "Spanish", "Vietnamese"
) -> dict:
    """
    Ask Gemini to write user-facing questions for the given form fields.

    The system prompt is QUESTION_SYSTEM_TMPL with the form id and target
    language filled in. The user payload is the JSON-encoded field list,
    a blank line, then the instructions text.

    Uses the notebook-level `client` and `MODEL` from the setup cell, so the
    request goes through the client that was configured with the API key
    instead of a fresh, unconfigured `genai.Client()` on every call.

    Returns:
        The model's JSON reply parsed into a dict (the system prompt asks for
        a top-level "questions" list).

    Raises:
        RuntimeError: if the response carries no text.
        json.JSONDecodeError: if the response text is not valid JSON.
    """
    system_prompt = QUESTION_SYSTEM_TMPL.format(
        lang=target_lang,
        immigration_doc_id=form_id,
    )

    # Payload layout: field list as JSON ➜ blank line ➜ instructions text.
    # ensure_ascii=False keeps non-ASCII field names readable for the model.
    payload = json.dumps(fields, ensure_ascii=False) + "\n\n" + instructions

    resp = client.models.generate_content(
        model=MODEL,
        contents=payload,
        config=types.GenerateContentConfig(
            system_instruction=system_prompt,
            temperature=0.2,                      # allow slight creativity
            response_mime_type="application/json",
            thinking_config=types.ThinkingConfig( # keep latency low
                thinking_budget=0
            ),
        ),
    )

    if resp.text is None:
        raise RuntimeError("Model returned no text; cannot parse JSON")

    return json.loads(resp.text)


In [6]:
# Form to process; it also selects the static/forms/<form_id>/ directory that the
# form PDF is read from here and that the cache file is written to later.
form_id = "I-765"

# Every named annotation in the form PDF, with its page number and rectangle.
raw = extract_fields_with_coords(f"static/forms/{form_id}/form.pdf")

In [7]:
# Collapse fields that share a base name into one grouped entry.
grouped = group_widgets(raw)

# NOTE(review): match_fields documents its second argument as the full text of the
# instructions PDF, but the unformatted system-prompt template is passed here, so
# the model never sees the form instructions. TODO: load the instructions text and
# pass it instead — its location is not visible in this notebook.
processed = match_fields(grouped, QUESTION_SYSTEM_TMPL, form_id)

In [9]:
import json, os, pathlib

# Destination: cache.json inside the form's own directory.
cache_path = pathlib.Path(f"static/forms/{form_id}/cache.json")

# Serialize before opening the file: if `processed` cannot be encoded, the
# exception is raised here and an existing cache.json is left intact instead of
# being truncated by open("w").
cache_json = json.dumps(processed, indent=2, ensure_ascii=False)

with cache_path.open("w", encoding="utf-8") as f:
    f.write(cache_json)