In [1]:
import json
import logging
import os

import pandas as pd
from dotenv import load_dotenv
from google.genai import Client
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

from src import prompts
from src import utils
from src.batch import run_iterative_batch, parse_result_google_text
from src.builders import make_build_tag_extractor_requests, build_requests_for_site_ids

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to the imported project code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
# Load variables from a .env file into the process environment so the
# os.getenv() lookups in later cells can see them. The trailing semicolon
# keeps the call's return value from being echoed as cell output.
load_dotenv();

In [4]:
# Root logger setup: INFO and above, each record prefixed with timestamp,
# level and logger name.
LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s: %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

In [5]:
# Build the PostgreSQL connection URL from environment variables.
#
# URL.create() escapes every component, so credentials containing characters
# such as "@", ":", "/" or "%" no longer corrupt the URL the way plain string
# formatting did. Components whose variable is unset are omitted instead of
# being rendered as the literal text "None".
db_port = os.getenv("DB_PORT")
connection_url = URL.create(
    drivername="postgresql",
    username=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    # A non-numeric DB_PORT raises ValueError here rather than failing later
    # inside the driver.
    port=int(db_port) if db_port else None,
    database=os.getenv("DB_NAME"),
)

# Kept as a plain string for any code that still expects the old variable.
# It contains the (escaped) password, so do not print or log it.
connection_string = connection_url.render_as_string(hide_password=False)

engine = create_engine(connection_url)

In [6]:
# Fetch the sites table through the shared engine; later cells read its
# "site_id" column.
sites = utils.get_sites(engine)

# Gemini client, authenticated with the API key taken from the environment.
client = Client(api_key=os.environ.get("GOOGLE_API_KEY"))

In [None]:
# JSONL inputs consumed by the tag extractor, keyed by the keyword argument
# each one is passed as.
# NOTE(review): the overview filename is spelled "reuslt" (and is singular),
# unlike its two siblings, and none of the three paths carries the "../"
# prefix that the results-writing cell of this notebook uses. Confirm they
# match the files on disk before renaming anything.
stage_result_paths = {
    "overview_jsonl_path": "batch_results/overview_full_spec_final_reuslt.jsonl",
    "access_jsonl_path": "batch_results/access_full_spec_final_results.jsonl",
    "risk_jsonl_path": "batch_results/risk_full_spec_final_results.jsonl",
}

# Request builder that is later handed to run_iterative_batch as
# build_requests_fn.
build_requests = make_build_tag_extractor_requests(
    sites_df=sites,
    prompt_template=prompts.tag_extractor_instructions,
    engine=engine,
    **stage_result_paths,
)

# Alternative builder, kept for reference:
# def build_requests(site_ids: list[int]) -> list[dict]:
#     return build_requests_for_site_ids(site_ids, sites, prompts.risk_researcher_instructions, engine)

In [8]:
# Run the tag-extraction batch over every site through the generalized batch
# module, then persist both the resolved texts and the unresolved site ids.
JOB_NAME = "tags_full_spec"
RESULTS_DIR = "../batch_results"  # single source of truth for the output folder

all_site_ids = list(sites["site_id"].astype(int).values)

# `resolved` is used below as a mapping of site_id -> text and `unresolved`
# as an iterable of site ids (presumably those still failing after the
# retries — confirm against src.batch).
resolved, unresolved = run_iterative_batch(
    client=client,
    items=all_site_ids,
    build_requests_fn=build_requests,
    parse_result_fn=parse_result_google_text,
    job_name_prefix=JOB_NAME,
    max_retries=10,
    poll_interval_sec=30,
)

print(f"Resolved: {len(resolved)}  Unresolved: {len(unresolved)}")

# Ensure the output directory exists (os is imported in the imports cell).
os.makedirs(RESULTS_DIR, exist_ok=True)

# Persist results keyed by site_id, one JSON object per line, in ascending
# site_id order. int() normalizes NumPy integer keys so json can serialize them.
ok_out = f"{RESULTS_DIR}/{JOB_NAME}_final_results.jsonl"
with open(ok_out, "w", encoding="utf-8") as f:
    for sid in sorted(resolved.keys()):
        f.write(json.dumps({"site_id": int(sid), "text": resolved[sid]}, ensure_ascii=False) + "\n")

# Persist unresolved site_ids for inspection/resubmission. The encoding is
# explicit so both output files are written the same way on every platform.
failed_out = f"{RESULTS_DIR}/{JOB_NAME}_unresolved_site_ids.json"
with open(failed_out, "w", encoding="utf-8") as f:
    json.dump(sorted(int(x) for x in unresolved), f)

2025-08-31 20:29:27,057 INFO src.batch: Attempt 1: processing 248 items
2025-08-31 20:29:31,000 INFO src.batch: Submitting batch 'tags_full_spec_attempt_1-1756664967' with 248 requests
2025-08-31 20:29:31,846 INFO httpx: HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
2025-08-31 20:29:34,215 INFO httpx: HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH89Me6EWrjuuhLHrvzDTwCwHru6YUNtX-RXMmzjTZH9aWQNqyI8zXtG_s8vkZz6g7tba6TKhHvzIdBOpjKq3i60HSpTf-6jxWcWzAKihbt8&upload_protocol=resumable "HTTP/1.1 200 OK"
2025-08-31 20:29:36,576 INFO httpx: HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:batchGenerateContent "HTTP/1.1 200 OK"
2025-08-31 20:29:36,581 INFO src.batch: Created batch job 'batches/u3gh6c1a7q6rw6unged3loj2e6fvd9gvqooe' (model=gemini-2.5-pro)
2025-08-31 20:29:36,744 INFO httpx: HTTP Request: GET https://generativelanguage.googleapis.com/v1beta/b

Resolved: 248  Unresolved: 0
