In [1]:
import json
import logging
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from google.genai import Client
from sqlalchemy import create_engine

from src import prompts
from src import utils
from src.batch import run_iterative_batch, parse_result_google_text
from src.builders import make_build_tag_extractor_requests
from src.builders import make_build_tag_extractor_requests

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package) are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [3]:
# Load environment variables via python-dotenv (presumably from a local .env
# file); the trailing semicolon keeps the return value out of the cell output.
load_dotenv();

In [4]:
# Configure root logging at INFO level; every record is prefixed with its
# timestamp, severity, and the name of the logger that emitted it.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)

In [5]:
def _require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset, so a missing setting fails
            here with a clear message instead of being rendered as the
            literal text "None" inside the connection URL.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name} is not set; check your .env file"
        )
    return value


# Credentials are percent-encoded so that characters with a meaning inside a
# URL (such as "@", ":", "/" or "%") cannot corrupt it. `safe=""` also encodes
# "/" and encodes spaces as %20 rather than "+".
_db_credentials = quote(_require_env("DB_USER"), safe="")
_db_password = os.getenv("DB_PASSWORD")
if _db_password is not None:
    # The password part is optional: leave it out entirely when unset.
    _db_credentials += ":" + quote(_db_password, safe="")

connection_string = (
    "postgresql://{credentials}@{host}:{port}/{db}".format(
        credentials=_db_credentials,
        host=_require_env("DB_HOST"),
        port=_require_env("DB_PORT"),
        db=_require_env("DB_NAME"),
    )
)
engine = create_engine(connection_string)

In [6]:
# Fetch the sites table through the project helper, using the database engine.
sites = utils.get_sites(engine)

# Google GenAI client, authenticated with the key held in GOOGLE_API_KEY.
client = Client(
    api_key=os.getenv("GOOGLE_API_KEY"),
)

In [7]:
# JSONL outputs of the earlier researcher batch jobs, keyed by the keyword
# argument under which each one is handed to the request-builder factory.
researcher_result_paths = {
    "overview_jsonl_path": "overview_researcher_batch_job-1_final_results.jsonl",
    "access_jsonl_path": "access_researcher_batch_job-1_final_results.jsonl",
    "risk_jsonl_path": "risk_researcher_batch_job-2_final_results.jsonl",
}

# `build_requests` is later passed to run_iterative_batch as build_requests_fn.
build_requests = make_build_tag_extractor_requests(
    sites_df=sites,
    prompt_template=prompts.tag_extractor_instructions,
    **researcher_result_paths,
)

In [8]:
# Iteratively retry via generalized batch module, then persist the outcome:
# one JSONL file with the resolved text per site and one JSON file listing the
# site_ids that were still unresolved when the run finished.
JOB_NAME = "tag_extractor_batch_job-1"
MAX_RETRIES = 10         # forwarded to run_iterative_batch as max_retries
POLL_INTERVAL_SEC = 30   # forwarded to run_iterative_batch as poll_interval_sec

# .tolist() yields native Python ints rather than numpy scalars, so the ids
# stay JSON-serializable wherever they are used downstream.
all_site_ids = sites["site_id"].astype(int).tolist()

resolved, unresolved = run_iterative_batch(
    client=client,
    items=all_site_ids,
    build_requests_fn=build_requests,
    parse_result_fn=parse_result_google_text,
    job_name_prefix=JOB_NAME,
    max_retries=MAX_RETRIES,
    poll_interval_sec=POLL_INTERVAL_SEC,
)

print(f"Resolved: {len(resolved)}  Unresolved: {len(unresolved)}")

# Persist results keyed by site_id (one JSON object per line, sorted by id).
# ensure_ascii=False keeps non-ASCII text readable, hence the explicit UTF-8.
ok_out = f"{JOB_NAME}_final_results.jsonl"
with open(ok_out, "w", encoding="utf-8") as f:
    for sid in sorted(resolved.keys()):
        f.write(json.dumps({"site_id": int(sid), "text": resolved[sid]}, ensure_ascii=False) + "\n")

# Persist unresolved site_ids for inspection/resubmission.
# Same explicit encoding as the results file, for consistency across platforms.
failed_out = f"{JOB_NAME}_unresolved_site_ids.json"
with open(failed_out, "w", encoding="utf-8") as f:
    json.dump(sorted(int(x) for x in unresolved), f)

2025-08-26 20:39:54,266 INFO src.batch: Attempt 1: processing 248 items
2025-08-26 20:39:54,329 INFO src.batch: Submitting batch 'tag_extractor_batch_job-1_attempt_1-1756233594' with 248 requests
2025-08-26 20:39:55,029 INFO httpx: HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files "HTTP/1.1 200 OK"
2025-08-26 20:39:56,993 INFO httpx: HTTP Request: POST https://generativelanguage.googleapis.com/upload/v1beta/files?upload_id=ABgVH88Rdj3NHth0zRYLQH_GzsAgEYpvovefpZGxgWQFOprcrvqWBSdM10PzWiz09C6n3aelFbkbxIBznaDZO18ZbUaP5uVOu4X9g7WByfCmdPo&upload_protocol=resumable "HTTP/1.1 200 OK"
2025-08-26 20:39:59,774 INFO httpx: HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:batchGenerateContent "HTTP/1.1 200 OK"
2025-08-26 20:39:59,777 INFO src.batch: Created batch job 'batches/i57la6clh015zrnebls2k4ikuly6cbqbjqbt' (model=gemini-2.5-pro)
2025-08-26 20:39:59,886 INFO httpx: HTTP Request: GET https://generativelanguage.googleapis.c

Resolved: 248  Unresolved: 0
