# iTELL Volume Generation Pipeline - Testing Notebook

Test all three prompt strategies against your input PDF.

In [14]:
import sys
import os
import json
import re
from pathlib import Path
from dotenv import load_dotenv

# Put the directory above the notebook's working directory at the front of
# sys.path so the `pipeline` imports that follow can be resolved from it.
parent_dir = Path.cwd().parent
_parent_entry = str(parent_dir)
if sys.path.count(_parent_entry) == 0:
    sys.path.insert(0, _parent_entry)

from pipeline.gemini_client import OpenAIClient
from pipeline.extract_images import ExtractImages
from pipeline.utils import (
    build_conversion_prompt,
    encode_pdf_to_base64,
    format_image_metadata,
    load_guide_instructions,
    load_reference_json,
    select_reference_example,
)

# The project root sits one level above `parent_dir`; environment variables
# are loaded from the `.env` file located there.
project_root = parent_dir.parent
_dotenv_file = project_root / ".env"
load_dotenv(_dotenv_file)

# Report the resolved locations so a wrong working directory is easy to spot.
for _banner_line in (
    "✓ Imports successful",
    f"✓ Working directory: {Path.cwd()}",
    f"✓ Project root: {project_root}",
):
    print(_banner_line)

✓ Imports successful
✓ Working directory: /Users/kalidindiadithya/Documents/itell-volume-generation/src/notebooks
✓ Project root: /Users/kalidindiadithya/Documents/itell-volume-generation


## Configuration

Set the PDF file path to test different input documents. Output files will be named: `itell_{PDF_NAME}_{STRATEGY}.json`

In [15]:
# Input document and the directory that receives the images extracted from it.
PDF_PATH = project_root / "src" / "resources" / "input.pdf"
IMAGE_DIR = project_root / "extractedimages"

# One guide file per prompt strategy; the keys are also used as the suffix of
# each strategy's output file name.
_PROMPTS_DIR = project_root / "prompts"
STRATEGY_FILES = {
    strategy_name: _PROMPTS_DIR / f"guide_{strategy_name}.md"
    for strategy_name in ("strategy1_cot", "strategy2_fewshot", "strategy3_validation")
}

REFERENCE_JSON_PATH = _PROMPTS_DIR / "reference.json"
PDF_NAME = PDF_PATH.stem

# Model id: first non-empty of OPENROUTER_MODEL, OPENAI_MODEL, then the default.
MODEL = next(
    (value for value in map(os.getenv, ("OPENROUTER_MODEL", "OPENAI_MODEL")) if value),
    "google/gemini-2.5-flash",
)

# Passed to the client as `max_completion_tokens`.
# NOTE(review): the saved run of this notebook reports "JSON parsing error" for
# every strategy; a 4000-token cap may be truncating the response - confirm.
MAX_TOKENS = 4000

print("PDF:", PDF_PATH.name)
print("Model:", MODEL)

PDF: input.pdf
Model: google/gemini-2.5-flash


## Setup

Initialize all components needed for testing.

In [16]:
# Reference JSON and the example selected from it (both feed the prompt
# builder in the strategy loop).
reference_json = load_reference_json(REFERENCE_JSON_PATH)
example_json = select_reference_example(reference_json, example_title=None)

# Extract the PDF's images, persist their metadata next to them, and render
# that metadata as text for inclusion in the prompt.
_pdf_path_str = str(PDF_PATH)
extractor = ExtractImages(_pdf_path_str, str(IMAGE_DIR))
image_metadata = extractor.extract_img(_pdf_path_str)
extractor.save_metadata(str(IMAGE_DIR / "metadata.json"))
image_metadata_text = format_image_metadata(image_metadata)

pdf_b64 = encode_pdf_to_base64(PDF_PATH)

# Credentials: an OpenAI key wins when both are present.
# NOTE(review): MODEL in the configuration cell prefers OPENROUTER_MODEL while
# the key here prefers OPENAI_API_KEY; with both providers configured the two
# may not match - confirm the intended precedence.
openai_key = os.getenv("OPENAI_API_KEY")
openrouter_key = os.getenv("OPENROUTER_API_KEY")
api_key = openai_key if openai_key else openrouter_key
if not api_key:
    raise RuntimeError("Set OPENAI_API_KEY or OPENROUTER_API_KEY in .env file")

# OpenRouter is used only when its key is the sole key available.
using_openrouter = (not openai_key) and bool(openrouter_key)

if not using_openrouter:
    base_url = os.getenv("OPENAI_BASE_URL") or None
    default_headers = None
else:
    base_url = os.getenv("OPENROUTER_BASE_URL") or "https://openrouter.ai/api/v1"
    # Optional headers, each sent only when its environment variable is set.
    referer = os.getenv("OPENROUTER_SITE_URL")
    app_name = os.getenv("OPENROUTER_APP_NAME")
    default_headers = {
        header: value
        for header, value in (("HTTP-Referer", referer), ("X-Title", app_name))
        if value
    }

_client_options = {
    "model": MODEL,
    "api_key": api_key,
    "base_url": base_url,
    "max_completion_tokens": MAX_TOKENS,
    "default_headers": default_headers,
}
client = OpenAIClient(**_client_options)

print("Extracted", len(image_metadata), "images")
print("Encoded PDF (%d characters)" % len(pdf_b64))
print("Initialized client")

Page 1: 1 image(s)
Page 2: 1 image(s)
Page 3: 1 image(s)
Extracted 3 images
Encoded PDF (1732788 characters)
Initialized client


## Test All Strategies

Runs each of the three prompt strategies against the configured PDF and writes one output file per strategy to the `results/` directory:
- `itell_{PDF_NAME}_strategy1_cot.json`
- `itell_{PDF_NAME}_strategy2_fewshot.json`
- `itell_{PDF_NAME}_strategy3_validation.json`

In [17]:
def _one_line(text, limit=60):
    """Collapse whitespace in `text` and cap it at `limit` characters for table display."""
    flat = " ".join(str(text).split())
    return flat if len(flat) <= limit else flat[: limit - 3] + "..."


def _inspect_response(raw_text):
    """Check an LLM response and count its pages and chunks.

    Args:
        raw_text: The response text exactly as returned by the client.

    Returns:
        A `(status, pages, chunks, detail)` tuple. `status` is "Valid JSON",
        "JSON parsing error", or "Unexpected JSON shape"; `pages` and `chunks`
        are integers for valid responses and "N/A" otherwise; `detail` is a
        human-readable explanation of a failure ("" on success).
    """
    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        # Keep the decoder's message and position so the failure can be diagnosed.
        detail = f"{exc.msg} (line {exc.lineno}, column {exc.colno})"
        if raw_text.lstrip().startswith("```"):
            detail += "; response starts with a Markdown code fence"
        return "JSON parsing error", "N/A", "N/A", detail

    # Well-formed JSON can still have the wrong structure (e.g. a top-level
    # list); report that explicitly instead of failing on `.get`.
    pages = parsed.get("Pages", []) if isinstance(parsed, dict) else None
    if not isinstance(pages, list) or not all(isinstance(page, dict) for page in pages):
        return (
            "Unexpected JSON shape",
            "N/A",
            "N/A",
            'expected an object with a "Pages" list of objects',
        )

    # Only list-valued "Content" entries contribute chunks.
    chunk_total = sum(
        len(page["Content"]) for page in pages if isinstance(page.get("Content"), list)
    )
    return "Valid JSON", len(pages), chunk_total, ""


strategies_to_test = ["strategy1_cot", "strategy2_fewshot", "strategy3_validation"]
results_summary = []

# Create the output directory once, up front, so the final "Results saved to"
# line always points at an existing directory.
results_dir = project_root / "results"
results_dir.mkdir(parents=True, exist_ok=True)

for strategy in strategies_to_test:
    print(f"Testing {strategy}...")
    guide_path = STRATEGY_FILES[strategy]
    guide_text = load_guide_instructions(guide_path)
    prompt = build_conversion_prompt(guide_text, example_json, image_metadata_text=image_metadata_text)

    output_filename = f"itell_{PDF_NAME}_{strategy}.json"
    output_path = results_dir / output_filename

    print(f"Strategy: {strategy}")
    print(f"Guide: {guide_path.name}")
    print(f"Output: {output_filename}")
    print("\nCalling LLM...")

    try:
        result_json = client.generate_itell_json(
            pdf_filename=PDF_PATH.name,
            pdf_base64=pdf_b64,
            prompt=prompt
        )

        # The raw response is saved before validation so that a malformed
        # answer remains available for inspection.
        output_path.write_text(result_json, encoding="utf-8")

        status, page_count, chunk_count, detail = _inspect_response(result_json)

        # Count Markdown image references of the form ![alt](image_page_<p>_<i>).
        image_count = len(re.findall(r'!\[.*?\]\(image_page_\d+_\d+\)', result_json))

        results_summary.append({
            "strategy": strategy,
            "output_file": output_filename,
            "status": status,
            "detail": detail,
            "pages": page_count,
            "chunks": chunk_count,
            "images": image_count
        })

        print(f"✓ Saved to {output_path}")
        print(f"{status}: {detail}" if detail else status)
        print(f"Pages: {page_count}, Chunks: {chunk_count}, Images: {image_count}\n")

    except Exception as e:
        # Deliberately broad: one failing strategy must not prevent the others
        # from running. The exception type is included because some exceptions
        # carry an empty message.
        error_text = f"✗ Error: {type(e).__name__}: {e}"
        print(f"{error_text}\n")
        results_summary.append({
            "strategy": strategy,
            "output_file": output_filename,
            "status": error_text,
            "detail": "",
            "pages": "N/A",
            "chunks": "N/A",
            "images": "N/A"
        })

# Size the status column to the longest (display-trimmed) status so that long
# error messages do not push the numeric columns out of alignment.
status_width = max([20] + [len(_one_line(row["status"])) for row in results_summary])

print("SUMMARY")
print(f"{'Strategy':<25} {'Status':<{status_width}} {'Pages':<8} {'Chunks':<8} {'Images':<8}")
for result in results_summary:
    print(f"{result['strategy']:<25} {_one_line(result['status']):<{status_width}} {str(result['pages']):<8} {str(result['chunks']):<8} {str(result['images']):<8}")

print(f"\nResults saved to: {results_dir}")

Testing strategy1_cot...
Strategy: strategy1_cot
Guide: guide_strategy1_cot.md
Output: itell_input_strategy1_cot.json

Calling LLM...
✓ Saved to /Users/kalidindiadithya/Documents/itell-volume-generation/results/itell_input_strategy1_cot.json
JSON parsing error
Pages: N/A, Chunks: N/A, Images: 6

Testing strategy2_fewshot...
Strategy: strategy2_fewshot
Guide: guide_strategy2_fewshot.md
Output: itell_input_strategy2_fewshot.json

Calling LLM...
✓ Saved to /Users/kalidindiadithya/Documents/itell-volume-generation/results/itell_input_strategy2_fewshot.json
JSON parsing error
Pages: N/A, Chunks: N/A, Images: 0

Testing strategy3_validation...
Strategy: strategy3_validation
Guide: guide_strategy3_validation.md
Output: itell_input_strategy3_validation.json

Calling LLM...
✓ Saved to /Users/kalidindiadithya/Documents/itell-volume-generation/results/itell_input_strategy3_validation.json
JSON parsing error
Pages: N/A, Chunks: N/A, Images: 0

SUMMARY
Strategy                  Status              