# iTELL Volume Generation Pipeline - Testing Notebook

Test all three prompt strategies against your input PDF.

In [10]:
import sys
import os
import json
import re
from pathlib import Path
from dotenv import load_dotenv

# Put the directory above the notebook's working directory at the front of
# sys.path so the `pipeline` imports below can be resolved from it.
parent_dir = Path.cwd().parent
_parent_dir_entry = str(parent_dir)
if _parent_dir_entry not in sys.path:
    sys.path.insert(0, _parent_dir_entry)

# If an earlier run already imported the client module, reload it so that
# on-disk edits take effect without restarting the kernel.
import importlib
_client_module_name = 'pipeline.gemini_client'
if _client_module_name in sys.modules:
    importlib.reload(sys.modules[_client_module_name])

from pipeline.gemini_client import GeminiClient, OpenAIClient
from pipeline.extract_images import ExtractImages
from pipeline.utils import (
    build_conversion_prompt,
    encode_pdf_to_base64,
    format_image_metadata,
    load_guide_instructions,
    load_reference_json,
    select_reference_example,
)

project_root = parent_dir.parent

# override=True makes values from .env replace variables that are already set
# in this process, so edits to .env are picked up on re-run.
env_path = project_root.joinpath('.env')
load_dotenv(env_path, override=True)

# Report the resolved locations so a wrong working directory is easy to spot.
for _status_line in (
    "✓ Imports successful",
    f"✓ Working directory: {Path.cwd()}",
    f"✓ Project root: {project_root}",
    f"✓ .env file path: {env_path}",
    f"✓ .env file exists: {env_path.exists()}",
):
    print(_status_line)

✓ Imports successful
✓ Working directory: /Users/kalidindiadithya/Documents/itell-volume-generation/src/notebooks
✓ Project root: /Users/kalidindiadithya/Documents/itell-volume-generation
✓ .env file path: /Users/kalidindiadithya/Documents/itell-volume-generation/.env
✓ .env file exists: True


## Configuration

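Set the input PDF path, the prompt-strategy guide files, and the LLM provider (`"gemini"`, `"openrouter"`, or `"openai"`) here. Change `PDF_PATH` to test a different input document. Output files will be named: `itell_{PDF_NAME}_{STRATEGY}.json`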

In [11]:
# Input document and the directory that receives images extracted from it.
PDF_PATH = project_root / "data" / "input.pdf"
IMAGE_DIR = project_root / "data" / "extractedimages"

# Prompt-guide file for each strategy under test, keyed by strategy name.
STRATEGY_FILES = {
    "strategy1_cot": project_root / "prompts/guide_strategy1_cot.md",
    "strategy2_fewshot": project_root / "prompts/guide_strategy2_fewshot.md",
    "strategy3_validation": project_root / "prompts/guide_strategy3_validation.md",
}

REFERENCE_JSON_PATH = project_root / "prompts/reference.json"
# File name without extension; used when naming the output files.
PDF_NAME = PDF_PATH.stem

# Set provider: "gemini", "openrouter", or "openai"
PROVIDER = "gemini"

# Per-provider settings:
#   (environment variable that overrides the model, fallback model, output-token limit)
PROVIDER_DEFAULTS = {
    "gemini": ("GEMINI_MODEL", "gemini-2.5-flash", 8192),
    "openrouter": ("OPENROUTER_MODEL", "google/gemini-2.5-flash", 99999),
    "openai": ("OPENAI_MODEL", "gpt-4o", 16384),
}

# Any other value has always been treated as OpenAI; keep that fallback but make
# it visible, so a typo such as "Gemini" does not silently switch providers.
if PROVIDER not in PROVIDER_DEFAULTS:
    print(f"Warning: unrecognized PROVIDER {PROVIDER!r}; falling back to OpenAI settings")

model_env_var, default_model, MAX_TOKENS = PROVIDER_DEFAULTS.get(
    PROVIDER, PROVIDER_DEFAULTS["openai"]
)
# An unset or empty environment variable falls back to the default model.
MODEL = os.getenv(model_env_var) or default_model

print(f"PDF: {PDF_PATH.name}")
print(f"Provider: {PROVIDER}")
print(f"Model: {MODEL}")

PDF: input.pdf
Provider: gemini
Model: gemini-2.5-flash


## Setup

Initialize all components needed for testing.

In [12]:
def _require_api_key(env_var):
    """Return the value of ``env_var``; raise RuntimeError if it is unset or empty."""
    value = os.getenv(env_var)
    if not value:
        raise RuntimeError(f"Set {env_var} in .env file")
    return value


# Reference example that is embedded in the conversion prompt.
reference_json = load_reference_json(REFERENCE_JSON_PATH)
example_json = select_reference_example(reference_json, example_title=None)

# Extract the PDF's images, save their metadata next to them, and render the
# metadata as text for the prompt.
extractor = ExtractImages(str(PDF_PATH), str(IMAGE_DIR))
image_metadata = extractor.extract_img()
extractor.save_metadata(str(IMAGE_DIR / "metadata.json"))
image_metadata_text = format_image_metadata(image_metadata)

pdf_b64 = encode_pdf_to_base64(PDF_PATH)

# Debug: report whether the keys are present without printing their values.
print("Checking environment variables...")
for _key_name in ("GEMINI_API_KEY", "OPENROUTER_API_KEY"):
    print(f"{_key_name} present: {bool(os.getenv(_key_name))}")
print()

# Build the client that matches the configured provider.
if PROVIDER == "gemini":
    api_key = _require_api_key("GEMINI_API_KEY")

    client = GeminiClient(
        model=MODEL,
        api_key=api_key,
        max_output_tokens=MAX_TOKENS,
    )
    print("Using Gemini API")

elif PROVIDER == "openrouter":
    api_key = _require_api_key("OPENROUTER_API_KEY")

    base_url = os.getenv("OPENROUTER_BASE_URL") or "https://openrouter.ai/api/v1"

    # Optional headers; each is sent only when its environment variable is set.
    default_headers = {}
    for _header_name, _header_env in (
        ("HTTP-Referer", "OPENROUTER_SITE_URL"),
        ("X-Title", "OPENROUTER_APP_NAME"),
    ):
        _header_value = os.getenv(_header_env)
        if _header_value:
            default_headers[_header_name] = _header_value

    client = OpenAIClient(
        model=MODEL,
        api_key=api_key,
        base_url=base_url,
        max_completion_tokens=MAX_TOKENS,
        default_headers=default_headers,
    )
    print("Using OpenRouter API")

else:  # openai (also reached for any other PROVIDER value)
    api_key = _require_api_key("OPENAI_API_KEY")

    # An empty OPENAI_BASE_URL is passed on as None.
    base_url = os.getenv("OPENAI_BASE_URL") or None

    client = OpenAIClient(
        model=MODEL,
        api_key=api_key,
        base_url=base_url,
        max_completion_tokens=MAX_TOKENS,
    )
    print("Using OpenAI API")

print(f"Extracted {len(image_metadata)} images")
print(f"Encoded PDF ({len(pdf_b64)} characters)")
print("Initialized client")

Page 1: 1 image(s)
Page 2: 1 image(s)
Page 3: 1 image(s)
Checking environment variables...
GEMINI_API_KEY present: True
OPENROUTER_API_KEY present: True

Using Gemini API
Extracted 3 images
Encoded PDF (1732788 characters)
Initialized client


## Test All Strategies

Runs every prompt strategy against the input PDF and, for each strategy that succeeds, writes one output file to the `results/` directory:
- `itell_{PDF_NAME}_strategy1_cot.json`
- `itell_{PDF_NAME}_strategy2_fewshot.json`
- `itell_{PDF_NAME}_strategy3_validation.json`

In [None]:
# Run every configured prompt strategy against the PDF, write each successful
# result to results/itell_{PDF_NAME}_{strategy}.json, and print a summary table.
import traceback

from pipeline.models import NewVolume

# Width of the Status column in the summary table; statuses are shortened to fit.
STATUS_WIDTH = 30
# Markdown image references of the form ![alt](image_page_<page>_<index>).
IMAGE_REF_PATTERN = re.compile(r'!\[.*?\]\(image_page_\d+_\d+\)')


def _fit_status(text, width=STATUS_WIDTH):
    """Collapse whitespace in ``text`` and shorten it to at most ``width`` characters.

    A trailing "..." is added only when the text was actually cut, so the
    summary table keeps its column alignment.
    """
    flat = " ".join(text.split())
    if len(flat) <= width:
        return flat
    return flat[: width - 3] + "..."


# Test every strategy that has a guide file configured (insertion order).
strategies_to_test = list(STRATEGY_FILES)
results_dir = project_root / "results"
results_summary = []
saved_count = 0

for strategy in strategies_to_test:
    print(f"Testing {strategy}...") 
    guide_path = STRATEGY_FILES[strategy]
    guide_text = load_guide_instructions(guide_path)
    prompt = build_conversion_prompt(guide_text, example_json, image_metadata_text=image_metadata_text)

    output_filename = f"itell_{PDF_NAME}_{strategy}.json"
    output_path = results_dir / output_filename

    print(f"Strategy: {strategy}")
    print(f"Guide: {guide_path.name}")
    print(f"Output: {output_filename}")
    print("\nCalling LLM with structured output...")

    try:
        # Use structured output with Pydantic validation
        result = client.generate_itell_structured(
            pdf_filename=PDF_PATH.name,
            pdf_base64=pdf_b64,
            prompt=prompt,
            response_format=NewVolume
        )

        # Convert Pydantic model to JSON with proper aliasing
        result_json = result.model_dump_json(indent=2, by_alias=True)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(result_json, encoding="utf-8")
        saved_count += 1

        # Get stats from the Pydantic model
        page_count = len(result.Pages)
        chunk_count = sum(len(page.Content) for page in result.Pages)
        status = "Valid structured output"

        # Count image references in the serialized JSON text.
        image_count = len(IMAGE_REF_PATTERN.findall(result_json))

        results_summary.append({
            "strategy": strategy,
            "output_file": output_filename,
            "status": status,
            "pages": page_count,
            "chunks": chunk_count,
            "images": image_count
        })

        print(f"✓ Saved to {output_path}")
        print(f"{status}")
        print(f"Pages: {page_count}, Chunks: {chunk_count}, Images: {image_count}\n")

    except Exception as exc:
        # Deliberately broad: record the failure and continue, so one failing
        # strategy does not prevent the remaining strategies from running.
        print(f"Error: {exc}\n")
        traceback.print_exc()
        results_summary.append({
            "strategy": strategy,
            "output_file": output_filename,
            "status": _fit_status(f"Error: {exc}"),
            "pages": "N/A",
            "chunks": "N/A",
            "images": "N/A"
        })

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"{'Strategy':<25} {'Status':<{STATUS_WIDTH}} {'Pages':<8} {'Chunks':<8} {'Images':<8}")
print("-"*80)
# `row` (not `result`) so the last structured-output model is not overwritten.
for row in results_summary:
    print(f"{row['strategy']:<25} {row['status']:<{STATUS_WIDTH}} {str(row['pages']):<8} {str(row['chunks']):<8} {str(row['images']):<8}")
print("="*80)
if saved_count:
    print(f"\nResults saved to: {results_dir}")
else:
    print("\nNo results were saved: every strategy failed.")

Testing strategy1_cot...
Strategy: strategy1_cot
Guide: guide_strategy1_cot.md
Output: itell_input_strategy1_cot.json

Calling LLM with structured output...
✗ Error: Unknown field for Schema: $defs

Testing strategy2_fewshot...
Strategy: strategy2_fewshot
Guide: guide_strategy2_fewshot.md
Output: itell_input_strategy2_fewshot.json

Calling LLM with structured output...
✗ Error: Unknown field for Schema: $defs

Testing strategy3_validation...
Strategy: strategy3_validation
Guide: guide_strategy3_validation.md
Output: itell_input_strategy3_validation.json

Calling LLM with structured output...
✗ Error: Unknown field for Schema: $defs


SUMMARY
Strategy                  Status                         Pages    Chunks   Images  
--------------------------------------------------------------------------------
strategy1_cot             ✗ Error: Unknown field for Schema: $defs... N/A      N/A      N/A     
strategy2_fewshot         ✗ Error: Unknown field for Schema: $defs... N/A      N/A      

Traceback (most recent call last):
  File "/var/folders/1w/f5btrd5x25g23jklshg2g2br0000gn/T/ipykernel_94137/2636169882.py", line 22, in <module>
    result = client.generate_itell_structured(
        pdf_filename=PDF_PATH.name,
    ...<2 lines>...
        response_format=NewVolume
    )
  File "/Users/kalidindiadithya/Documents/itell-volume-generation/src/pipeline/gemini_client.py", line 248, in generate_itell_structured
    response = self.model.generate_content(
        [pdf_part, prompt],
    ...<3 lines>...
        },
    )
  File "/Users/kalidindiadithya/Documents/itell-volume-generation/venv/lib/python3.13/site-packages/google/generativeai/generative_models.py", line 305, in generate_content
    request = self._prepare_request(
        contents=contents,
    ...<3 lines>...
        tool_config=tool_config,
    )
  File "/Users/kalidindiadithya/Documents/itell-volume-generation/venv/lib/python3.13/site-packages/google/generativeai/generative_models.py", line 156, in _prepare_reque