# iTELL Volume Generation Pipeline - Testing Notebook

This notebook provides an interactive interface to test the PDF to iTELL JSON conversion pipeline.

In [1]:
import sys
import os
import json
from pathlib import Path
from dotenv import load_dotenv

# Since notebooks/ and pipeline/ are siblings in src/, we can import directly
# But we need to add parent (src) to path first
parent_dir = Path.cwd().parent
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

from pipeline.gemini_client import OpenAIClient
from pipeline.extract_images import ExtractImages
from pipeline.utils import (
    build_conversion_prompt,
    encode_pdf_to_base64,
    format_image_metadata,
    load_guide_instructions,
    load_reference_json,
    select_reference_example,
)

# Load environment variables from project root
project_root = parent_dir.parent
load_dotenv(project_root / '.env')

print("✓ Imports successful")
print(f"✓ Working directory: {Path.cwd()}")
print(f"✓ Project root: {project_root}")

✓ Imports successful
✓ Working directory: /Users/kalidindiadithya/Documents/itell-volume-generation/src/notebooks
✓ Project root: /Users/kalidindiadithya/Documents/itell-volume-generation


## Configuration

In [2]:
# Configure paths (relative to project root)
PDF_PATH = project_root / "src/resources/input.pdf"
GUIDE_PATH = project_root / "src/resources/guide.md"
REFERENCE_JSON_PATH = project_root / "src/resources/reference.json"
IMAGE_DIR = project_root / "extractedimages"
OUTPUT_PATH = project_root / "results/itell_notebook.json"

# Model configuration
MODEL = os.getenv("OPENROUTER_MODEL") or os.getenv("OPENAI_MODEL") or "google/gemini-2.5-flash"
MAX_TOKENS = 4000

print(f"PDF: {PDF_PATH}")
print(f"Guide: {GUIDE_PATH}")
print(f"Reference: {REFERENCE_JSON_PATH}")
print(f"Model: {MODEL}")

PDF: /Users/kalidindiadithya/Documents/itell-volume-generation/src/resources/input.pdf
Guide: /Users/kalidindiadithya/Documents/itell-volume-generation/src/resources/guide.md
Reference: /Users/kalidindiadithya/Documents/itell-volume-generation/src/resources/reference.json
Model: google/gemini-2.5-flash


## Step 1: Load Reference Materials

In [3]:
# Load reference JSON and guide instructions
reference_json = load_reference_json(REFERENCE_JSON_PATH)
guide_text = load_guide_instructions(GUIDE_PATH)
example_json = select_reference_example(reference_json, example_title=None)

print(f"✓ Loaded guide ({len(guide_text)} characters)")
print(f"✓ Loaded reference JSON")

✓ Loaded guide (19241 characters)
✓ Loaded reference JSON


## Step 2: Extract Images from PDF

In [4]:
# Extract images and metadata
extractor = ExtractImages(str(PDF_PATH), str(IMAGE_DIR))
image_metadata = extractor.extract_img(str(PDF_PATH))
extractor.save_metadata(str(IMAGE_DIR / "metadata.json"))

print(f"✓ Extracted {len(image_metadata)} images")
for img in image_metadata:
    print(f"  - {img['filename']} (page {img['page_num']}, {img['position']})")

Page 1: 1 image(s)
Page 2: 1 image(s)
Page 3: 1 image(s)
✓ Extracted 3 images
  - page_1_img_1.jpeg (page 1, top-left)
  - page_2_img_1.jpeg (page 2, middle-center)
  - page_3_img_1.png (page 3, middle-center)


In [5]:
# Format image metadata for prompt
image_metadata_text = format_image_metadata(image_metadata)

print("Image metadata for LLM:")
print("=" * 50)
print(image_metadata_text[:500] + "..." if len(image_metadata_text) > 500 else image_metadata_text)

Image metadata for LLM:
Image Reference Guide:

- page_1_img_1.jpeg (id: image_page_1_1): top-left of page 1
  Caption: "CHAPTER 2
THELEGISLATIVEBRANCH(CONGRESS)"
  Context (below: "CHAPTER 2
THELEGISLATIVEBRANCH(CONGRESS)...")

- page_2_img_1.jpeg (id: image_page_2_1): middle-center of page 2, small inline image

- page_3_img_1.png (id: image_page_3_1): middle-center of page 3, small inline image
  Caption: "The Oval Ofﬁce of the White House. Photo by Cecil Stoughton.
Courtesy of the John F. Kennedy Presidential Libra...


## Step 3: Build Prompt

In [6]:
# Build the conversion prompt
prompt = build_conversion_prompt(guide_text, example_json, image_metadata_text=image_metadata_text)

print(f"✓ Built prompt ({len(prompt)} characters)")
print("\nPrompt preview:")
print("=" * 50)
print(prompt[:800] + "...")

✓ Built prompt (24466 characters)

Prompt preview:
Your Role: iTELL Content Authoring Expert
        You are a specialized AI assistant expert in the iTELL framework. Your primary function is to convert source documents into perfectly structured iTELL JSON files.
    Source Files for This Task:

    INSTRUCTIONS:
    ﻿# iTELL Content Authoring Guide

## What is iTELL?

Intelligent Texts for Enhanced Language Learning (iTELL) is a computational framework designed to enable content managers to transform any machine-readable text into interactive, intelligent content within a web app. iTELL leverages theories of reading comprehension to provide opportunities for users to generate knowledge about the materials they engage with through constructed responses and summary writing. These responses and summaries are automatically scored by large lan...


## Step 4: Encode PDF

In [7]:
# Encode PDF to base64
pdf_b64 = encode_pdf_to_base64(PDF_PATH)

print(f"✓ Encoded PDF ({len(pdf_b64)} base64 characters)")

✓ Encoded PDF (1732788 base64 characters)


## Step 5: Initialize API Client

In [8]:
# Set up API client
openai_key = os.getenv("OPENAI_API_KEY")
openrouter_key = os.getenv("OPENROUTER_API_KEY")

api_key = openai_key or openrouter_key
if not api_key:
    raise RuntimeError("Set OPENAI_API_KEY or OPENROUTER_API_KEY in .env file")

# Determine if using OpenRouter
using_openrouter = bool(openrouter_key and not openai_key)

if using_openrouter:
    base_url = os.getenv("OPENROUTER_BASE_URL") or "https://openrouter.ai/api/v1"
    default_headers = {}
    if referer := os.getenv("OPENROUTER_SITE_URL"):
        default_headers["HTTP-Referer"] = referer
    if app_name := os.getenv("OPENROUTER_APP_NAME"):
        default_headers["X-Title"] = app_name
else:
    base_url = os.getenv("OPENAI_BASE_URL") or None
    default_headers = None

client = OpenAIClient(
    model=MODEL,
    api_key=api_key,
    base_url=base_url,
    max_completion_tokens=MAX_TOKENS,
    default_headers=default_headers,
)

print(f"✓ Initialized {'OpenRouter' if using_openrouter else 'OpenAI'} client")
print(f"  Model: {MODEL}")
print(f"  Base URL: {base_url or 'default'}")

✓ Initialized OpenRouter client
  Model: google/gemini-2.5-flash
  Base URL: https://openrouter.ai/api/v1


## Step 6: Generate iTELL JSON

In [9]:
# Call the LLM to generate iTELL JSON
print("Calling LLM... (this may take a minute)")

result_json = client.generate_itell_json(
    pdf_filename=PDF_PATH.name,
    pdf_base64=pdf_b64,
    prompt=prompt
)

print("✓ Generated iTELL JSON")

Calling LLM... (this may take a minute)
✓ Generated iTELL JSON


## Step 7: Display and Save Results

In [10]:
# Parse and display the result
try:
    result_dict = json.loads(result_json)
    print("✓ Valid JSON generated\n")
    print("Preview:")
    print("=" * 50)
    print(json.dumps(result_dict, indent=2)[:1000] + "...")
except json.JSONDecodeError as e:
    print(f"⚠ JSON parsing error: {e}")
    print("\nRaw output:")
    print(result_json[:500])

⚠ JSON parsing error: Expecting value: line 1 column 1 (char 0)

Raw output:
```json
{
  "Title": "The Legislative Branch (Congress)",
  "Description": "This volume introduces learners to the Legislative Branch of the U.S. government, detailing the structure and functions of the House of Representatives and the Senate. It also explains the legislative process for making federal laws, ensuring a comprehensive understanding of how laws are created.",
  "VolumeSummary": "This volume provides a comprehensive overview of the Legislative Branch, also known as the U.S. Congress


In [11]:
# Check for image placeholders
import re

placeholders = re.findall(r'\{\{image_page_\d+_\d+\}\}', result_json)
print(f"\n✓ Found {len(placeholders)} image placeholders:")
for placeholder in set(placeholders):
    print(f"  - {placeholder}")


✓ Found 3 image placeholders:
  - {{image_page_3_1}}
  - {{image_page_2_1}}
  - {{image_page_1_1}}


In [None]:
# Save to file
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.write_text(result_json, encoding="utf-8")

print(f"\n✓ Saved to {OUTPUT_PATH}")

## Optional: Inspect Specific Sections

In [None]:
# View full JSON with pretty printing
result_dict = json.loads(result_json)
print(json.dumps(result_dict, indent=2))