In [15]:
import os, gc, json
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
import torch
from PyPDF2 import PdfReader

# --- your constants & model setup ---
FILE = "AI.pdf"
output_dir = "pdf_pages"
# 你的 Slide 分析 prompt，當成 system 角色
SYSTEM_PROMPT = """
You are an AI lecture slide analyzer. The following input is an image of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ string, … ]
       }, …
     ]
   }
4. Finally, produce a brief “summary” string of the slide's core message.
Be as thorough and precise as possible—this will be used for later retrieval and generation.
5. When you reply, output *only* the JSON object—no extra words
"""
model_id = "microsoft/Phi-4-multimodal-instruct"

generation_config = GenerationConfig.from_pretrained(model_id)
generation_config.max_new_tokens = 1024
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, device_map="cuda", torch_dtype="auto"
).to("cuda")

def ocr_image(img: Image.Image) -> str:
    return pytesseract.image_to_string(img, lang="chi_tra+eng")

def caption_with_phi4(img: Image.Image, system: str, user: str) -> str:
    full_prompt = (
        "<|im_start|>system<|im_sep|>" + system.strip() + "<|im_end|>"
        "<|im_start|>user<|im_sep|>" + user.strip() + "<|image_1|><|im_end|>"
        "<|im_start|>assistant<|im_sep|>"
    )
    inputs = processor(images=img, text=full_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, generation_config=generation_config)
    return processor.decode(out[0], skip_special_tokens=True)

reader = PdfReader(FILE)
sep = "<|im_sep|>"
print(len(failed_pages))
print(failed_pages)

for page_num, reason in failed_pages:
    print(f"🔄 Reprocessing page {page_num:03d} (prev failure: {reason})")
    # 1. render page → image
    images = convert_from_path(FILE, dpi=200,
                               first_page=page_num, last_page=page_num,
                               use_pdftocairo=True)
    img = images[0]

    # 2. OCR
    ocr_text = ocr_image(img)

    # 3. regenerate caption
    caption = caption_with_phi4(img, SYSTEM_PROMPT, ocr_text)

    # 4. write raw caption
    base = f"page_{page_num:03d}"
    with open(os.path.join(output_dir, base + "_caption.json"), "w", encoding="utf-8") as f:
        f.write(caption)

    # 5. strip out JSON
    idx = caption.rfind(sep)
    if idx == -1:
        print(f"⚠️ Still no separator on page {page_num}")
    else:
        json_str = caption[idx + len(sep):].strip()
        try:
            data = json.loads(json_str)
            with open(os.path.join(output_dir, base + "_caption_strip.json"), "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"✅  page_{page_num:03d}_caption_strip.json written")
        except json.JSONDecodeError as e:
            print(f"❌  JSON still invalid on page {page_num}: {e}")

    # cleanup
    del img, images
    gc.collect()


  lambda i: encoder_checkpoint_wrapper(
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.68s/it]


276
[(3, 'json decode error: Extra data: line 1 column 700 (char 699)'), (5, 'json decode error: Expecting value: line 1 column 1 (char 0)'), (6, 'json decode error: Unterminated string starting at: line 1 column 2323 (char 2322)'), (10, 'json decode error: Extra data: line 1 column 1318 (char 1317)'), (13, 'json decode error: Extra data: line 1 column 745 (char 744)'), (14, 'json decode error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)'), (15, 'json decode error: Unterminated string starting at: line 1 column 2693 (char 2692)'), (18, 'json decode error: Extra data: line 1 column 1120 (char 1119)'), (20, 'json decode error: Expecting property name enclosed in double quotes: line 1 column 174 (char 173)'), (21, 'json decode error: Expecting property name enclosed in double quotes: line 1 column 174 (char 173)'), (23, 'json decode error: Invalid control character at: line 1 column 64 (char 63)'), (24, 'json decode error: Extra data: line 1 column 961 (cha

KeyboardInterrupt: 

In [12]:
import os
import json

output_dir = "pdf_pages"
sep = "<|im_sep|>"

failed_pages = []

for page_num in range(1, 464):
    in_path = os.path.join(output_dir, f"page_{page_num:03d}_caption.json")
    out_path = os.path.join(output_dir, f"page_{page_num:03d}_caption_strip.json")

    # read the raw caption
    try:
        with open(in_path, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        failed_pages.append((page_num, "missing file"))
        continue

    # find the last separator
    idx = raw.rfind(sep)
    if idx == -1:
        failed_pages.append((page_num, "no separator"))
        continue

    # extract the JSON payload
    json_str = raw[idx + len(sep):].strip()

    # parse and validate
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        failed_pages.append((page_num, f"json decode error: {e}"))
        continue

    # write the stripped JSON
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# After processing:
print("Failed cases:", failed_pages)


Failed cases: [(3, 'json decode error: Extra data: line 1 column 700 (char 699)'), (5, 'json decode error: Expecting value: line 1 column 1 (char 0)'), (6, 'json decode error: Unterminated string starting at: line 1 column 2323 (char 2322)'), (10, 'json decode error: Extra data: line 1 column 1318 (char 1317)'), (13, 'json decode error: Extra data: line 1 column 745 (char 744)'), (14, 'json decode error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)'), (15, 'json decode error: Unterminated string starting at: line 1 column 2693 (char 2692)'), (18, 'json decode error: Extra data: line 1 column 1120 (char 1119)'), (20, 'json decode error: Expecting property name enclosed in double quotes: line 1 column 174 (char 173)'), (21, 'json decode error: Expecting property name enclosed in double quotes: line 1 column 174 (char 173)'), (23, 'json decode error: Invalid control character at: line 1 column 64 (char 63)'), (24, 'json decode error: Extra data: line 1 colum

In [1]:
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

from pdf2image import convert_from_path
import pytesseract
from langchain.schema import Document
import transformers
print(transformers.__version__)


  from .autonotebook import tqdm as notebook_tqdm


4.48.2


In [None]:
# Define file paths and constants
FILE = "AI.pdf"  # Path to the 463-page PDF
DB_PATH = "./chroma_db_task2"  # Path to store Chroma database
EMBEDDINGS = "all-MiniLM-L6-v2"  # Embedding model
QUERY = "On which page can you find a comparison of two dynamic programming methods for solving Markov Decision Processes (MDPs), focusing on how iterative reward estimation and iterative strategy optimization compute all optimal values while differing in their update processes and policy handling?"
QUERY_ID = "test_001"  # Dummy ID for the test case
output_dir = "pdf_pages"

# 你的 Slide 分析 prompt，當成 system 角色
SYSTEM_PROMPT = """
You are an AI lecture slide analyzer. The following input is an image of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ string, … ]
       }, …
     ]
   }
4. Finally, produce a brief “summary” string of the slide's core message.
Be as thorough and precise as possible—this will be used for later retrieval and generation.
5. When you reply, output *only* the JSON object—no extra words
"""


In [3]:
# import chromadb

# # 用新參數方式，別再用 Settings
# client = chromadb.PersistentClient(
#     path=DB_PATH
# )

# # 取出你原本的 collection
# col = client.get_collection("langchain")

# # 從 offset=0 開始，分批讀所有 documents
# batch_size = 100
# offset = 0
# while True:
#     res = col.get(
#         include=["documents","metadatas"],
#         limit=batch_size,
#         offset=offset
#     )
#     docs = res["documents"]
#     metas = res["metadatas"]
#     if not docs:
#         break

#     for d, m in zip(docs, metas):
#         print(f'page={m["page"]} type={m["type"]} snippet={d[:50]!r}')

#     offset += batch_size


In [None]:
# # —— 先載你原本的 PDF loader & splitter，得到 docs ——  
# loader = PyPDFLoader(FILE)
# pages = loader.load_and_split()  
# splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# docs = splitter.split_documents(pages)

# # save into chroma

Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)
Ignoring wrong pointing object 54 0 (offset 0)
Ignoring wrong pointing object 56 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 71 0 (offset 0)
Ignoring wrong pointing object 73 0 (offset 0)
Ignoring wrong pointing object 124 0 (offset 0)
Ignoring wrong pointing object 129 0 (offset 0)
Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 136 0 (offset 0)
Ignoring wrong pointing object 147 0 (offset 0)
Ignoring 

In [None]:
import os, gc
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PyPDF2 import PdfReader
import torch

# Set generation config
model_id = "microsoft/Phi-4-multimodal-instruct"

generation_config = GenerationConfig.from_pretrained(model_id)
generation_config.max_new_tokens = 512
generation_config.num_logits_to_keep = 100
do_sample=True
temperature=0.1

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="cuda",
    torch_dtype="auto",
    _attn_implementation="flash_attention_2"
).to("cuda")


`use_fast` is set to `True` but the image processor class does not have a fast version.  Falling back to the slow version.
  lambda i: encoder_checkpoint_wrapper(
Loading checkpoint shards: 100%|██████████| 3/3 [00:22<00:00,  7.55s/it]


In [None]:
def ocr_image(img: Image.Image) -> str:
    return pytesseract.image_to_string(img, lang="chi_tra+eng")

def caption_with_phi4(img: Image.Image, system: str, user: str) -> str:
    # Build the exact chat-format prompt for Phi-4
    full_prompt = (
        "<|im_start|>system<|im_sep|>"
        f"{system.strip()}"
        "<|im_end|>"
        "<|im_start|>user<|im_sep|>"
        f"{user.strip()}"
        "<|image_1|>"           # if your processor expects this token for the image
        "<|im_end|>"
        "<|im_start|>assistant<|im_sep|>"
    )

    inputs = processor(images=img, text=full_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            generation_config=generation_config,
            max_new_tokens=generation_config.max_new_tokens,
            num_logits_to_keep=generation_config.num_logits_to_keep,
        )
    # This will include the assistant’s reply up until the next <|im_end|>
    return processor.decode(outputs[0], skip_special_tokens=False)


# 逐頁處理並存檔
reader = PdfReader(FILE)
for page_num in range(1, len(reader.pages) + 1):
    # 1. 轉成影像
    images = convert_from_path(FILE, dpi=200,
                                first_page=page_num, last_page=page_num,
                                use_pdftocairo=True)
    img = images[0]

    # 2. OCR
    ocr_text = ocr_image(img)

    # 3. 呼叫 Phi-4
    caption = caption_with_phi4(img, SYSTEM_PROMPT, ocr_text)
    print(f"[頁 {page_num}] caption:\n{caption}\n")

    # 4. 存檔
    base = f"page_{page_num:03d}"
    img.save(os.path.join(output_dir, base + ".png"))
    with open(os.path.join(output_dir, base + "_caption.json"), "w", encoding="utf-8") as f:
        f.write(caption)
    with open(os.path.join(output_dir, base + "_ocr.txt"), "w", encoding="utf-8") as f:
        f.write(ocr_text)

    # 5. 清理
    del img, images
    gc.collect()




[頁 1] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 2] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 3] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 4] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 5] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 6] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 7] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 8] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



[頁 9] caption:
<|im_start|>system<|im_sep|>:
You are an AI lecture slide analyzer. The following input is an image and the ocr of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ strin



KeyboardInterrupt: 

In [18]:
def ocr_image(img: Image.Image) -> str:
    return pytesseract.image_to_string(img, lang="chi_tra+eng")

def caption_with_phi4(img: Image.Image, system: str, user: str) -> str:
    # 把 system 和 user 角色串在一起，並加上 image token
    full_prompt = (
        "<|image_1|>\n"
        "### System:\n" + system.strip() + "\n\n"
        "### User:\n" + user.strip() + "\n\n"
        "### Assistant:"
    )
    inputs = processor(images=img, text=full_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            generation_config=generation_config,
            max_new_tokens=generation_config.max_new_tokens,
            num_logits_to_keep=generation_config.num_logits_to_keep,
        )
    return processor.decode(outputs[0], skip_special_tokens=True)

# 逐頁處理並存檔
reader = PdfReader(FILE)
for page_num in range(1, len(reader.pages) + 1):
    # 1. 轉成影像
    images = convert_from_path(FILE, dpi=200,
                               first_page=page_num, last_page=page_num,
                               use_pdftocairo=True)
    img = images[0]

    # 2. OCR
    ocr_text = ocr_image(img)

    # 3. 呼叫 Phi-4
    caption = caption_with_phi4(img, SYSTEM_PROMPT, ocr_text)
    print(f"[頁 {page_num}] caption:\n{caption}\n")

    # 4. 存檔
    base = f"page_{page_num:03d}"
    img.save(os.path.join(output_dir, base + ".png"))
    with open(os.path.join(output_dir, base + "_caption.json"), "w", encoding="utf-8") as f:
        f.write(caption)
    with open(os.path.join(output_dir, base + "_ocr.txt"), "w", encoding="utf-8") as f:
        f.write(ocr_text)

    # 5. 清理
    del img, images
    gc.collect()


  lambda i: encoder_checkpoint_wrapper(
Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  4.34s/it]


KeyboardInterrupt: 

In [5]:
def caption_with_phi4(img: Image.Image, prompt: str) -> str:
    # print(prompt)
    inputs = processor(images=img, text=prompt, return_tensors="pt").to("cuda:0")
    print(generation_config.num_logits_to_keep)
    # Generate caption
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            num_logits_to_keep=generation_config.num_logits_to_keep,
            max_new_tokens=generation_config.max_new_tokens,
            generation_config=generation_config
        )
        
    return processor.decode(outputs[0], skip_special_tokens=True)

def ocr_image(img: Image.Image) -> str:
    return pytesseract.image_to_string(img, lang="chi_tra+eng")

# ——— 逐頁處理並存檔 ———
output_dir = "pdf_pages"
os.makedirs(output_dir, exist_ok=True)

reader = PdfReader(FILE)
total_pages = len(reader.pages)

for page_num in range(1, total_pages + 1):
    # 1. 轉成影像
    images = convert_from_path(
        FILE,
        dpi=200,
        first_page=page_num,
        last_page=page_num,
        use_pdftocairo=True
    )
    img = images[0]

    # 2. 分析 prompt：加上頁碼提示
    prompt = f" <|image_1|> \nSlide page {page_num}:\n" + PROMPT_TEMPLATE

    # 3. Phi-4 Multimodal 產生 JSON 結構化描述
    cap = caption_with_phi4(img, prompt)
    print(f"[頁 {page_num}] caption: {cap}")
    # 4. OCR 文字擷取
    txt = ocr_image(img)

    # 5. 存檔
    img_path = os.path.join(output_dir, f"page_{page_num:03d}.png")
    cap_path = os.path.join(output_dir, f"page_{page_num:03d}_caption.json")
    ocr_path = os.path.join(output_dir, f"page_{page_num:03d}_ocr.txt")

    img.save(img_path, format="PNG")
    with open(cap_path, "w", encoding="utf-8") as f:
        f.write(cap)  # Phi-4 回傳的 JSON-like text
    with open(ocr_path, "w", encoding="utf-8") as f:
        f.write(txt)

    print(f"[頁 {page_num}] image→{img_path}, caption→{cap_path}, ocr→{ocr_path}")

    # 6. 釋放記憶體
    del img, images
    gc.collect()


100




[頁 1] caption:   
Slide page 1:

You are an AI lecture slide analyzer. The following input is an image of a lecture slide about “Artificial Intelligence.” 
1. Extract every piece of written content:
   • Slide title
   • Section or bullet headings
   • Sub-bullets and their full text
   • Definitions, formulas, and any inline examples
2. Describe every visual element:
   • Diagrams or charts (list each shape/box, arrow, label, and the relationship they depict)
   • Icons or figures (what they represent and any attached caption)
3. Organize your output as a JSON object with these fields:
   {
     "title": string,
     "bullets": [ { "level": int, "text": string }, … ],
     "definitions": { term: definition, … },
     "formulas": [ string, … ],
     "examples": [ string, … ],
     "diagrams": [
       {
         "type": string,
         "elements": [
           { "shape": string, "label": string, "notes": string }, …
         ],
         "relationships": [ string, … ]
       }, …
     



KeyboardInterrupt: 

In [None]:

# Step 3: Load LLM and configure tokenizer
print("Loading Phi-2 model...")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

# Fix pad_token_id warning
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
    tokenizer.pad_token_id = tokenizer.eos_token_id  # e.g., 50256
model.config.pad_token_id = tokenizer.pad_token_id

# Step 4: Process the test query
print(f"Processing query: {QUERY}")
# Retrieve top-5 chunks
retrieved_docs = retriever.invoke(QUERY)
retrieved_texts = [doc.page_content for doc in retrieved_docs]
retrieved_pages = [doc.metadata["page"] for doc in retrieved_docs]

# Create prompt
prompt = f"""
Query: {QUERY}
Retrieved Documents:
{chr(10).join([f"Page {p}: {t}" for p, t in zip(retrieved_pages, retrieved_texts)])}
Instructions: Select the page number (1-463) that directly addresses the query, focusing on technical definitions or metrics. Output only the page number.
"""

# Generate response
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024, padding=True).to("cuda" if torch.cuda.is_available() else "cpu")
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)
page_number = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# Validate output
try:
    page_number = int(page_number)
    if not 1 <= page_number <= 463:
        raise ValueError
except ValueError:
    print("Invalid page number, falling back to top-ranked chunk")
    page_number = retrieved_pages[0]

# Step 5: Output result
print(f"Query ID: {QUERY_ID}")
print(f"Predicted Page Number: {page_number}")

# Step 6: Save result to CSV (for test case)
result = [{"id": QUERY_ID, "page": page_number}]
submission = pd.DataFrame(result)
submission.to_csv("test_submission.csv", index=False)
print("Result saved to test_submission.csv")

In [None]:
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define file paths and constants
FILE = "AI.pdf"  # Path to the 463-page PDF
QUERY_FILE = "HW2_query.csv"  # Path to query CSV
DB_PATH = "./chroma_db"  # Path to store Chroma database
EMBEDDINGS = "all-MiniLM-L6-v2"  # Embedding model
OUTPUT_FILE = "submission.csv"  # Output CSV for Kaggle

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

# Step 1: Preprocess PDF
logger.info("Loading and splitting PDF...")
try:
    loader = PyPDFLoader(FILE)
    pages = loader.load_and_split()
except Exception as e:
    logger.error(f"Failed to load PDF: {e}")
    raise

# Check for empty or problematic pages
for i, page in enumerate(pages):
    if not page.page_content or len(page.page_content.strip()) < 10:
        logger.warning(f"Page {i+1} has empty or minimal content. Check PDF integrity.")

# Split pages into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)

# Verify page metadata
for doc in docs:
    if "page" not in doc.metadata:
        logger.error("Page number metadata missing in a document chunk")
        raise ValueError("Page number metadata missing")

# Step 2: Create vector store
logger.info("Generating embeddings and creating vector store...")
embedding = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
try:
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory=DB_PATH,
        collection_name="langchain"
    )
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
except Exception as e:
    logger.error(f"Failed to create vector store: {e}")
    raise

# Step 3: Load LLM and configure tokenizer
logger.info("Loading Phi-2 model...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
except Exception as e:
    logger.error(f"Failed to load Phi-2 model: {e}")
    raise

# Fix pad_token_id warning
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Verify model device
logger.info(f"Model device: {next(model.parameters()).device}")

# Step 4: Load queries
logger.info(f"Loading queries from {QUERY_FILE}...")
try:
    queries = pd.read_csv(QUERY_FILE)
    query_texts = queries["Question"].tolist()
    query_ids = queries["ID"].tolist()
except Exception as e:
    logger.error(f"Failed to load queries: {e}")
    raise

# Step 5: Process all queries
logger.info("Processing queries...")
results = []
for query_text, query_id in zip(query_texts, query_ids):
    logger.info(f"Processing query ID: {query_id}")

    # Retrieve top-5 chunks
    try:
        retrieved_docs = retriever.invoke(query_text)
    except Exception as e:
        logger.error(f"Retrieval failed for query ID {query_id}: {e}")
        results.append({"id": query_id, "page": 1})  # Fallback page
        continue

    if not retrieved_docs:
        logger.warning(f"No documents retrieved for query ID {query_id}. Using fallback page.")
        results.append({"id": query_id, "page": 1})  # Fallback page
        continue

    retrieved_texts = [doc.page_content for doc in retrieved_docs]
    retrieved_pages = [doc.metadata["page"] for doc in retrieved_docs]
    logger.info(f"Retrieved pages for query ID {query_id}: {retrieved_pages}")

    # Create prompt
    prompt = f"""
Query: {query_text}
Retrieved Documents:
{chr(10).join([f"Page {p}: {t}" for p, t in zip(retrieved_pages, retrieved_texts)])}
Instructions: Select the page number (1-463) that directly addresses the query, focusing on technical definitions or metrics. Output only the page number.
"""

    # Generate response
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024, padding=True).to(device)
    logger.info(f"Inputs device for query ID {query_id}: {inputs['input_ids'].device}")

    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        page_number = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    except Exception as e:
        logger.error(f"Generation failed for query ID {query_id}: {e}")
        page_number = str(retrieved_pages[0])  # Fallback to top-ranked chunk

    # Validate output
    try:
        page_number = int(page_number)
        if not 1 <= page_number <= 463:
            raise ValueError
    except ValueError:
        logger.warning(f"Invalid page number for query ID {query_id}, falling back to top-ranked chunk")
        page_number = retrieved_pages[0]

    results.append({"ID": query_id, "Answer": page_number})

# Step 6: Save results to CSV
logger.info(f"Saving results to {OUTPUT_FILE}...")
submission = pd.DataFrame(results)
submission.to_csv(OUTPUT_FILE, index=False)
print(f"Results saved to {OUTPUT_FILE}")