# Generate Caption

In [None]:
import os
import gc
import json
import re
from pdf2image import convert_from_path
from PIL import Image
# import pytesseract  # uncomment if you need OCR
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
from PyPDF2 import PdfReader
import torch

# --- Configuration ---
FILE = "AI.pdf"                 # Path to the source PDF; its page count drives the main loop
OUTPUT_DIR = "pdf_pages"        # Directory holding the page_NNN.png images and the caption JSON files
MODEL_ID = "microsoft/Phi-4-multimodal-instruct"  # Hugging Face model id used for the processor and the model

# Instruction text sent together with every slide image. It asks the model for
# a single JSON object with the fields title / summary / definitions /
# keywords / formulas and nothing else.
# NOTE(review): the prompt says the input contains "the OCR" of the slide, but
# the pytesseract import is commented out and caption_with_phi4 only passes
# this text plus the image -- confirm whether OCR text was meant to be added.
SYSTEM_PROMPT = """
You are an AI lecture slide analyzer. The following input is an image and the OCR of a lecture slide about “Artificial Intelligence.”
1. Extract every piece of written content: slide title, sub-bullets and their full text, definitions, formulas, and any inline examples.
2. Generate a summary of at least 300 words, as thorough and precise as possible, for retrieval and generation.
3. Select at least 5 of the most relevant keywords from the slide.
4. If the slide shows a plot, include a description of the plot.
5. Organize your output as a valid JSON object with these fields:
   {
     "title": string,
     "summary": string,
     "definitions": { term: definition },
     "keywords": [ string ],
     "formulas": [ string ]
   }
6. Output ONLY the JSON object as a string, with no extra text or formatting.
"""

# --- Model Setup ---
# Start from the checkpoint's generation settings and set the new-token budget to 1024.
generation_config = GenerationConfig.from_pretrained(MODEL_ID)
generation_config.max_new_tokens = 1024

# Processor that converts (image, text) pairs into model inputs.
processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)

# Load the model on the GPU with FlashAttention-2, then move it to "cuda" explicitly.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    _attn_implementation="flash_attention_2",
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)
model = model.to("cuda")

def caption_with_phi4(img: Image.Image, system: str, max_new_tokens: int = 1024) -> str:
    """Generate a caption for one slide image with the loaded multimodal model.

    Args:
        img: Slide image handed to the processor.
        system: Instruction text placed in the system turn, directly before
            the ``<|image_1|>`` placeholder.
        max_new_tokens: Upper bound on the number of *generated* tokens. The
            prompt (instruction text plus image tokens) does not count
            against it.

    Returns:
        The decoded text of the generated continuation only; the echoed
        prompt is stripped so downstream JSON extraction never sees the JSON
        template that is part of the instruction text.
    """
    # NOTE(review): the turn markers below (<|im_start|>, <|im_sep|>, <|im_end|>)
    # should be confirmed against the chat format documented for MODEL_ID.
    prompt = (
        "<|im_start|>system<|im_sep|>"
        + system.strip()
        + "<|image_1|><|im_end|>"
        + "<|im_start|>assistant<|im_sep|>"
    )
    inputs = processor(images=img, text=prompt, return_tensors="pt").to(model.device)
    # Remember where the prompt ends so only new tokens are decoded.
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # max_length would also count the (long) image prompt and could
            # leave no room for the answer; bound the new tokens instead.
            max_new_tokens=max_new_tokens,
            do_sample=False,          # deterministic decoding for consistent captions
            num_beams=5,              # beam search for higher-quality output
            no_repeat_ngram_size=3,
        )
    return processor.decode(outputs[0, prompt_len:], skip_special_tokens=True)

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _extract_json_payload(raw: str) -> "str | None":
    """Return the JSON object text embedded in *raw*, or None if none is found.

    A fenced ```json ... ``` block is preferred; otherwise the span from the
    first '{' to the last '}' is used.
    """
    fenced = re.search(r"```json\s*(\{[\s\S]*?\})\s*```", raw)
    if fenced:
        return fenced.group(1)
    start = raw.find("{")
    end = raw.rfind("}")
    # find()/rfind() return -1 when the brace is absent, so test the raw
    # indices (before adding 1) and require the braces to be in order.
    if start == -1 or end < start:
        return None
    return raw[start:end + 1]


failed = []
# --- Main Loop: process each page ---
reader = PdfReader(FILE)
for page_num in range(1, len(reader.pages) + 1):
    base = f"page_{page_num:03d}"
    img_path = os.path.join(OUTPUT_DIR, base + ".png")

    # 1. Make sure the page image exists; render it from the PDF if it does not
    if not os.path.exists(img_path):
        rendered = convert_from_path(FILE, first_page=page_num, last_page=page_num)
        if not rendered:
            print(f"❌ page_{page_num:03d}: could not render page image")
            failed.append(page_num)
            continue
        rendered[0].save(img_path)
        del rendered

    # 2. Generate raw caption; the context manager closes the image file
    with Image.open(img_path) as img:
        raw_caption = caption_with_phi4(img, SYSTEM_PROMPT)
    gc.collect()

    # Save the raw model output (may not be valid JSON) for later inspection
    with open(os.path.join(OUTPUT_DIR, base + "_caption.json"), "w", encoding="utf-8") as f:
        f.write(raw_caption)

    # 3. Extract JSON payload
    json_str = _extract_json_payload(raw_caption)
    if json_str is None:
        print(f"❌ page_{page_num:03d}: could not locate JSON block")
        failed.append(page_num)  # record it so the page can be retried
        continue

    # 4. Parse and save
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"❌ page_{page_num:03d} JSON decode error: {e}")
        failed.append(page_num)
        continue

    # Save the parsed ("stripped") JSON. The page image is already on disk at
    # img_path, so it is not written again here.
    with open(os.path.join(OUTPUT_DIR, base + "_caption_strip.json"), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"✅  page_{page_num:03d} processed")

# Save the numbers of all pages that failed, one per line
with open(os.path.join(OUTPUT_DIR, "failed.txt"), "w", encoding="utf-8") as f:
    f.writelines(f"{page_num}\n" for page_num in failed)

# Save into chromadb

In [1]:
# File: build_chromadb.py
import os
import json
import re
from langchain.schema import Document
# Changed the import below from langchain_chroma to langchain.vectorstores
from langchain.vectorstores import Chroma
# The embeddings import likewise uses the langchain.embeddings path
from langchain.embeddings import HuggingFaceEmbeddings

# Configuration
OUTPUT_DIR = "pdf_pages"            # Directory scanned for *_caption_strip.json files
DB_PATH = "./chroma_db_task2"       # persist_directory handed to Chroma
EMBEDDINGS = "all-MiniLM-L6-v2"     # model_name handed to HuggingFaceEmbeddings

# A backslash, optionally followed by the rest of a *valid* JSON escape
# (one of " \ / b f n r t, or u plus exactly four hex digits).
_JSON_ESCAPE = re.compile(r'\\(["\\/bfnrt]|u[0-9A-Fa-f]{4})?')


def sanitize_json(raw: str) -> str:
    """Escape stray backslashes in *raw* so that it can be parsed as JSON.

    Valid escape sequences (including an already escaped backslash ``\\\\``
    and well-formed ``\\uXXXX``) are left untouched. Any other backslash,
    e.g. the one in ``\\alpha`` or in a truncated ``\\u12``, is doubled so
    that it becomes a literal backslash.

    The text is scanned in a single pass and each valid escape is consumed as
    a unit; this keeps the second backslash of ``\\\\`` from being mistaken
    for the start of a new (invalid) escape.

    Args:
        raw: JSON text that may contain invalid backslash escapes.

    Returns:
        The text with every invalid backslash doubled.
    """
    def _fix(match):
        # group(1) is set only when a valid escape followed the backslash.
        return match.group(0) if match.group(1) is not None else "\\\\"

    return _JSON_ESCAPE.sub(_fix, raw)

# Collect one Document per parsed caption file, plus a stable id per page
docs = []
doc_ids = []
for fname in sorted(os.listdir(OUTPUT_DIR)):
    if not fname.endswith("_caption_strip.json"):
        continue
    print("loading", fname)
    # File names look like page_007_caption_strip.json -> page number 7
    page = int(fname.split("_")[1])
    path = os.path.join(OUTPUT_DIR, fname)

    with open(path, encoding="utf-8") as f:
        raw = f.read()
    # Parse the file as-is first; only fall back to backslash repair when the
    # text is not valid JSON, so already-valid files are never rewritten.
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        data = json.loads(sanitize_json(raw))

    text = f"Page {page}\nTitle: {data.get('title','')}\n\nSummary:\n{data.get('summary','')}"
    docs.append(Document(page_content=text, metadata={"page": page}))
    doc_ids.append(f"page_{page:03d}")

# Build the vector store. Explicit per-page ids mean that re-running this cell
# against an existing DB_PATH replaces the page entries instead of appending
# duplicates under fresh random ids.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=doc_ids,
    persist_directory=DB_PATH
)
# Flush the collection to DB_PATH.
# NOTE(review): the recorded cell output shows a warning pointing at this
# call -- confirm whether persist() is still required by the installed version.
vectordb.persist()

print(f"Persisted {len(docs)} documents into {DB_PATH}")


loading page_001_caption_strip.json
loading page_002_caption_strip.json
loading page_003_caption_strip.json
loading page_004_caption_strip.json
loading page_005_caption_strip.json
loading page_006_caption_strip.json
loading page_007_caption_strip.json
loading page_008_caption_strip.json
loading page_009_caption_strip.json
loading page_010_caption_strip.json
loading page_011_caption_strip.json
loading page_012_caption_strip.json
loading page_013_caption_strip.json
loading page_014_caption_strip.json
loading page_015_caption_strip.json
loading page_016_caption_strip.json
loading page_017_caption_strip.json
loading page_018_caption_strip.json
loading page_019_caption_strip.json
loading page_020_caption_strip.json
loading page_021_caption_strip.json
loading page_022_caption_strip.json
loading page_023_caption_strip.json
loading page_024_caption_strip.json
loading page_025_caption_strip.json
loading page_026_caption_strip.json
loading page_027_caption_strip.json
loading page_028_caption_str

  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
  from .autonotebook import tqdm as notebook_tqdm


Persisted 462 documents into ./chroma_db_task2


  vectordb.persist()


# Run queries

In [6]:
# File: run_queries.py
import os
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Configuration
DB_PATH        = "./chroma_db_task2"
QUERIES_CSV    = "queries.csv"
SUBMISSION_CSV = "submission.csv"
MODEL_ID       = "microsoft/Phi-4-multimodal-instruct"
EMBEDDINGS     = "all-MiniLM-L6-v2"

# 1. Re-open the persisted Chroma store and expose it as a top-20 retriever
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
vectordb = Chroma(embedding_function=embeddings, persist_directory=DB_PATH)
retriever = vectordb.as_retriever(search_kwargs=dict(k=20))

# 2. Tokenizer and causal LM, both loaded from MODEL_ID
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)

# 3. Text-generation pipeline that returns only the newly generated text
hf_pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    return_full_text=False,
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
)

# 4. Wrapper class to inject input_mode=0 while preserving .task
class PipelineWithInputMode:
    """Callable wrapper that adds an ``input_mode`` keyword to every pipeline call.

    Attributes:
        pipeline: The wrapped callable pipeline.
        task: Copied from ``pipe.task`` so the wrapper exposes ``.task`` too.
        input_mode: Default ``input_mode`` value forwarded on each call.
    """

    def __init__(self, pipe, input_mode=0):
        self.pipeline = pipe
        self.task = pipe.task  # ensure .task exists
        self.input_mode = input_mode

    def __call__(self, texts, **kwargs):
        """Call the wrapped pipeline with ``input_mode`` filled in.

        An ``input_mode`` passed explicitly by the caller takes precedence
        over the stored default; forwarding both would raise a
        duplicate-keyword ``TypeError``.
        """
        kwargs.setdefault("input_mode", self.input_mode)
        return self.pipeline(texts, **kwargs)

# 5. Wrap the HF pipeline and hand it to LangChain as the LLM
wrapped = PipelineWithInputMode(hf_pipe, input_mode=0)
llm = HuggingFacePipeline(pipeline=wrapped)

# 6. "stuff"-type RetrievalQA chain that also returns the retrieved documents
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
)

# 7. Run every query; the recorded answer is the page of the first source document
df = pd.read_csv(QUERIES_CSV)
results = []
for _, row in df.iterrows():
    qid, question = row['ID'], row['Question']
    print(f"Processing {qid}")
    output = qa({"query": question})
    docs = output["source_documents"]
    if docs:
        page = docs[0].metadata.get("page")
    else:
        page = None
    results.append(dict(ID=qid, Answer=page))

# 8. Write the submission file
pd.DataFrame(results).to_csv(SUBMISSION_CSV, encoding='utf-8-sig', index=False)
print(f"Wrote submission to {SUBMISSION_CSV}")


  lambda i: encoder_checkpoint_wrapper(
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]
Device set to use cuda


Processing 0
Processing 1
Processing 2
Processing 3
Processing 4
Processing 5
Processing 6
Processing 7
Processing 8
Processing 9
Processing 10
Processing 11
Processing 12
Processing 13
Processing 14
Processing 15
Processing 16
Processing 17
Processing 18
Processing 19
Processing 20
Processing 21
Processing 22
Processing 23
Processing 24
Processing 25
Processing 26
Processing 27
Processing 28
Processing 29
Processing 30
Processing 31
Processing 32
Processing 33
Processing 34
Processing 35
Processing 36
Processing 37
Processing 38
Processing 39
Processing 40
Processing 41
Processing 42
Processing 43
Processing 44
Processing 45
Processing 46
Processing 47
Processing 48
Processing 49
Processing 50
Processing 51
Processing 52
Processing 53
Processing 54
Processing 55
Processing 56
Processing 57
Processing 58
Processing 59
Processing 60
Processing 61
Processing 62
Processing 63
Processing 64
Processing 65
Processing 66
Processing 67
Processing 68
Processing 69
Processing 70
Processing 71
Pr

In [7]:
# Assumes `output = qa({"query": question})` has already been run
docs = output["source_documents"]

print("=== INPUT ===")
print(question)

print("\n=== ALL SOURCE DOCUMENTS ===")
for rank, source in enumerate(docs, start=1):
    # First 200 characters of the document, flattened onto one line
    snippet = source.page_content[:200].replace("\n", " ")
    print(f"\n--- Source Document #{rank} ---")
    print("Page metadata:", source.metadata.get("page"))
    print("Content snippet:")
    print(snippet, "...\n")

print("=== OUTPUT ===")
print(output["result"])


=== INPUT ===
On which page can you find information about the approach where reasoning involves answer inference conditioned on a rationale, contrasting it with direct prediction methods, and noting a performance drop exceeding 12%?

=== ALL SOURCE DOCUMENTS ===

--- Source Document #1 ---
Page metadata: 223
Content snippet:
Page 223 Title: Multimodal CoT Reasoning  Summary: The slide discusses the impact of rationales on answer prediction in multimodal CoT reasoning. It provides an example where the No-CoT method predict ...


--- Source Document #2 ---
Page metadata: 193
Content snippet:
Page 193 Title: This Lecture – Agenda  Summary: The lecture agenda covers the basics of reasoning, including Chain-of-Thought (CoT) with zero-shot and few-shot reasoners, Analogical Reasoning, Multimo ...


--- Source Document #3 ---
Page metadata: 212
Content snippet:
Page 212 Title: This Lecture – Agenda  Summary: The lecture agenda includes topics on reasoning basics, chain-of-thought, zero-shot 