# Generate Caption

In [None]:
import os
import gc
import json
import re
from pdf2image import convert_from_path
from PIL import Image
# import pytesseract  # uncomment if you need OCR
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
from PyPDF2 import PdfReader
import torch

# --- Configuration ---
# Checkpoint used for the processor, the generation config and the model.
MODEL_ID = "microsoft/Phi-4-multimodal-instruct"
# Source PDF; the main loop opens it only to count its pages.
FILE = "AI.pdf"
# Directory holding the pre-rendered page_NNN.png slide images.
IMG_DIR = "pdf_pages"
# Directory that receives the caption text files and failed.txt.
OUTPUT_DIR = "pdf_pages2"

# Instructions passed as the system turn for every slide.
SYSTEM_PROMPT = """
You are an AI lecture slide analyzer. The following input is an image of a lecture slide about “Artificial Intelligence.”
1. Extract every piece of written content: slide title, sub-bullets and their full text, definitions, formulas, and any inline examples.
2. Generate a summary of at least 300 words, as thorough and precise as possible, for retrieval and generation.
3. Select at least 5 of the most relevant keywords from the slide.
4. If the slide shows a plot, include a description of the plot.
5. If the slide contains a formula, save it in LaTeX format, and describe the formula as thorough as possible.
"""

# --- Model Setup ---
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Generation defaults shipped with the checkpoint, capped at 1024 new tokens.
generation_config = GenerationConfig.from_pretrained(MODEL_ID)
generation_config.max_new_tokens = 1024

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="cuda",
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
)
model = model.to("cuda")

def caption_with_phi4(img: Image.Image, system: str) -> str:
    """Caption one slide image with the globally loaded ``model``/``processor``.

    Args:
        img: Slide image to describe.
        system: System instructions; surrounding whitespace is stripped
            before it is placed in the prompt.

    Returns:
        The decoded first sequence returned by ``model.generate`` with
        special tokens skipped. The sequence is decoded as-is (the prompt
        is not sliced off), so callers isolate the answer by splitting on
        the ``<|im_start|>assistant<|im_sep|>`` marker.
    """
    # Every turn is closed with <|im_end|>, and the image placeholder sits
    # inside the user turn so the question and the slide belong together.
    # NOTE(review): the Phi-4-multimodal-instruct model card documents
    # <|system|>/<|user|>/<|assistant|>/<|end|> markers; the ChatML-style
    # markers are kept here because extract_json splits on them. Confirm
    # which template the checkpoint actually expects.
    prompt = (
        "<|im_start|>system<|im_sep|>"
        + system.strip()
        + "<|im_end|>"
        + "<|im_start|>user<|im_sep|>"
        + "<|image_1|>"
        + "I'm a student learning artificial intelligence, teach me every thing in this slide."
        + "<|im_end|>"
        + "<|im_start|>assistant<|im_sep|>"
    )
    inputs = processor(images=img, text=prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            # Temperature only applies when sampling; the intent here is
            # reproducible output, so decoding is explicit beam search.
            do_sample=False,
            num_beams=2,             # small beam for better quality
            no_repeat_ngram_size=3,  # suppress repeated trigrams
        )
    return processor.decode(outputs[0], skip_special_tokens=True)

def extract_json(raw_caption: str) -> str:
    """Return the assistant's part of a decoded caption.

    If the assistant marker is immediately followed by ``{``, the text from
    that brace up to the last ``}`` in the string is returned. Otherwise
    everything after the last assistant marker is returned (or the whole
    string when no marker is present). The result is always stripped.
    """
    assistant_block = re.search(
        r"<\|im_start\|>assistant<\|im_sep\|>(\{[\s\S]*\})", raw_caption
    )
    if assistant_block is None:
        # No brace-delimited payload right after the marker: keep whatever
        # follows the final marker instead.
        _, _, tail = raw_caption.rpartition("<|im_start|>assistant<|im_sep|>")
        return tail.strip()
    return assistant_block.group(1).strip()

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pages whose extracted caption is not valid JSON; written to failed.txt below.
failed = []

# --- Main Loop: process each page ---
# The PDF is opened only to learn how many pages there are; the slide
# images themselves are read from the page_NNN.png files under IMG_DIR.
reader = PdfReader(FILE)

for page_num in range(1, len(reader.pages) + 1):
    base = f"page_{page_num:03d}"

    # 1. Load the page image. The context manager closes the underlying
    #    file as soon as the caption has been generated.
    with Image.open(os.path.join(IMG_DIR, base + ".png")) as img:
        # 2. Generate raw caption
        raw_caption = caption_with_phi4(img, SYSTEM_PROMPT)
    gc.collect()

    # Keep the unprocessed model output next to the stripped version.
    with open(os.path.join(OUTPUT_DIR, base + "_caption.txt"), "w", encoding="utf-8") as f:
        f.write(raw_caption)

    # 3. Isolate the assistant's answer and always save it, valid JSON or not.
    outputstr = extract_json(raw_caption)
    print(f"outputstr: {outputstr}")
    txt_path = os.path.join(OUTPUT_DIR, base + "_caption_strip.txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(outputstr)

    # 4. Parse (to verify). A failure only flags the page in failed.txt;
    #    the stripped text saved above is left in place.
    try:
        json.loads(outputstr)
    except json.JSONDecodeError as e:
        print(f"❌ page_{page_num:03d} decode error: {e}")
        failed.append(page_num)
    else:
        print(f"✅  page_{page_num:03d} processed, saved to {txt_path}")


# save failed into txt
with open(os.path.join(OUTPUT_DIR, "failed.txt"), "w", encoding="utf-8") as f:
    f.writelines(f"{page_num}\n" for page_num in failed)

# Save into chromadb

In [4]:
# File: build_chromadb.py
import os
import json
import re
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Configuration
OUTPUT_DIR = "pdf_pages2"          # directory with the *_caption_strip.txt files
DB_PATH = "./chroma_db_task2-2"    # where the Chroma store is persisted
EMBEDDINGS = "all-MiniLM-L6-v2"    # embedding model name


docs = []
for fname in sorted(os.listdir(OUTPUT_DIR)):
    if not fname.endswith("_caption_strip.txt"):
        continue
    print("loading", fname)
    # File names look like page_007_caption_strip.txt; field 1 is the page.
    page = int(fname.split("_")[1])
    path = os.path.join(OUTPUT_DIR, fname)

    # Read the stripped caption text
    with open(path, encoding="utf-8") as f:
        data = f.read()

    # Captions that were blanked out carry nothing to retrieve; indexing
    # them would only add page-number-only noise to the store.
    if not data.strip():
        print("skipping empty", fname)
        continue

    # Text stored in the vector DB. The "Page <number>:" prefix is what the
    # query prompt tells the model to look for when naming the best page.
    text = f"Page {page}:\nCaption: {data}"

    docs.append(Document(page_content=text, metadata={"page": page}))

# Build and persist the Chroma vector store
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=DB_PATH
)
# NOTE(review): newer Chroma releases reportedly persist automatically when
# persist_directory is set and flag this call as deprecated - confirm
# against the installed version before removing it.
vectordb.persist()

print(f"Persisted {len(docs)} documents into {DB_PATH}")


loading page_001_caption_strip.txt
loading page_002_caption_strip.txt
loading page_003_caption_strip.txt
loading page_004_caption_strip.txt
loading page_005_caption_strip.txt
loading page_006_caption_strip.txt
loading page_007_caption_strip.txt
loading page_008_caption_strip.txt
loading page_009_caption_strip.txt
loading page_010_caption_strip.txt
loading page_011_caption_strip.txt
loading page_012_caption_strip.txt
loading page_013_caption_strip.txt
loading page_014_caption_strip.txt
loading page_015_caption_strip.txt
loading page_016_caption_strip.txt
loading page_017_caption_strip.txt
loading page_018_caption_strip.txt
loading page_019_caption_strip.txt
loading page_020_caption_strip.txt
loading page_021_caption_strip.txt
loading page_022_caption_strip.txt
loading page_023_caption_strip.txt
loading page_024_caption_strip.txt
loading page_025_caption_strip.txt
loading page_026_caption_strip.txt
loading page_027_caption_strip.txt
loading page_028_caption_strip.txt
loading page_029_cap

  vectordb.persist()


In [None]:
# Empty the stripped caption files of pages 103 through 174 (inclusive);
# each file is truncated (or created empty if it does not exist yet).
for page_no in range(103, 175):
    blank_path = os.path.join(OUTPUT_DIR, f"page_{page_no:03d}_caption_strip.txt")
    with open(blank_path, "w", encoding="utf-8") as fh:
        fh.write("")
# Run queries

In [None]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# 1. Load your CSV as a Dataset
ds = load_dataset("csv", data_files={"queries": "queries.csv"})["queries"]

# 2. Build the retriever over the persisted Chroma store (top-5 hits per
#    query) and the text-generation pipeline on the GPU.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectordb = Chroma(persist_directory="./chroma_db_task2-2", embedding_function=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",
    trust_remote_code=True,
    use_fast=True,
)
# Prompts are generated in batches; a decoder-only model continues from the
# last position, so shorter prompts must be padded on the left.
tokenizer.padding_side = "left"

# torch_dtype="auto" loads the checkpoint in its stored dtype. Without it
# the run log warns that Flash Attention 2.0 is used without a torch dtype.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",
    trust_remote_code=True,
    torch_dtype="auto",
).to("cuda")

# return_full_text=False makes the pipeline return only the continuation.
hf_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device=0,
    return_full_text=False,
)

# 3. Precompute your contexts (summaries) column
def add_context(example):
    """Attach the retrieved page extracts to one query row.

    Args:
        example: A dataset row with a ``"Question"`` string.

    Returns:
        The same row with a ``"context"`` key: the retrieved extracts
        joined by newlines, each labelled with ``Page <number>:``.
    """
    docs = retriever.get_relevant_documents(example["Question"])
    extracts = []
    for d in docs:
        page = d.metadata.get("page")
        label = f"Page {page}:"
        text = d.page_content
        # The generation prompt tells the model that every extract carries a
        # "Page <number>:" prefix. Stores whose documents hold only the bare
        # caption keep the page in metadata, so add the label when missing.
        if page is not None and not text.startswith(label):
            text = f"{label} {text}"
        extracts.append(text)
    example["context"] = "\n".join(extracts)
    return example

# Run add_context over every query row so each one gains a "context" column.
ds = ds.map(add_context)

# 4. Define your batched generation function
# Prompt template filled per query via str.format with {context} and
# {question}. It ends with "Answer:"; the trailing double spaces on the
# first two lines are part of the literal.
prompt_tmpl = """\
You have extracts from multiple pages, each prefixed with "Page <number>:".  
Use all of them to answer the user's question, but output only the integer page number  
of the page that best answers the question. No extra text.

{context}

Question: {question}
Answer:"""

def generate_batch(batch):
    """Generate an answer for every query in a batch.

    Args:
        batch: Column-oriented batch with parallel ``"context"`` and
            ``"Question"`` lists.

    Returns:
        The same batch with an ``"Answer"`` list holding the stripped
        generated text for each query, in input order.
    """
    prompts = [
        prompt_tmpl.format(context=c, question=q)
        for c, q in zip(batch["context"], batch["Question"])
    ]
    outputs = hf_pipe(prompts, batch_size=8, max_new_tokens=32)

    answers = []
    for out in outputs:
        # For a list of prompts the text-generation pipeline yields one
        # list of candidate dicts per prompt; use the first candidate.
        # A bare dict is accepted too, in case a single result is returned.
        first = out[0] if isinstance(out, list) else out
        answers.append(first["generated_text"].strip())
    batch["Answer"] = answers
    return batch

# 5. Answer every query, handing generate_batch eight rows at a time.
ds = ds.map(
    function=generate_batch,
    batched=True,
    batch_size=8,
)

# 6. Write the resulting dataset to submission.csv.
ds.to_csv("submission.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm
Generating queries split: 200 examples [00:00, 35874.82 examples/s]
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  vectordb   = Chroma(persist_directory="./chroma_db_task2-2", embedding_function=embeddings)
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
  lambda i: encoder_checkpoint_wrapper(
Loading checkpoint shards: 100%|██████████| 3/3 [00:32<00:00, 10.67s/it]
Device set to use cuda:0
  docs = retriever.get_relevant_documents(example["Question"])
Map: 100%|██████████| 200/200 [00:02<00:00, 77.27 examples/s] 
