In [None]:
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Define file paths and constants
FILE = "AI.pdf"  # Path to the 463-page PDF
DB_PATH = "./chroma_db"  # Path to store Chroma database
EMBEDDINGS = "all-MiniLM-L6-v2"  # Embedding model
QUERY = "On which page can you find a comparison of two dynamic programming methods for solving Markov Decision Processes (MDPs), focusing on how iterative reward estimation and iterative strategy optimization compute all optimal values while differing in their update processes and policy handling?"
QUERY_ID = "test_001"  # Dummy ID for the test case

# Step 1: Preprocess PDF
print("Loading and splitting PDF...")
loader = PyPDFLoader(FILE)
pages = loader.load_and_split()

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)

# Verify page metadata
for doc in docs:
    assert "page" in doc.metadata, "Page number metadata missing"

# Step 2: Create vector store
print("Generating embeddings and creating vector store...")
embedding = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=DB_PATH,
    collection_name="langchain"
)
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

# Step 3: Load LLM and configure tokenizer
print("Loading Phi-2 model...")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

# Fix pad_token_id warning
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
    tokenizer.pad_token_id = tokenizer.eos_token_id  # e.g., 50256
model.config.pad_token_id = tokenizer.pad_token_id

# Step 4: Process the test query
print(f"Processing query: {QUERY}")
# Retrieve top-5 chunks
retrieved_docs = retriever.invoke(QUERY)
retrieved_texts = [doc.page_content for doc in retrieved_docs]
retrieved_pages = [doc.metadata["page"] for doc in retrieved_docs]

# Create prompt
prompt = f"""
Query: {QUERY}
Retrieved Documents:
{chr(10).join([f"Page {p}: {t}" for p, t in zip(retrieved_pages, retrieved_texts)])}
Instructions: Select the page number (1-463) that directly addresses the query, focusing on technical definitions or metrics. Output only the page number.
"""

# Generate response
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024, padding=True).to("cuda" if torch.cuda.is_available() else "cpu")
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)
page_number = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# Validate output
try:
    page_number = int(page_number)
    if not 1 <= page_number <= 463:
        raise ValueError
except ValueError:
    print("Invalid page number, falling back to top-ranked chunk")
    page_number = retrieved_pages[0]

# Step 5: Output result
print(f"Query ID: {QUERY_ID}")
print(f"Predicted Page Number: {page_number}")

# Step 6: Save result to CSV (for test case)
result = [{"id": QUERY_ID, "page": page_number}]
submission = pd.DataFrame(result)
submission.to_csv("test_submission.csv", index=False)
print("Result saved to test_submission.csv")

Loading and splitting PDF...


Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)
Ignoring wrong pointing object 54 0 (offset 0)
Ignoring wrong pointing object 56 0 (offset 0)
Ignoring wrong pointing object 58 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 71 0 (offset 0)
Ignoring wrong pointing object 73 0 (offset 0)
Ignoring wrong pointing object 124 0 (offset 0)
Ignoring wrong pointing object 129 0 (offset 0)
Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 136 0 (offset 0)
Ignoring wrong pointing object 147 0 (offset 0)
Ignoring 

Generating embeddings and creating vector store...
Loading Phi-2 model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.72s/it]


Processing query: On which page can you find a comparison of two dynamic programming methods for solving Markov Decision Processes (MDPs), focusing on how iterative reward estimation and iterative strategy optimization compute all optimal values while differing in their update processes and policy handling?
Invalid page number, falling back to top-ranked chunk
Query ID: test_001
Predicted Page Number: 304
Result saved to test_submission.csv


In [1]:
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define file paths and constants
FILE = "AI.pdf"  # Path to the 463-page PDF
QUERY_FILE = "HW2_query.csv"  # Path to query CSV
DB_PATH = "./chroma_db"  # Path to store Chroma database
EMBEDDINGS = "all-MiniLM-L6-v2"  # Embedding model
OUTPUT_FILE = "submission.csv"  # Output CSV for Kaggle

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

# Step 1: Preprocess PDF
logger.info("Loading and splitting PDF...")
try:
    loader = PyPDFLoader(FILE)
    pages = loader.load_and_split()
except Exception as e:
    logger.error(f"Failed to load PDF: {e}")
    raise

# Check for empty or problematic pages
for i, page in enumerate(pages):
    if not page.page_content or len(page.page_content.strip()) < 10:
        logger.warning(f"Page {i+1} has empty or minimal content. Check PDF integrity.")

# Split pages into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)

# Verify page metadata
for doc in docs:
    if "page" not in doc.metadata:
        logger.error("Page number metadata missing in a document chunk")
        raise ValueError("Page number metadata missing")

# Step 2: Create vector store
logger.info("Generating embeddings and creating vector store...")
embedding = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
try:
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory=DB_PATH,
        collection_name="langchain"
    )
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
except Exception as e:
    logger.error(f"Failed to create vector store: {e}")
    raise

# Step 3: Load LLM and configure tokenizer
logger.info("Loading Phi-2 model...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
except Exception as e:
    logger.error(f"Failed to load Phi-2 model: {e}")
    raise

# Fix pad_token_id warning
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Verify model device
logger.info(f"Model device: {next(model.parameters()).device}")

# Step 4: Load queries
logger.info(f"Loading queries from {QUERY_FILE}...")
try:
    queries = pd.read_csv(QUERY_FILE)
    query_texts = queries["Question"].tolist()
    query_ids = queries["ID"].tolist()
except Exception as e:
    logger.error(f"Failed to load queries: {e}")
    raise

# Step 5: Process all queries
logger.info("Processing queries...")
results = []
for query_text, query_id in zip(query_texts, query_ids):
    logger.info(f"Processing query ID: {query_id}")

    # Retrieve top-5 chunks
    try:
        retrieved_docs = retriever.invoke(query_text)
    except Exception as e:
        logger.error(f"Retrieval failed for query ID {query_id}: {e}")
        results.append({"id": query_id, "page": 1})  # Fallback page
        continue

    if not retrieved_docs:
        logger.warning(f"No documents retrieved for query ID {query_id}. Using fallback page.")
        results.append({"id": query_id, "page": 1})  # Fallback page
        continue

    retrieved_texts = [doc.page_content for doc in retrieved_docs]
    retrieved_pages = [doc.metadata["page"] for doc in retrieved_docs]
    logger.info(f"Retrieved pages for query ID {query_id}: {retrieved_pages}")

    # Create prompt
    prompt = f"""
Query: {query_text}
Retrieved Documents:
{chr(10).join([f"Page {p}: {t}" for p, t in zip(retrieved_pages, retrieved_texts)])}
Instructions: Select the page number (1-463) that directly addresses the query, focusing on technical definitions or metrics. Output only the page number.
"""

    # Generate response
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024, padding=True).to(device)
    logger.info(f"Inputs device for query ID {query_id}: {inputs['input_ids'].device}")

    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        page_number = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    except Exception as e:
        logger.error(f"Generation failed for query ID {query_id}: {e}")
        page_number = str(retrieved_pages[0])  # Fallback to top-ranked chunk

    # Validate output
    try:
        page_number = int(page_number)
        if not 1 <= page_number <= 463:
            raise ValueError
    except ValueError:
        logger.warning(f"Invalid page number for query ID {query_id}, falling back to top-ranked chunk")
        page_number = retrieved_pages[0]

    results.append({"ID": query_id, "Answer": page_number})

# Step 6: Save results to CSV
logger.info(f"Saving results to {OUTPUT_FILE}...")
submission = pd.DataFrame(results)
submission.to_csv(OUTPUT_FILE, index=False)
print(f"Results saved to {OUTPUT_FILE}")

  from .autonotebook import tqdm as notebook_tqdm
INFO:__main__:Using device: cuda
INFO:__main__:Loading and splitting PDF...
INFO:__main__:Generating embeddings and creating vector store...
  embedding = HuggingFaceEmbeddings(model_name=EMBEDDINGS)
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:__main__:Loading Phi-2 model...
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 17.79it/s]
INFO:__main__:Model device: cuda:0
INFO:__main__:Loading queries from HW2_query.csv...
INFO:__main__:Processing queries...
INFO:__main__:Processing query ID: 0
INFO:__main__:Retrieved pages for query ID 0: [283, 415, 304, 292, 378]
INFO:__main__:Inputs device for query ID 0: cuda:0
INFO:__main__:Processing

Results saved to submission.csv
