# Question Answering over a PDF Document with a Vector Database

In [None]:
import os
import openai
from IPython.display import display, HTML, Markdown
from pprint import pprint

from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it; the return value is discarded.
_ = load_dotenv(find_dotenv()) # read local .env file
# Direct indexing raises KeyError when OPENAI_API_KEY is not set, so a missing
# key fails here rather than at the first API request.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from langchain.callbacks import OpenAICallbackHandler

# One handler instance shared by the QA calls later in the notebook (it is
# passed as callbacks=[totals_cb]) and printed again in the final cell.
# NOTE(review): presumably it accumulates token usage / cost across those
# calls -- confirm against the LangChain callback documentation.
totals_cb = OpenAICallbackHandler()

# Show the handler's state before any request has been made.
print(totals_cb)

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI

# Candidate arXiv papers; only the first one is loaded in this cell.
papers = [
    "https://arxiv.org/pdf/2304.00612.pdf",
    "https://arxiv.org/pdf/2305.04091.pdf",
    "https://arxiv.org/pdf/2307.10195.pdf",
    "https://arxiv.org/pdf/2210.03629.pdf",
    "https://arxiv.org/pdf/2307.10169.pdf",
]

loader = PyPDFLoader(papers[0])

# Each element of `pages` is treated below as one page of the PDF.
pages = loader.load()

print(f"Document has {len(pages)} pages\n")

# The chat model is used here only for get_num_tokens(); no completion is
# requested in this cell.
llm = ChatOpenAI()

total_tokens = 0
for n, page in enumerate(pages):
    tokens = llm.get_num_tokens(page.page_content)
    total_tokens += tokens
    # A bare ">" alignment flag without a width aligns nothing; give the
    # count an explicit width so the per-page column lines up.
    print(f"Page {n+1:2d}: {tokens:>6}")

print(f"\nTotal number of tokens in document: {total_tokens}")

### Check OpenAI Pricing for Embedding models

https://openai.com/pricing

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Build the Chroma vector store from the loaded pages, using OpenAI embeddings.
embedding_model = OpenAIEmbeddings()
db = Chroma.from_documents(pages, embedding_model)

![%E2%80%8Eimages.%E2%80%8E001.jpeg](attachment:%E2%80%8Eimages.%E2%80%8E001.jpeg)

In [None]:
# Fetch a single stored record (the first one), asking for both its text
# and its embedding vector.
doc_with_embeddings = db.get(
    offset=0,
    limit=1,
    include=["documents", "embeddings"],
)

# Preview: first 200 characters of the stored text.
first_text = doc_with_embeddings["documents"][0]
print(first_text[:200])

# The matching embedding: report its length and show the first 50 values.
emb = doc_with_embeddings["embeddings"][0]
print(f"\nLength of embeddings: {len(emb)}")
pprint(emb[:50])

In [None]:
from langchain.chains import RetrievalQA

# Retriever over the vector store; "k": 2 is forwarded as a search keyword
# argument (presumably the number of documents fetched per query).
retriever = db.as_retriever(search_kwargs=dict(k=2))

# Retrieval QA chain: a temperature-0 chat model answers from the retrieved
# documents, and the documents used are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    llm=ChatOpenAI(temperature=0.0),
)

![%E2%80%8Eimages.%E2%80%8E002.jpeg](attachment:%E2%80%8Eimages.%E2%80%8E002.jpeg)

In [None]:
# First question put to the retrieval QA chain.
query = "Tell me more about the emergence of unpredictable behaviour in LLMs"

# Run the chain; totals_cb is attached as a callback for this call.
response = qa_chain(query, callbacks=[totals_cb])

In [None]:
# Print the "result" entry of the chain's response to the query above.
print(response["result"])

In [None]:
# Second question; `query` and `response` are rebound, replacing the earlier values.
query = "What are the challenges in interpreting LLMs?"

# Same chain and same callback handler as the previous call.
response = qa_chain(query, callbacks=[totals_cb])

# Print the "result" entry of this response.
print(response["result"])

In [None]:
# Show the source documents returned with the most recent response
# (the chain was built with return_source_documents=True).
pprint(response["source_documents"])

In [None]:
# Print the shared callback handler after the QA calls that were given it.
pprint(totals_cb)