In [None]:
pip install -q pypdf chromadb

# Ask Questions of A Document

# Let's Look at Embeddings and Vector Databases

Let's load a PDF from the file system and then ask questions about it. In this case we'll use embeddings from OpenAI and a local vector database called ChromaDB.

![Image Description](embedding1.png)
![Image Description](embedding2.png)
![Image Description](embedding3.png)



In [None]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI

# Read the PDF document from the file system.
loader = PyPDFLoader("constitution.pdf")
documents = loader.load()

# Break the loaded documents into chunks. chunk_size is the number of
# characters per chunk and chunk_overlap is the overlap between chunks;
# together they control the granularity of the split.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Embedding model used to vectorize the chunks.
embeddings = OpenAIEmbeddings()

# Embed every chunk and insert the results into the Chroma vector database.
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Language model that answers the questions, plus a retriever over the database.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
retriever = db.as_retriever()

# Combine the model and the retriever into a question-answering chain.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

The chain is now all set. Let's ask some questions!

In [None]:
# Pick one question to ask. The commented-out lines are alternative questions;
# uncomment one (and comment out the active line) to try it instead.
#query = "What is the age required to be president?"
#query = "What is the title of this document?"
#query = "Who signed this document?"
query = "What is the first amendment?"
# Run the question through the QA chain built above. As the last expression
# in the cell, the returned value is displayed as the cell's output.
chain.run(query)