# Vectorstores and Embeddings

Recall the overall workflow for retrieval-augmented generation (RAG):

![](rag.jpeg)

In [1]:
import openai

In [2]:
from langchain.document_loaders import PyPDFLoader

In [3]:
# Load PDFs: build one PyPDFLoader per lecture file.
# Lecture01 is listed twice on purpose, so the corpus deliberately contains duplicated data.
loaders = [
    PyPDFLoader(pdf_path)
    for pdf_path in (
        "docs/cs229_lectures/MachineLearning-Lecture01.pdf",
        "docs/cs229_lectures/MachineLearning-Lecture01.pdf",
        "docs/cs229_lectures/MachineLearning-Lecture02.pdf",
        "docs/cs229_lectures/MachineLearning-Lecture03.pdf",
    )
]

In [4]:
# Flatten the documents returned by every loader into a single list.
docs = [document for pdf_loader in loaders for document in pdf_loader.load()]


In [5]:
# Split the documents
# Configure the splitter that will cut the loaded documents into overlapping chunks.
# NOTE(review): `text_spliter` is a misspelling of `text_splitter`; the name is
# kept as-is because a later cell calls `text_spliter.split_documents(...)`.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_spliter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,  # upper bound on the size of each chunk
    chunk_overlap = 150  # amount shared between consecutive chunks
)

In [6]:
# Run the configured splitter over all loaded documents to obtain the chunks.
splits = text_spliter.split_documents(docs)

In [7]:
# Number of chunks produced by the splitter.
len(splits)

209

## Embeddings

Let's embed the document splits.

In [8]:
from langchain_openai import OpenAIEmbeddings

# Embedding model shared by the example sentences and the vector store below.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm.
embedding = OpenAIEmbeddings()

In [9]:
# Example sentences: the first two are close in meaning, the third is unrelated.
# NOTE(review): "ugle" in sentence3 looks like a typo for "ugly"; correcting it
# would change the embedding, so the recorded similarity scores would need re-running.
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugle outside"

In [10]:
# Create an embedding for each sentence (one embed_query call per sentence, in order)
embedding1, embedding2, embedding3 = [
    embedding.embed_query(sentence)
    for sentence in (sentence1, sentence2, sentence3)
]

In [11]:
import numpy as np

In [12]:
# Similarity of sentence1 and sentence2 ("dogs" vs. "canines").
# NOTE(review): the dot product equals cosine similarity only if the embedding
# vectors are unit-length — confirm for the model in use.
np.dot(embedding1, embedding2)

0.9631227500523606

In [13]:
# Similarity of sentence1 and sentence3 (dogs vs. the weather sentence).
np.dot(embedding1, embedding3)

0.7838640810382902

In [14]:
# Similarity of sentence2 and sentence3 (canines vs. the weather sentence).
np.dot(embedding2, embedding3)

0.7770751441729357

## Vectorstores

In [15]:
# NOTE(review): unpinned, mid-notebook install — consider pinning a version and
# moving this to a setup cell at the top so a fresh "Run All" is reproducible.
%pip install chromadb

Note: you may need to restart the kernel to use updated packages.


In [16]:
from langchain.vectorstores import Chroma

In [17]:
# Directory handed to Chroma as `persist_directory` when the vector store is built.
persist_directory = 'docs/chroma'

In [18]:
# Make sure nothing is there already by removing old database files, if any.
# shutil.rmtree is used instead of `!rm -rf` so the cell also works on platforms
# without a POSIX shell, and it reuses `persist_directory` so the path is
# defined in one place only.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)  # ignore_errors: fine if the directory does not exist yet

In [19]:
# Build the Chroma vector store from the document splits, using the embedding
# model created earlier and the configured persist directory.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [20]:
# Sanity check on the number of stored entries.
# NOTE(review): `_collection` is a private attribute (leading underscore), so
# this access may break across library versions.
print(vectordb._collection.count())  # Should equal len(splits)

209


### Similarity Search

In [21]:
# Natural-language query used for the similarity search against the vector store.
question = "is there an email i can ask for help"

In [22]:
# Retrieve the stored chunks that best match the question.
# NOTE(review): this rebinds `docs`, which earlier held the loaded PDF documents.
# Re-running the split cell after this one would split the search results
# instead; a distinct name would avoid that hidden-state trap.
docs = vectordb.similarity_search(
    question,
    k=3  # The number of documents we want
)

In [23]:
# Number of results returned; expected to match k from the search call.
len(docs)

3

In [24]:
# Show the text of the first returned result (presumably the closest match — confirm ordering).
print(docs[0].page_content)

cs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. So 
rather than sending us email individually, if you send email to this account, it will 
actually let us get back to you maximally quickly with answers to your questions.  
If you're asking questions about homework probl ems, please say in the subject line which 
assignment and which question the email refers to, since that will also help us to route 
your question to the appropriate TA or to me  appropriately and get the response back to 
you quickly.  
Let's see. Skipping ahead — let's see — for homework, one midterm, one open and term 
project. Notice on the honor code. So one thi ng that I think will help you to succeed and 
do well in this class and even help you to enjoy this cla ss more is if you form a study 
group.  
So start looking around where you' re sitting now or at the end of class today, mingle a 
little bit and get to know your classmates. I strongly encourage you to form study gro

In [25]:
# Persist the vector database (so it can be used in the future)
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call — confirm against the installed version.
vectordb.persist()