# Vectorstores and Embeddings
Recall the overall workflow for retrieval augmented generation (RAG):

In [1]:
import sys
sys.path.append('../')
from library import *

In [2]:
# Show which API flavour the openai client is currently configured for.
# NOTE(review): `openai` is never imported explicitly in this notebook; presumably
# it is brought into scope by `from library import *` - confirm.
openai.api_type

'azure'

In [19]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
# Lecture01 appears twice on purpose: duplicated, messy input data is part of
# the demonstration.
pdf_paths = [
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture02.pdf",
    "docs/MachineLearning-Lecture03.pdf",
]
loaders = [PyPDFLoader(path) for path in pdf_paths]

# Collect the documents of every loader, in order, into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [20]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings are gathered in one place so they are easy to tune.
splitter_settings = {"chunk_size": 1500, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [21]:
# Break every loaded document into chunks with the splitter configured above.
splits = text_splitter.split_documents(docs)

In [22]:
# Number of chunks produced by the split.
len(splits)

209

## Embeddings

In [8]:
# Pick the embedding client that matches the configured backend.
if not is_azure_openai:
    # Plain OpenAI: LangChain's embedding wrapper, queried later via embed_query().
    from langchain.embeddings.openai import OpenAIEmbeddings
    embedding = OpenAIEmbeddings()
else:
    # Azure: the openai SDK's Embedding resource, queried later via create().
    # NOTE(review): this same object is handed to Chroma.from_documents further
    # down - confirm that Chroma accepts it on the Azure path.
    embedding = openai.Embedding()

In [6]:
# Two near-synonymous sentences and one unrelated sentence; their embeddings
# are compared in the cells below.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [11]:
# Embed the three sentences, in order, with whichever client was selected.
sentences = (sentence1, sentence2, sentence3)
if is_azure_openai:
    # Azure keeps the whole API response per sentence; the vector itself sits
    # under data[0].embedding of each response.
    embedding1, embedding2, embedding3 = (
        embedding.create(deployment_id=embedding_deployment, input=text)
        for text in sentences
    )
else:
    # LangChain wrapper: one embed_query() call per sentence.
    embedding1, embedding2, embedding3 = map(embedding.embed_query, sentences)

In [12]:
import numpy as np

In [14]:
# Inspect the raw result for sentence1. On the Azure path this is the full API
# response object, with the vector nested under data[0].embedding.
embedding1

<OpenAIObject list at 0x110f41490> JSON: {
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        -0.027539217844605446,
        -0.005422573070973158,
        -0.02572011388838291,
        -0.03309759125113487,
        -0.02726130001246929,
        0.022536681964993477,
        -0.01033983938395977,
        -0.008299663662910461,
        0.002488635713234544,
        -0.019896453246474266,
        0.0007046660757623613,
        0.02920673042535782,
        -0.005403623916208744,
        0.000600446539465338,
        0.00024752135504968464,
        0.014199119992554188,
        0.029964691027998924,
        -0.001301164855249226,
        0.004194045905023813,
        -0.003988765180110931,
        -0.01171680074185133,
        0.00699849845841527,
        0.013074812479317188,
        -0.04691773280501366,
        -0.0023559927940368652,
        0.004683562088757753,
        0.016927776858210564,
        -0.00026133834035135806

In [16]:
# Dot product of the two "dog" sentences.
# The Azure path stores the whole API response, so the vector must be pulled out
# of data[0].embedding first. The non-Azure path (embed_query) already holds the
# vector, and calling .get() on it would fail.
if is_azure_openai:
    vector1 = embedding1.get('data')[0].get('embedding')
    vector2 = embedding2.get('data')[0].get('embedding')
else:
    vector1, vector2 = embedding1, embedding2
np.dot(vector1, vector2)

0.9632261952269926

In [None]:
# Dot product of a "dog" sentence and the unrelated weather sentence.
# On the Azure path embedding1/embedding3 are whole API responses, not vectors,
# so the vectors are extracted from data[0].embedding before calling np.dot
# (the same access pattern the sentence1/sentence2 comparison uses).
if is_azure_openai:
    vector1 = embedding1.get('data')[0].get('embedding')
    vector3 = embedding3.get('data')[0].get('embedding')
else:
    vector1, vector3 = embedding1, embedding3
np.dot(vector1, vector3)

In [None]:
# Dot product of the other "dog" sentence and the unrelated weather sentence.
# On the Azure path embedding2/embedding3 are whole API responses, not vectors,
# so the vectors are extracted from data[0].embedding before calling np.dot
# (the same access pattern the sentence1/sentence2 comparison uses).
if is_azure_openai:
    vector2 = embedding2.get('data')[0].get('embedding')
    vector3 = embedding3.get('data')[0].get('embedding')
else:
    vector2, vector3 = embedding2, embedding3
np.dot(vector2, vector3)

## Vectorstores

In [None]:
# %pip install chromadb  # uncomment and run once if chromadb is not installed in this kernel

In [None]:
from langchain.vectorstores import Chroma

In [None]:
# Directory passed to Chroma as persist_directory when the index is built below.
persist_directory = 'docs/chroma/'

In [None]:
# Remove old database files, if any, so the index is rebuilt from scratch.
# shutil is used instead of `!rm -rf` so the cell also works where `rm` is not
# available, and persist_directory is reused so the path is defined only once.
import shutil
shutil.rmtree(persist_directory, ignore_errors=True)  # ignore_errors: a missing directory is fine

In [None]:
# Build the Chroma index from the chunks and store it under persist_directory.
# NOTE(review): on the Azure path `embedding` is an `openai.Embedding()` instance
# (used elsewhere via .create()), not the LangChain OpenAIEmbeddings wrapper used
# on the non-Azure path. Chroma presumably needs a LangChain-style embeddings
# object - confirm this cell runs when is_azure_openai is set.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [None]:
# Number of entries in the underlying collection (read through a private
# attribute of the vector store). Presumably this should equal len(splits) - verify.
print(vectordb._collection.count())

### Similarity Search

In [None]:
# Query used for the first similarity search.
question = "is there an email i can ask for help"

In [None]:
# Fetch the 3 chunks most similar to the question.
# NOTE: this rebinds `docs`, which previously held the loaded PDF documents.
# Re-running the splitting cell after this one would split the search results instead.
docs = vectordb.similarity_search(question,k=3)

In [None]:
# Number of results returned (k=3 was requested).
len(docs)

In [None]:
# Text of the first result.
docs[0].page_content

In [None]:
# Persist the vector store (it was created with persist_directory set above).
vectordb.persist()

## Failure modes

This seems great, and basic similarity search will get you 80% of the way there very easily. 

But there are some failure modes that can creep up. 

Here are some edge cases that can arise - we'll fix them in the next class.

In [None]:
# Query used to demonstrate the duplicate-results failure mode.
question = "what did they say about matlab?"

In [None]:
# Fetch the 5 most similar chunks; `docs` is overwritten with the new results.
docs = vectordb.similarity_search(question,k=5)

Notice that we're getting duplicate chunks (because of the duplicate `MachineLearning-Lecture01.pdf` in the index).

Semantic search fetches all similar documents, but does not enforce diversity.

`docs[0]` and `docs[1]` are identical.

In [None]:
# First result - compare with docs[1] in the next cell.
docs[0]

In [None]:
# Second result - a duplicate of docs[0], because Lecture01 was loaded twice.
docs[1]

We can see a new failure mode.

The question below asks about the third lecture, but the results include chunks from other lectures as well.

In [None]:
# Query that names a specific lecture, used to show that plain similarity
# search does not restrict results to that lecture.
question = "what did they say about regression in the third lecture?"

In [None]:
# Fetch the 5 most similar chunks; `docs` is overwritten with the new results.
docs = vectordb.similarity_search(question,k=5)

In [None]:
# Print each result's metadata to see which lecture it came from.
for doc in docs:
    print(doc.metadata)

In [None]:
# Text of the fifth result of the k=5 search above.
print(docs[4].page_content)

Approaches discussed in the next lecture can be used to address both!