# Analyze Vector DB (Just for fun)

- We will see how many text chunks are in the database
- We will run a few similarity searches to see how the retrieval process works

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

## Specify the embedding model and vector DB

In [None]:
# Embedding model used to vectorize queries (library-default HuggingFace model).
embedding_model = HuggingFaceEmbeddings()

# Directory holding the persisted Chroma collection.
database_loc = "./chroma_db_test1"

# Open the persisted store, wiring in the embedding model for query-time use.
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=database_loc,
)

### (optional) Print the number of stored chunks

In [None]:
# Fetch every record from the store and keep only the raw text chunks.
stored_records = vectorstore.get()
all_docs = stored_records["documents"]

# Report how many chunks the database currently holds.
doc_count = len(all_docs)
print(f"docs: {doc_count}")

## Run a similarity search

In [None]:
from typing import List
from langchain_core.runnables import chain
from langchain_core.documents import Document

@chain
def retriever(query: str) -> List[Document]:
    """Return the stored chunks most similar to ``query``, tagged with their scores.

    Runs ``vectorstore.similarity_search_with_score`` with ``k=6`` and copies
    each hit's score into ``doc.metadata["score"]`` so the score travels with
    the document.

    Args:
        query: Free-text phrase to search the vector store for.

    Returns:
        The matching documents as a list; empty when the search finds nothing.
    """
    # NOTE(review): for Chroma the score is presumably a distance
    # (lower = closer match) -- confirm against the LangChain docs.
    results = vectorstore.similarity_search_with_score(query, k=6)

    # Walk the (document, score) pairs directly. Splitting them with
    # zip(*results) fails with ValueError when there are no results.
    docs = []
    for doc, score in results:
        doc.metadata["score"] = score
        docs.append(doc)

    return docs

def get_matches(phrase: str):
    """Print the page of each chunk retrieved for ``phrase`` and return the chunks.

    Args:
        phrase: Query text handed to ``retriever``.

    Returns:
        The documents returned by ``retriever.invoke(phrase)``.
    """
    # The raw phrase goes straight to the retriever; the vector store was built
    # with its own embedding function, so loading another embedding model here
    # and embedding the phrase separately would be wasted work.
    docs = retriever.invoke(phrase)
    for doc in docs:
        # NOTE(review): assumes chunks usually carry a "page" metadata entry;
        # fall back to a placeholder instead of raising KeyError when absent.
        print(f"page {doc.metadata.get('page', 'unknown')}")
    return docs

In [None]:
# Query phrase used by the search cell below. Exactly one assignment must be
# active; swap the comment markers to try a different phrase.
phrase = "I am interested in building homes"
# phrase = "I want to be an ambassador to Japan"
# phrase = "My goal is to find cure for cancer"

In [None]:
# Run the search: prints the page of each match and returns the matched
# documents. Requires `phrase` to have been assigned in an earlier cell.
get_matches(phrase)