# QA with Pandas User Guide - Build Persistent Vector Database

In [None]:
import os
import openai
from IPython.display import display, HTML, Markdown
from pprint import pprint

from dotenv import load_dotenv, find_dotenv
# Locate the nearest .env file and load its variables into the process
# environment, then hand the OpenAI key to the client library.
# A missing OPENAI_API_KEY raises KeyError here.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from langchain.callbacks import OpenAICallbackHandler

# Create a LangChain OpenAI callback handler and print its freshly
# initialised state.
# NOTE(review): presumably meant to accumulate usage totals across later
# calls — nothing in this section passes it to a chain; confirm it is needed.
totals_cb = OpenAICallbackHandler()
print(str(totals_cb))

In [None]:
# Remove any previously downloaded archive and previously extracted docs so
# the download and extraction steps below start from a clean directory.
!rm -f ./pandas.zip
!rm -rf ./pandas_docs

In [None]:
from zipfile import ZipFile

import wget

# Zipped HTML build of the pandas documentation.
url = 'https://pandas.pydata.org/docs/pandas.zip'

# wget.download() returns the path it actually wrote. If a file with the
# target name already exists it saves under a suffixed name instead, so use
# the returned path rather than assuming "pandas.zip"; otherwise a stale
# archive left over from an earlier run could be extracted.
zip_path = wget.download(url)

# Unpack the downloaded archive into ./pandas_docs.
with ZipFile(zip_path, 'r') as archive:
    archive.extractall("pandas_docs")

In [None]:
from langchain.document_loaders import DirectoryLoader

# Collect every .html file under the extracted user guide (the glob recurses
# into subdirectories) and load them as documents, showing a progress bar.
loader = DirectoryLoader(
    'pandas_docs/user_guide',
    glob="**/*.html",
    show_progress=True,
)
docs = loader.load()

# Report how many documents were loaded.
print(f"Loaded {len(docs)} docs")

In [None]:
from langchain.text_splitter import TokenTextSplitter

# Split the loaded documents into token-based chunks: up to 1000 tokens per
# chunk, with 50 tokens shared between neighbouring chunks.
# NOTE(review): confirm the embedding model used when building the vector
# store accepts inputs this long; text beyond a model's maximum sequence
# length may be truncated at embedding time.
token_splitter = TokenTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)
chunks = token_splitter.split_documents(docs)

# Report the resulting chunk count.
print(f"Documents split into {len(chunks)} chunks\n")

In [None]:
# Delete any existing Chroma directory for this collection so the vector
# store built below does not mix with data from an earlier run.
!rm -rf ./chroma_db/pandas

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Embedding model used to embed every chunk.
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

# Build the Chroma vector store from the chunks, backed by an on-disk
# directory so it can be reused later.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db/pandas",
)

In [None]:
# Sanity check: fetch a single stored record, requesting both its text and
# its embedding vector.
doc_with_embeddings = db.get(
    limit=1,
    offset=0,
    include=["documents", "embeddings"],
)

# Show the leading 200 characters of the stored text.
first_text = doc_with_embeddings["documents"][0]
print(first_text[:200])

# Show the vector's dimensionality and its first 50 components.
emb = doc_with_embeddings["embeddings"][0]
print(f"\nLength of embeddings: {len(emb)}")
pprint(emb[:50])

In [None]:
# Explicitly persist the vector store built above (created with
# persist_directory="./chroma_db/pandas").
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call — confirm against the installed version.
db.persist()