# Setup
Load the initial libraries and configuration to get things up and running.

In [None]:
import os
import openai

# Read the API key from the environment rather than hard-coding it.
# A missing variable still raises KeyError, but with a message that says
# what to do about it.
try:
    openai.api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
    raise KeyError(
        "OPENAI_API_KEY is not set; export it before starting the notebook"
    ) from None

## Parse HTML

In [None]:
from langchain.document_loaders import BSHTMLLoader

# Parse every file in ../html with BSHTMLLoader and collect the resulting
# documents in one flat list.
html_dir = "../html"
documents = []

# os.listdir returns entries in arbitrary order; sorting makes the document
# order (and therefore the chunk order later on) reproducible between runs.
for file in sorted(os.listdir(html_dir)):
    path = os.path.join(html_dir, file)

    # Skip sub-directories: the loader is given a path to open as a file,
    # so a directory entry would abort the whole cell.
    if not os.path.isfile(path):
        continue

    loader = BSHTMLLoader(path)
    documents.extend(loader.load())


## Split Documents
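Split the documents into small overlapping chunks of text (chunk size 450, overlap 15). The notebook refers to these chunks as "sentences".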

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the splitter: chunks of size 450 with an overlap of 15 between
# neighbouring chunks. The separators are listed from coarsest (blank line)
# to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=15,
    chunk_size=450,
)

# Break every loaded document into chunks; the result feeds the vector store.
sentences = text_splitter.split_documents(documents)


## Store Sentences
Store the sentences into a vector database. This will allow us to quickly find similar sentences.

In [None]:
import shutil

from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Remove old database files, if any, so the store is rebuilt from scratch.
# shutil.rmtree is used instead of the IPython-only `!rm -rf` so this cell
# also works as plain Python and on platforms without `rm`;
# ignore_errors=True keeps the "no error if the directory is missing"
# behaviour of `rm -rf`.
shutil.rmtree("../vectors", ignore_errors=True)

# Embed every chunk with OpenAIEmbeddings and store the vectors in a Chroma
# database whose persist directory is ../vectors/.
vectordb = Chroma.from_documents(
    documents=sentences,
    embedding=OpenAIEmbeddings(),
    persist_directory="../vectors/"
)

# Sanity check: print how many entries the collection holds.
# NOTE(review): `_collection` is a private attribute and may change between
# library versions.
print(vectordb._collection.count())

# Test!
A quick test to see if the vectors are loaded correctly.

In [None]:
# Only run this step if you want to load the vectorized documents from disk
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Re-open the Chroma store in ../vectors/ instead of rebuilding it.
# The same embedding class is supplied as when the store was created.
vectordb = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="../vectors/",
)

In [None]:
# Sample query used to smoke-test the vector store.
question = "Can a baby have water?"

# Ask for three results using max-marginal-relevance search.
response_documents = vectordb.max_marginal_relevance_search(query=question, k=3)

# Preview the first 150 characters of the first result.
response_documents[0].page_content[:150]

In [None]:
# Preview the first 150 characters of the second result.
response_documents[1].page_content[:150]

In [None]:
# Preview the first 150 characters of the third result.
response_documents[2].page_content[:150]