# Setup
Load initial libraries and config to get things up and running.

In [None]:
import os
import openai

# Read the API key from the environment rather than hard-coding it in the
# notebook; this lookup raises KeyError if OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [None]:
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from typing import Dict, Union
import unicodedata

def parse_html(file_path: str) -> Document:
    """Parse one saved blog page into a ``Document``.

    The text of the first ``<div class="blog-item-content">`` element is
    extracted (text fragments joined by single spaces, each stripped) and
    NFKC-normalized.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A ``Document`` whose ``page_content`` is the normalized blog text and
        whose metadata holds ``source`` (the given path) and ``title`` (the
        page's ``<title>`` string, or ``""`` when the page has no usable
        title).

    Raises:
        IndexError: If the page has no ``div`` with class
            ``blog-item-content``.
    """
    # Hand BeautifulSoup the raw bytes so it can work out the document's
    # encoding itself, instead of decoding with the platform default
    # encoding that a text-mode open() would use.
    with open(file_path, "rb") as handle:
        soup = BeautifulSoup(handle, "lxml")

    content_blocks = soup.find_all("div", class_="blog-item-content")
    if not content_blocks:
        # Same exception type as indexing an empty result, but the message
        # names the offending file, which matters when parsing a directory.
        raise IndexError(
            f'no <div class="blog-item-content"> found in {file_path}'
        )
    blog_content = content_blocks[0].get_text(separator=" ", strip=True)
    text = unicodedata.normalize("NFKC", blog_content)

    # soup.title is None when the page has no <title>, and .string is None
    # when the tag does not contain exactly one string. Fall back to an
    # empty title rather than crashing or storing the literal "None".
    title_tag = soup.title
    title_text = title_tag.string if title_tag is not None else None

    metadata: Dict[str, Union[str, None]] = {
        "source": file_path,
        "title": "" if title_text is None else str(title_text),
    }

    return Document(page_content=text, metadata=metadata)

# Directory holding the saved blog pages.
HTML_DIR = "../html"

documents = []

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `documents` reproducible from run to run.
for file in sorted(os.listdir(HTML_DIR)):
    html_path = os.path.join(HTML_DIR, file)
    # Skip hidden entries (names starting with ".") and anything that is not
    # a regular file, such as sub-directories; parse_html() cannot open them.
    if file.startswith(".") or not os.path.isfile(html_path):
        continue
    document = parse_html(html_path)
    documents.append(document)

## Split Documents
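Split the documents into overlapping chunks with a chunk size of 1,500 and an overlap of 150. The code below calls these chunks "sentences".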

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters passed to the splitter.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Despite the name, each entry is one chunk produced by the splitter from the
# parsed documents, not a grammatical sentence.
sentences = text_splitter.split_documents(documents)

## Store Sentences
Store the sentences into a vector database. This will allow us to quickly find similar sentences.

In [None]:
import shutil

from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Single definition of the on-disk location, so the cleanup step and the
# store can never point at different directories.
VECTOR_DIR = "../vectors/"

# Remove old database files if any. shutil.rmtree is used instead of the
# IPython-only `!rm -rf` shell escape so the cell also runs as plain Python
# and on systems without `rm`; ignore_errors=True keeps the "-f" behaviour
# of not failing when the directory does not exist yet.
shutil.rmtree(VECTOR_DIR, ignore_errors=True)

# Embed every chunk and store the vectors in a Chroma collection under
# VECTOR_DIR.
vectordb = Chroma.from_documents(
    documents=sentences,
    embedding=OpenAIEmbeddings(),
    persist_directory=VECTOR_DIR,
)

# Sanity check: print the number of entries the collection reports.
print(vectordb._collection.count())

# Test!
A quick test to see if the vectors are loaded correctly.

In [None]:
# Only run this step if you want to load the vectorized documents from disk
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Re-open the collection stored under ../vectors/ using a fresh embeddings
# object as its embedding function.
embedding_model = OpenAIEmbeddings()

vectordb = Chroma(
    embedding_function=embedding_model,
    persist_directory="../vectors/",
)

In [None]:
# Sample questions; keep exactly one assignment uncommented.
# question = "Can I give my baby water?"
# question = "What is the best yogurt for my baby?"
question = "How do I serve a banana?"
# Ask the vector store for three results (k=3) using its
# max-marginal-relevance search.
response_documents = vectordb.max_marginal_relevance_search(question, k=3)

# Bare expression as the last line of the cell so the notebook displays the
# first result.
response_documents[0]

In [None]:
# Show the text of the second search result.
response_documents[1].page_content

In [None]:
# Show the text of the third search result.
response_documents[2].page_content