# Setup
Load the initial libraries and configuration to get things up and running.

In [1]:
import os
import openai

# Read the OpenAI key from the environment rather than hardcoding it in the notebook.
# A missing OPENAI_API_KEY variable raises KeyError here.
openai.api_key = os.environ["OPENAI_API_KEY"]

## Parse HTML

In [2]:
from langchain.document_loaders import BSHTMLLoader

# Directory holding the HTML pages to index.
html_dir = "../html"

# os.listdir() returns entries in arbitrary order and includes every entry in the
# directory, so keep only HTML files and sort them: the document order is then
# reproducible across runs and stray files (.DS_Store, checkpoints, ...) are never parsed.
html_files = sorted(
    name for name in os.listdir(html_dir)
    if name.lower().endswith((".html", ".htm"))
)

documents = []

for file in html_files:
    loader = BSHTMLLoader(os.path.join(html_dir, file))
    data = loader.load()
    # extend() keeps whatever the loader returns and does not raise IndexError
    # if a file yields no Document.
    documents.extend(data)


## Split Documents
Split the documents into small, slightly overlapping chunks of text (called "sentences" in the rest of this notebook).

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings gathered in one place:
#   chunk_size / chunk_overlap - target chunk length and the overlap between neighbours
#   separators                 - tried in the listed order: paragraph break, line break,
#                                space, and finally the empty string
splitter_options = {
    "chunk_size": 450,
    "chunk_overlap": 15,
    "separators": ["\n\n", "\n", " ", ""],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break every loaded document into chunks; these are what gets embedded later.
sentences = text_splitter.split_documents(documents)


## Store Sentences
Store the sentences in a vector database. This will allow us to quickly find similar sentences.

In [6]:
import shutil

from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Single source of truth for where the vector store lives on disk.
persist_directory = "../vectors/"

# Remove old database files if any, so the store is rebuilt from scratch.
# shutil.rmtree uses the same variable that Chroma receives below (the two paths cannot
# drift apart) and does not depend on a POSIX shell; ignore_errors=True mirrors
# `rm -rf`, which is silent when the directory does not exist.
shutil.rmtree(persist_directory, ignore_errors=True)

# Embed every chunk and write the resulting vectors into the Chroma store.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=sentences,
    embedding=embedding,
    persist_directory=persist_directory
)

# Sanity check: number of entries now held in the collection.
print(vectordb._collection.count())

5041


# Test!
A quick test to see if the vectors are loaded correctly.

In [15]:
# Sample query used to smoke-test retrieval from the vector store.
question = "Can a baby have water?"

# Max-marginal-relevance search, asking for 3 results.
response_documents = vectordb.max_marginal_relevance_search(question, k=3)

# Preview only the first 150 characters of the first result.
first_hit = response_documents[0]
first_hit.page_content[:150]

'When Can Babies Have Water? — Malina Malkani\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n0\n\n\n\n\n\n\n\n\n\n      Skip to Con'

In [16]:
# Preview only the first 150 characters of the second result.
second_hit = response_documents[1]
second_hit.page_content[:150]

'Offering a little bit of water at meals has other benefits too. It helps facilitate swallowing, and the washing down of pocketed food.\xa0\nSome babies ca'

In [17]:
# Preview only the first 150 characters of the third result.
third_hit = response_documents[2]
third_hit.page_content[:150]

"Babies younger than 1-year-old can get water intoxication with too much water. It's rare but dangerous for infants, especially those younger than 6 mo"