## Loading Hummingbot

We are going to use a custom `MkDocsSiteLoader` to load the pages of the Hummingbot documentation site (https://hummingbot.org/).
The loader is configured with a list of site sections and a list of metadata keys to filter on.

In [None]:
import os
import time
import sys

from dotenv import load_dotenv

# The project root is two levels above the notebook's working directory
# (os.getcwd()); it is expected to contain the 'custom_loaders' package.
root_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# Directory to put on sys.path so that `import custom_loaders` resolves.
# This is the project root itself, not the 'custom_loaders' folder.
loaders_path = root_path

# Re-running this cell must not keep appending the same entry.
if loaders_path not in sys.path:
    sys.path.append(loaders_path)

# Load the variables defined in the .env file into os.environ.
load_dotenv()

# Raises KeyError when the key is missing (i.e. the .env file was not loaded).
# Only True/False is displayed, never the key itself: cell outputs are stored
# inside the notebook file and would leak the secret if the notebook is shared.
bool(os.environ["OPENAI_API_KEY"])

In [None]:
from custom_loaders.mkdocs_site_loader import MkDocsSiteLoader
# Site handed to the loader as `site_url`.
url = "https://hummingbot.org/"

# Section names passed to the loader's section filter.
# NOTE(review): presumably these sections are skipped — confirm in MkDocsSiteLoader.
sections_filter = [
    "blog",
    "release-notes",
    "botcamp",
    "academy",
    "academy-content",
    "exchanges",
    "chain",
]

# Metadata keys passed to the loader's metadata filter.
# NOTE(review): presumably these keys are dropped from each document — confirm.
metadata_filter = [
    "related_paths",
    "related_urls",
]

loader = MkDocsSiteLoader(
    site_url=url,
    sections_filter=sections_filter,
    metadata_filter=metadata_filter,
)

In [None]:
# Run the loader and keep the returned documents for the cells below.
documents = loader.load()

In [None]:
# Display the metadata attached to every loaded document.
[doc.metadata for doc in documents]

In [None]:
# Number of documents returned by the loader.
len(documents)

In [None]:
# Inspect the first loaded document.
documents[0]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Trial run: split only the first document so the resulting chunks can be
# inspected by hand before the whole corpus is processed.
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=0,
    chunk_size=1500,
)
first_document = documents[0]
splitted_docs = r_splitter.split_documents([first_document])

# Number of chunks produced from the first document.
len(splitted_docs)

In [None]:
# Text of the first chunk of the trial split.
splitted_docs[0].page_content

In [None]:
# Text of the second chunk (IndexError if the split produced fewer chunks).
splitted_docs[1].page_content

In [None]:
# Text of the third chunk (IndexError if the split produced fewer chunks).
splitted_docs[2].page_content

In [None]:
# Text of the fourth chunk (IndexError if the split produced fewer chunks).
splitted_docs[3].page_content

In [None]:
# Text of the fifth chunk (IndexError if the split produced fewer chunks).
splitted_docs[4].page_content

# Splitting the documents

Now we are going to split all the loaded documents into chunks using the RecursiveCharacterTextSplitter.

In [None]:
# Directory used for the vector store; falls back to /tmp when the
# PERSIST_DIRECTORY environment variable is not set.
persist_directory = os.environ.get("PERSIST_DIRECTORY", "/tmp")

# Splitter settings for the full corpus.
chunk_size = 2000
chunk_overlap = 0

r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Split every loaded document, not just the first one.
splitted_documents = r_splitter.split_documents(documents)

In [None]:
# Total number of chunks produced from all documents.
len(splitted_documents)

# Creating the embeddings and storing them in Chroma DB


In [None]:
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings

def split_list(input_list: list, size: int) -> list:
    """Split *input_list* into consecutive slices of at most *size* items.

    Order is preserved and every item appears in exactly one slice; only the
    last slice may be shorter than *size*. An empty input yields an empty list.

    Args:
        input_list: Sequence to split (anything supporting ``len`` and slicing).
        size: Maximum number of items per slice; must be at least 1.

    Returns:
        A list of slices of ``input_list``.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() error.
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [input_list[start:start + size] for start in range(0, len(input_list), size)]


# Number of chunks handed to the vector store per call.
batch_size = 150

# Embedding client used for every batch.
# NOTE(review): chunk_size here is a parameter of OpenAIEmbeddings, unrelated to
# the text splitter's chunk_size — confirm its meaning in the LangChain docs.
embedding = OpenAIEmbeddings(chunk_size=1000)

document_batches = split_list(splitted_documents, size=batch_size)

In [None]:
# Number of batches the chunks were grouped into.
len(document_batches)

In [None]:
# Pause between consecutive batches.
# NOTE(review): presumably this keeps the embedding calls under the API rate limit — confirm.
pause_seconds = 5
last_batch_index = len(document_batches) - 1

for batch_index, batch in enumerate(document_batches):
    # Every call targets the same collection name and persist directory, so the
    # batches are expected to accumulate in one store.
    # NOTE(review): confirm this holds for the installed Chroma version.
    vectordb = Chroma.from_documents(
        collection_name="hummingbot_documentation",
        collection_metadata={
            "Description": "This are the files from the Hummingbot docs."
        },
        documents=batch,
        embedding=embedding,
        persist_directory=persist_directory
    )
    # Nothing follows the final batch, so waiting after it would only waste time.
    if batch_index < last_batch_index:
        time.sleep(pause_seconds)

In [None]:
# Sanity-check the store with a sample question; `vectordb` is the instance
# returned for the last ingested batch.
vectordb.similarity_search("how can I configure a pure market making strategy?")