In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Verify the key is available without displaying it: a bare
# os.environ["OPENAI_API_KEY"] as the cell's last expression would store the
# secret in the notebook's saved output.
if "OPENAI_API_KEY" not in os.environ:
    raise KeyError("OPENAI_API_KEY")  # same exception type the direct lookup raised
print("OPENAI_API_KEY is set")

# Split Hummingbot Scripts

I'm going to use the same configuration as in the previous notebook.

In [2]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.document_loaders.parsers import LanguageParser
from langchain.document_loaders.generic import GenericLoader

# Folder with the Hummingbot example scripts. Set HUMMINGBOT_SCRIPTS_PATH (for
# example in the .env file) to point at another checkout; the original local
# path remains the default so existing setups keep working.
repo_path = os.environ.get(
    "HUMMINGBOT_SCRIPTS_PATH",
    "/Users/dardonacci/Documents/work/hummingbot/scripts",
)
# Fail early with a clear message rather than continuing with a path that
# does not exist.
if not os.path.exists(repo_path):
    raise FileNotFoundError(f"Hummingbot scripts path not found: {repo_path}")

chunk_size = 2000  # target size of each chunk passed to the splitter
chunk_overlap = 400  # amount shared between consecutive chunks


# Collect every .py file under repo_path (recursively) and parse it as Python.
loader = GenericLoader.from_filesystem(
    path=repo_path,
    glob="**/*.py",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON),
)
# Splitter configured for Python source, using the sizes defined above.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Load the parsed documents, then split them into chunks for embedding.
hummingbot_codebase = loader.load()
hummingbot_codebase_docs = python_splitter.split_documents(hummingbot_codebase)

In [3]:
# Number of chunks produced by the splitter in the previous cell.
len(hummingbot_codebase_docs)

299

# Learnings from Chroma

* When you call Chroma.from_documents, it will create a new collection in the DB
* If you create an instance of Chroma, you can specify the collection name to only retrieve that one.
* You need an embedding function
* The persist directory should be the same for all
* If you call Chroma.from_documents again, it's going to append more documents to the same collection
* If you want to start over you can call the method delete_collection()

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Directory where Chroma stores the collection on disk.
persist_directory = "/Users/dardonacci/Documents/dardonacci/2-Code/metabrain/vector_stores/hummingbot/chroma/"

embedding = OpenAIEmbeddings()

# Open (or create) the collection first instead of calling
# Chroma.from_documents directly: from_documents appends on every call, so
# re-running this cell would store the same chunks again and request their
# embeddings a second time.
vectordb = Chroma(
    collection_name="hummingbot_scripts",
    embedding_function=embedding,
    persist_directory=persist_directory,
    collection_metadata={
        "Description": "This files are examples of the type of output that is requested when asking for writing scripts."
    },
)

# Only embed and insert when the collection is still empty. To rebuild from
# scratch, call vectordb.delete_collection() and run this cell again.
if vectordb._collection.count() == 0:
    vectordb.add_documents(hummingbot_codebase_docs)

In [5]:
# Show the underlying collection object held by the vector store wrapper.
# `_collection` is a private attribute; it is used here only for inspection.
vectordb._collection

Collection(name=hummingbot_scripts)

In [6]:
# Kept commented out on purpose: uncomment and run to drop the collection
# and start over.
# vectordb.delete_collection()

In [7]:
# Number of entries currently stored in the collection. A value larger than
# len(hummingbot_codebase_docs) means the collection also holds documents
# added by earlier runs.
vectordb._collection.count()

582

In [101]:
# Display the metadata stored on the collection (presumably the dict passed
# as collection_metadata when the collection was created).
vectordb._collection.metadata

{'Description': 'This files are examples of the type of output that is requested when asking for writing scripts.'}