## Loading Hummingbot

We use LangChain's `GenericLoader` to read the Python files from the `scripts` folder of the Hummingbot repository.
We also use a `LanguageParser` so that the files are parsed as Python code.

In [69]:
import os
import CONFIG

# Copy the key defined in the local CONFIG module into the process environment.
os.environ["OPENAI_API_KEY"] = CONFIG.OPENAI_API_KEY

In [70]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.document_loaders.parsers import LanguageParser
from langchain.document_loaders.generic import GenericLoader

# Location of the Hummingbot `scripts` folder to load.
# Set the HUMMINGBOT_SCRIPTS_PATH environment variable to point at your own
# checkout; the original author's local path is kept as the fallback so the
# notebook behaves exactly as before when the variable is not set.
# (`os` is imported in the first code cell of this notebook.)
repo_path = os.environ.get(
    "HUMMINGBOT_SCRIPTS_PATH",
    "/Users/dardonacci/Documents/work/hummingbot/scripts",
)

In [71]:
# Build a loader that matches every .py file under repo_path (recursive glob)
# and hands each one to a parser configured for Python.
python_parser = LanguageParser(language=Language.PYTHON)
loader = GenericLoader.from_filesystem(
    path=repo_path,
    glob="**/*.py",
    suffixes=[".py"],
    parser=python_parser,
)

In [72]:
# Run the loader and keep the resulting documents for the rest of the notebook.
hummingbot_codebase = loader.load()

In [73]:
# Number of documents returned by the loader.
len(hummingbot_codebase)

108

In [74]:
# Collect the distinct source paths and content types found in the loaded
# documents, then show how many different files they come from.
files = {document.metadata["source"] for document in hummingbot_codebase}
content_types = {document.metadata["content_type"] for document in hummingbot_codebase}
len(files)

52

In [75]:
# Distinct "content_type" metadata values collected in the previous cell.
content_types

{'functions_classes', 'simplified_code'}

In [76]:
# Inspect the type of the items returned by the loader.
type(hummingbot_codebase[0])

langchain.schema.document.Document

In [16]:
# pprint is reused by the later cells that display chunk contents.
from pprint import pprint

# Show the raw text of the first loaded document.
pprint(hummingbot_codebase[0].page_content)

('class MarketMakingDmanComposed(ScriptStrategyBase):\n'
 '    trading_pairs = ["XRP-USDT"]\n'
 '    position_side_by_trading_pair = {\n'
 '        "XRP-USDT": TradeType.SELL,\n'
 '    }\n'
 '    leverage = 10\n'
 '    triple_barrier_conf = TripleBarrierConf(\n'
 '        stop_loss=Decimal("0.01"), take_profit=Decimal("0.03"),\n'
 '        time_limit=60 * 60 * 6,\n'
 '        trailing_stop_activation_price_delta=Decimal("0.008"),\n'
 '        trailing_stop_trailing_delta=Decimal("0.004"),\n'
 '        open_order_type=OrderType.MARKET\n'
 '    )\n'
 '\n'
 '    order_levels = [\n'
 '        OrderLevel(level=0, side=TradeType.BUY, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '        OrderLevel(level=0, side=TradeType.SELL, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '   

In [52]:
# Metadata attached to the first document: source path, content type and language.
hummingbot_codebase[0].metadata

{'source': '/Users/dardonacci/Documents/work/hummingbot/scripts/carlito_tf_v1_multiple_pairs.py',
 'content_type': 'functions_classes',
 'language': <Language.PYTHON: 'python'>}

# Splitting one document

Now we are going to split a single document using a `RecursiveCharacterTextSplitter` configured for Python and compare the document before and after splitting.

In [53]:
# Baseline splitter settings used for the single-document and full-codebase splits.
chunk_size, chunk_overlap = 1000, 100

# Splitter created via from_language with Language.PYTHON.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [55]:
# Split only the first loaded document; it is wrapped in a list because
# split_documents is called with a collection of documents.
doc_1_splitted = python_splitter.split_documents([hummingbot_codebase[0]])

In [56]:
# Number of chunks produced from that single document.
len(doc_1_splitted)

4

In [57]:
# Chunk 1 of 4 from the first document.
pprint(doc_1_splitted[0].page_content)

('class MarketMakingDmanComposed(ScriptStrategyBase):\n'
 '    trading_pairs = ["XRP-USDT"]\n'
 '    position_side_by_trading_pair = {\n'
 '        "XRP-USDT": TradeType.SELL,\n'
 '    }\n'
 '    leverage = 10\n'
 '    triple_barrier_conf = TripleBarrierConf(\n'
 '        stop_loss=Decimal("0.01"), take_profit=Decimal("0.03"),\n'
 '        time_limit=60 * 60 * 6,\n'
 '        trailing_stop_activation_price_delta=Decimal("0.008"),\n'
 '        trailing_stop_trailing_delta=Decimal("0.004"),\n'
 '        open_order_type=OrderType.MARKET\n'
 '    )\n'
 '\n'
 '    order_levels = [\n'
 '        OrderLevel(level=0, side=TradeType.BUY, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '        OrderLevel(level=0, side=TradeType.SELL, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '   

In [58]:
# Chunk 2 of 4 from the first document.
pprint(doc_1_splitted[1].page_content)

('for trading_pair in trading_pairs:\n'
 '        config = CarlitoTFV1Config(\n'
 '            exchange="binance_perpetual",\n'
 '            trading_pair=trading_pair,\n'
 '            order_levels=order_levels,\n'
 '            candles_config=[\n'
 '                CandlesConfig(connector="binance_perpetual", '
 'trading_pair=trading_pair, interval="3m", max_records=100),\n'
 '            ],\n'
 '            leverage=leverage,\n'
 '            sma_fast=5,\n'
 '            sma_slow=10,\n'
 '            '
 'position_side_allowed=position_side_by_trading_pair[trading_pair],\n'
 '        )\n'
 '        controller = CarlitoTFV1(config=config)\n'
 '        markets = controller.update_strategy_markets_dict(markets)\n'
 '        controllers[trading_pair] = controller\n'
 '\n'
 '    def __init__(self, connectors: Dict[str, ConnectorBase]):\n'
 '        super().__init__(connectors)\n'
 '        for trading_pair, controller in self.controllers.items():\n'
 '            self.executor_handlers[tr

In [59]:
# Chunk 3 of 4 from the first document.
pprint(doc_1_splitted[2].page_content)

('def on_stop(self):\n'
 '        for executor_handler in self.executor_handlers.values():\n'
 '            executor_handler.stop()\n'
 '\n'
 '    def on_tick(self):\n'
 '        """\n'
 '        This shows you how you can start meta controllers. You can run more '
 'than one at the same time and based on the\n'
 '        market conditions, you can orchestrate from this script when to stop '
 'or start them.\n'
 '        """\n'
 '        for executor_handler in self.executor_handlers.values():\n'
 '            if executor_handler.status == '
 'ExecutorHandlerStatus.NOT_STARTED:\n'
 '                executor_handler.start()')


In [62]:
# Chunk 4 of 4 from the first document.
pprint(doc_1_splitted[3].page_content)

('def format_status(self) -> str:\n'
 '        if not self.ready_to_trade:\n'
 '            return "Market connectors are not ready."\n'
 '        lines = []\n'
 '        for trading_pair, executor_handler in '
 'self.executor_handlers.items():\n'
 '            if executor_handler.controller.all_candles_ready:\n'
 '                lines.extend(\n'
 '                    [f"Strategy: '
 '{executor_handler.controller.config.strategy_name} | Trading Pair: '
 '{trading_pair}",\n'
 '                     executor_handler.to_format_status()])\n'
 '        return "\\n".join(lines)')


# Splitting all the documents

We are going to apply the splitter to all the documents and see how many chunks we get.

In [28]:
# Split every loaded document with the same python_splitter used for the
# single-document example.
all_documents = python_splitter.split_documents(hummingbot_codebase)

In [29]:
# Total number of chunks across the whole codebase.
len(all_documents)

520

In [30]:
# First chunk of the full split.
pprint(all_documents[0].page_content)

('class MarketMakingDmanComposed(ScriptStrategyBase):\n'
 '    trading_pairs = ["XRP-USDT"]\n'
 '    position_side_by_trading_pair = {\n'
 '        "XRP-USDT": TradeType.SELL,\n'
 '    }\n'
 '    leverage = 10\n'
 '    triple_barrier_conf = TripleBarrierConf(\n'
 '        stop_loss=Decimal("0.01"), take_profit=Decimal("0.03"),\n'
 '        time_limit=60 * 60 * 6,\n'
 '        trailing_stop_activation_price_delta=Decimal("0.008"),\n'
 '        trailing_stop_trailing_delta=Decimal("0.004"),\n'
 '        open_order_type=OrderType.MARKET\n'
 '    )\n'
 '\n'
 '    order_levels = [\n'
 '        OrderLevel(level=0, side=TradeType.BUY, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '        OrderLevel(level=0, side=TradeType.SELL, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '   

In [31]:
# Second chunk of the full split.
pprint(all_documents[1].page_content)

('for trading_pair in trading_pairs:\n'
 '        config = CarlitoTFV1Config(\n'
 '            exchange="binance_perpetual",\n'
 '            trading_pair=trading_pair,\n'
 '            order_levels=order_levels,\n'
 '            candles_config=[\n'
 '                CandlesConfig(connector="binance_perpetual", '
 'trading_pair=trading_pair, interval="3m", max_records=100),\n'
 '            ],\n'
 '            leverage=leverage,\n'
 '            sma_fast=5,\n'
 '            sma_slow=10,\n'
 '            '
 'position_side_allowed=position_side_by_trading_pair[trading_pair],\n'
 '        )\n'
 '        controller = CarlitoTFV1(config=config)\n'
 '        markets = controller.update_strategy_markets_dict(markets)\n'
 '        controllers[trading_pair] = controller\n'
 '\n'
 '    def __init__(self, connectors: Dict[str, ConnectorBase]):\n'
 '        super().__init__(connectors)\n'
 '        for trading_pair, controller in self.controllers.items():\n'
 '            self.executor_handlers[tr

In [32]:
# Third chunk of the full split.
pprint(all_documents[2].page_content)

('def on_stop(self):\n'
 '        for executor_handler in self.executor_handlers.values():\n'
 '            executor_handler.stop()\n'
 '\n'
 '    def on_tick(self):\n'
 '        """\n'
 '        This shows you how you can start meta controllers. You can run more '
 'than one at the same time and based on the\n'
 '        market conditions, you can orchestrate from this script when to stop '
 'or start them.\n'
 '        """\n'
 '        for executor_handler in self.executor_handlers.values():\n'
 '            if executor_handler.status == '
 'ExecutorHandlerStatus.NOT_STARTED:\n'
 '                executor_handler.start()')


In [33]:
# Fourth chunk of the full split.
pprint(all_documents[3].page_content)

('def format_status(self) -> str:\n'
 '        if not self.ready_to_trade:\n'
 '            return "Market connectors are not ready."\n'
 '        lines = []\n'
 '        for trading_pair, executor_handler in '
 'self.executor_handlers.items():\n'
 '            if executor_handler.controller.all_candles_ready:\n'
 '                lines.extend(\n'
 '                    [f"Strategy: '
 '{executor_handler.controller.config.strategy_name} | Trading Pair: '
 '{trading_pair}",\n'
 '                     executor_handler.to_format_status()])\n'
 '        return "\\n".join(lines)')


# Testing different values to split the code

## Same Chunk Size and different overlap

In [85]:
# Three Python splitters that share chunk_size=1000 and differ only in overlap
# (50, 100 and 300 characters respectively).
splitter_1, splitter_2, splitter_3 = (
    RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=1000, chunk_overlap=overlap
    )
    for overlap in (50, 100, 300)
)

In [86]:
# Split the whole codebase once per splitter, keeping the results in the same order.
all_documents_1, all_documents_2, all_documents_3 = (
    splitter.split_documents(hummingbot_codebase)
    for splitter in (splitter_1, splitter_2, splitter_3)
)

In [87]:
# Report how many chunks each overlap setting produced, one splitter per line.
print(f"Splitter 1: {len(all_documents_1)}"
      f"\nSplitter 2: {len(all_documents_2)}"
      f"\nSplitter 3: {len(all_documents_3)}")

Splitter 1: 519
Splitter 2: 520
Splitter 3: 541


## Same Overlap and different chunk size

In [89]:
# Three Python splitters that share chunk_overlap=200 and differ only in chunk
# size (1000, 2000 and 5000 characters respectively).
# NOTE: this rebinds splitter_1..3 from the overlap experiment.
splitter_1, splitter_2, splitter_3 = (
    RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=size, chunk_overlap=200
    )
    for size in (1000, 2000, 5000)
)

In [91]:
# Split the whole codebase once per splitter, keeping the results in the same order.
# NOTE: this rebinds all_documents_1..3 from the overlap experiment.
all_documents_1, all_documents_2, all_documents_3 = (
    splitter.split_documents(hummingbot_codebase)
    for splitter in (splitter_1, splitter_2, splitter_3)
)

In [92]:
# Report how many chunks each chunk-size setting produced, one splitter per line.
print(f"Splitter 1: {len(all_documents_1)}"
      f"\nSplitter 2: {len(all_documents_2)}"
      f"\nSplitter 3: {len(all_documents_3)}")

Splitter 1: 530
Splitter 2: 277
Splitter 3: 150


# Conclusion

We can see that the loader with the Python parser is working as expected: for each file we get a simplified version of the code, and the functions and classes are separated from the rest of the code.
Also, we can see that the splitter is working as expected: we can split the code in different ways, and the number of documents generated changes accordingly.

Techniques conclusion:
* The original load generates 108 documents (from 52 files).
* The splitter with chunk_size=1000 and chunk_overlap=100 generates 520 documents.
* Changing the overlap affects the number of documents much less than changing the chunk size.
* With a chunk size of 2000 we get 277 documents, and I think they are well distributed.

In [96]:
# First chunk from all_documents_2. That name was last assigned in the
# chunk-size experiment (splitter_2: chunk_size=2000, chunk_overlap=200),
# assuming the cells were run from top to bottom.
pprint(all_documents_2[0].page_content)

('class MarketMakingDmanComposed(ScriptStrategyBase):\n'
 '    trading_pairs = ["XRP-USDT"]\n'
 '    position_side_by_trading_pair = {\n'
 '        "XRP-USDT": TradeType.SELL,\n'
 '    }\n'
 '    leverage = 10\n'
 '    triple_barrier_conf = TripleBarrierConf(\n'
 '        stop_loss=Decimal("0.01"), take_profit=Decimal("0.03"),\n'
 '        time_limit=60 * 60 * 6,\n'
 '        trailing_stop_activation_price_delta=Decimal("0.008"),\n'
 '        trailing_stop_trailing_delta=Decimal("0.004"),\n'
 '        open_order_type=OrderType.MARKET\n'
 '    )\n'
 '\n'
 '    order_levels = [\n'
 '        OrderLevel(level=0, side=TradeType.BUY, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '        OrderLevel(level=0, side=TradeType.SELL, '
 'order_amount_usd=Decimal("20"), order_refresh_time=60 * 5,\n'
 '                   cooldown_time=15, '
 'triple_barrier_conf=triple_barrier_conf),\n'
 '   