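# Split loaded documents

Load previously saved documents from a JSONL file, split them into chunks with `RecursiveMarkdownTextSplitter`, and save the resulting splits to a dated JSONL file.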

In [9]:
# Enable IPython's autoreload so edits to imported project modules are picked up
# without restarting the kernel. Re-running this cell in a live kernel prints an
# "already loaded" notice (see the output below), which is harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from datetime import datetime
import os

from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_markdown import RecursiveMarkdownTextSplitter

In [11]:
# Configuration: every value a reader might tune for this run.

# Input: JSONL file of already-loaded documents (a dated snapshot; change the
# path to split a different load).
input_path = '../data/load/evidence_central/2023-11-14.jsonl'

# Splitter settings handed to RecursiveMarkdownTextSplitter further down.
# NOTE(review): the unit (characters vs. tokens) is defined by the splitter — confirm.
chunk_size = 2000
chunk_overlap = 200

# Output: the splits are saved to <output_dir>/<today>.jsonl at the end of the notebook.
output_dir = '../data/split/evidence_central/'
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True makes directory creation idempotent and avoids the race in a
# separate "check, then create" step.
os.makedirs(output_dir, exist_ok=True)

## Load documents

In [12]:
# Read the documents from the configured input JSONL file.
docs = load_docs_from_jsonl(input_path)
# Display the number of loaded documents as a quick sanity check.
len(docs)

5

## Create splits

In [13]:
# Build the splitter from the chunk size and overlap set in the configuration cell.
# NOTE(review): title_header_separator presumably joins the document title with
# markdown header text in each split — confirm in models.split_markdown.
text_splitter = RecursiveMarkdownTextSplitter(
    title_header_separator=" / ",
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap,
)

In [14]:
# Split every loaded document; verbose=True produces the progress bar seen in
# the cell output.
splits = text_splitter.split_documents(docs, verbose=True)
# Display the total number of splits produced across all documents.
len(splits)

100%|██████████| 5/5 [00:00<00:00, 924.79it/s]


52

In [None]:
# Eyeball the first ten splits: index, source URL and title on one line,
# followed by the split text and a loud separator between entries.
preview = splits[:10]
for position, chunk in enumerate(preview):
    meta = chunk.metadata
    print(position, meta["url"], meta["title"])
    print(chunk.page_content)
    print("\n!!! SPLIT !!!\n")

## Save splits

In [16]:
# Write all splits to a JSONL file named after today's date inside output_dir.
# NOTE(review): re-running on the same day targets the same file; whether it is
# overwritten or appended to depends on save_docs_to_jsonl — confirm.
filename = os.path.join(output_dir, f"{today}.jsonl")
save_docs_to_jsonl(splits, filename)