# Split loaded documents

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `models` package) are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import datetime
import os

from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_markdown import RecursiveMarkdownTextSplitter

In [None]:
# configure
# JSONL file containing the previously loaded documents to split.
input_path = '../data/load/fair/2023-11-15.jsonl'
# Chunking settings handed to the text splitter
# (presumably measured in characters -- confirm against the splitter).
chunk_size = 2000
chunk_overlap = 200
# Directory that receives the split documents. Create it, including any
# missing parents; exist_ok avoids the check-then-create race of testing
# os.path.exists() first.
output_dir = '../data/split/fair/'
os.makedirs(output_dir, exist_ok=True)
# Local date stamp (YYYY-MM-DD) used to name the output file.
today = datetime.today().strftime('%Y-%m-%d')

## Load documents

In [None]:
# Read the documents from the configured JSONL input file; the trailing
# expression shows how many were loaded as the cell output.
docs = load_docs_from_jsonl(input_path)
len(docs)

## Create splits

In [None]:
# Build the markdown splitter from the chunking settings configured earlier.
# NOTE(review): the first keyword was previously spelled
# `title_header_sbeparator`; corrected on the assumption that the splitter's
# parameter is `title_header_separator` -- confirm against
# models.split_markdown.
text_splitter = RecursiveMarkdownTextSplitter(
    title_header_separator=" / ",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
# Split every loaded document with the configured splitter; the trailing
# expression shows the number of resulting splits as the cell output.
splits = text_splitter.split_documents(docs, verbose=True)
len(splits)

In [None]:
# Preview the first ten splits: index and metadata on one line, then the
# split's text, then a divider so consecutive splits are easy to tell apart.
for ix, split in enumerate(splits[:10]):
    print(
        f"{ix!s} {split.metadata!s}",
        split.page_content,
        "\n!!! SPLIT !!!\n",
        sep="\n",
    )

## Save splits 

In [None]:
# Save all splits to a JSONL file named after today's date inside output_dir.
# NOTE(review): a second run on the same day targets the same path -- confirm
# whether save_docs_to_jsonl overwrites or appends.
filename = os.path.join(output_dir, f"{today}.jsonl")
save_docs_to_jsonl(splits, filename)