# Split loaded documents using a Markdown-aware recursive character splitter

In [None]:
# Enable the IPython autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs (no kernel restart needed).
%load_ext autoreload
%autoreload 2
# Load the dotenv extension and run it; presumably this reads a local .env
# file into the environment -- confirm where the file is expected to live.
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os

from IPython.display import display, Markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import ModelTextSplitter

In [None]:
# configure
# Input: JSONL file of loaded documents. Output: directory that receives the splits.
input_path = '../data/load/dc_people/2023-10-26.jsonl'
output_dir = '../data/split/dc_people/'

# Splitter settings: chunk size and overlap are measured with length_function
# (plain len, i.e. characters for string content).
chunk_size = 2000
chunk_overlap = 200
length_function = len

# Run date (local time, YYYY-MM-DD); used to name the output file.
today = datetime.today().strftime('%Y-%m-%d')

# Create the output directory (and any missing parents). exist_ok=True replaces
# the check-then-create pattern, which can race with another process and fail.
os.makedirs(output_dir, exist_ok=True)

## Load documents

In [None]:
# Load the documents from the JSONL file configured in input_path
# (load_docs_from_jsonl is a project helper from models.load_utils).
docs = load_docs_from_jsonl(input_path)
# Trailing expression: the notebook displays the number of loaded documents.
len(docs)

## Create splits

In [None]:
# Build a recursive character splitter configured for Language.MARKDOWN,
# using the chunk settings from the configuration cell.
splitter_options = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": length_function,
}
text_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.MARKDOWN, **splitter_options
)

# Split every loaded document; the trailing expression shows the chunk count.
splits = text_splitter.split_documents(docs)
len(splits)

In [None]:
# Eyeball the first ten chunks: index and source URL, then the chunk text
# followed by a visible separator line.
for ix, split in enumerate(splits[:10]):
    print(ix, split.metadata["url"])
    # One print with sep="\n" emits the same bytes as two consecutive prints.
    print(split.page_content, "\n!!! SPLIT !!!\n", sep="\n")

## Save splits 

In [None]:
# Write the splits to <output_dir>/<run date>.jsonl using the project helper.
dated_name = f"{today}.jsonl"
filename = os.path.join(output_dir, dated_name)
save_docs_to_jsonl(splits, filename)