# Split loaded documents using Markdown header and recursive character text splitters

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os

from IPython.display import display, Markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import ModelTextSplitter

In [None]:
# Configuration for this run: where to read documents from, how to chunk
# them, and where to write the resulting splits.
input_path = '../data/load/dc_places/2023-11-02.jsonl'
chunk_size = 1000
chunk_overlap = 100
output_dir = '../data/split/dc_places/'
# exist_ok=True creates the directory (and any missing parents) when needed
# and is a no-op when it already exists, avoiding a separate existence check
# that could race with another process creating the same directory.
os.makedirs(output_dir, exist_ok=True)
# Run date in YYYY-MM-DD form; used below to name the output file.
today = datetime.today().strftime('%Y-%m-%d')
# Chunk sizes are measured with this function (plain character count).
length_function = len

## Load documents

In [None]:
# Read the previously saved documents from the JSONL file at input_path.
docs = load_docs_from_jsonl(input_path)
# Notebook display: how many documents were loaded.
len(docs)

## Create splits

In [None]:
# Recursive character splitter configured for Markdown input, using the
# chunking parameters defined in the configuration cell.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    length_function=length_function,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.schema.document import Document

# Split only on level-2 headings; the heading text of each section is stored
# in that section's metadata under the "Header 2" key.
headers_to_split_on = [
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Build one Document per "##" section of every loaded document. Each split
# gets its own copy of the parent metadata so the title change below does not
# leak into the source document or into sibling sections.
splits = []
for doc in docs:
    sections = markdown_splitter.split_text(doc.page_content)
    for section in sections:
        metadata = doc.metadata.copy()
        header = section.metadata.get("Header 2")
        if header is None:
            # Text that precedes the first "##" heading carries no "Header 2"
            # metadata; keep it under the parent title with its content
            # unchanged instead of failing with a KeyError.
            page_content = section.page_content
        else:
            metadata['title'] += " / " + header
            # Re-attach the heading line to the section body so each split
            # stays self-describing.
            page_content = "## " + header + "\n\n" + section.page_content
        splits.append(Document(
            metadata=metadata,
            page_content=page_content))

In [None]:
# Run the header-level sections through text_splitter and rebind `splits`
# to the resulting list of chunks.
splits = text_splitter.split_documents(splits)
# Notebook display: number of chunks after this second splitting pass.
len(splits)

In [None]:
# Eyeball the first ten chunks: index and metadata on one line, then the
# chunk text, then a loud separator so chunk boundaries are easy to spot.
for ix, split in enumerate(splits[:10]):
    print(f"{ix} {split.metadata}")
    print(split.page_content, "\n!!! SPLIT !!!\n", sep="\n")

## Save splits 

In [None]:
# Write the chunks to <output_dir>/<today>.jsonl, one file per run date.
filename = os.path.join(output_dir, today + ".jsonl")
save_docs_to_jsonl(splits, filename)