# Split loaded documents using a trained splitter model

In [None]:
# Notebook setup via IPython line magics (these lines are not plain Python).
# Load the autoreload extension and switch it to mode 2 -- presumably so that
# edits to the local `models` package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Load and run the dotenv extension -- presumably this reads a .env file into
# the process environment (a later cell reads OPENAI_API_KEY from os.environ).
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os

from IPython.display import display, Markdown
import torch

from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import ModelTextSplitter, MarkdownSyntacticEmbeddingSplitter
from models.split_model_train import get_openai_embedder

In [None]:
# Run configuration: input/output locations and splitter settings.

# Run date (YYYY-MM-DD); used below to name the output file.
today = f"{datetime.today():%Y-%m-%d}"

# Locations
input_path = "../data/load/output/2023-09-23.jsonl"
split_model_path = "../data/split/model/2023-09-24.pkl"
output_dir = "../data/split/output"

# Splitter settings
# NOTE(review): split_model_path, split_threshold and chunk_size are not
# referenced by the cells visible in this section of the notebook; the
# splitter cells pass their own literal thresholds -- confirm whether these
# values are still needed.
split_threshold = 0.55
chunk_size = 500
anchor = "anchor"

In [None]:
# Release cached CUDA memory and print GPU diagnostics.
torch.cuda.empty_cache()
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Device 0 can only be queried when CUDA is usable; without this guard
    # the cell raises on CPU-only machines right after printing False.
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

## Load documents

In [None]:
# Load the documents with load_docs_from_jsonl from the path configured as
# `input_path`.
docs = load_docs_from_jsonl(input_path)
# Trailing expression: the notebook displays len(docs) as the cell output.
len(docs)

## Create splits

In [None]:
%%time
mse_splitter = MarkdownSyntacticEmbeddingSplitter(split_threshold=0.83, max_chars=2000, anchor=anchor)
splits = mse_splitter.split_documents(docs, verbose=True)
len(splits)

In [None]:
%%time
import openai

openai.api_key = os.environ["OPENAI_API_KEY"]
openai_embedder = get_openai_embedder(openai)

mse_splitter = MarkdownSyntacticEmbeddingSplitter(embedder=openai_embedder, split_threshold=0.80, max_chars=2000, anchor=anchor)
splits = mse_splitter.split_documents(docs, verbose=True)
len(splits)

## Save splits 

In [None]:
# Write the current `splits` to <output_dir>/<today>.jsonl.
# Create the output directory first so the save does not depend on the
# directory already existing (e.g. on a fresh checkout).
os.makedirs(output_dir, exist_ok=True)
filename = os.path.join(output_dir, f"{today}.jsonl")
save_docs_to_jsonl(splits, filename)