# Split loaded documents using trained splitter model

In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Load and run the dotenv extension so environment variables (e.g. the OPENAI_API_KEY
# read further down) are available — presumably sourced from a local .env file.
%load_ext dotenv
%dotenv

In [2]:
from datetime import datetime
import os
import sys

from IPython.display import display, Markdown
import torch

from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import ModelTextSplitter, MarkdownSyntacticEmbeddingSplitter
from models.split_utils import get_openai_embedder

In [3]:
# Show the files in the splitter-model directory, in alphabetical order,
# so one of them can be picked as split_model_path.
split_path = "/home/public/iloveconference/split/model"
files_in_new = sorted(os.listdir(split_path))
for model_file in files_in_new:
    print(model_file)

2023-04-11.pkl
2023-04-12.pkl
2023-09-24.pkl


In [4]:
# Configuration: every path this notebook reads from or writes to.
# NOTE(review): input_path sits inside output_dir and is date-named like the output
# file — confirm the input should not come from the load directory instead.
input_path = '../data/split/conference/2025-09-16.jsonl'
split_model_path = '/home/public/iloveconference/split/model/2023-09-24.pkl'
output_dir = '../data/split/conference'
# Date stamp (YYYY-MM-DD) used to name the output file.
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True makes the cell safely re-runnable and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Release cached GPU memory and report the device state before the heavy cells run.
torch.cuda.empty_cache()
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # The device-0 queries raise when no CUDA device is present, so they are only
    # issued once CUDA has been confirmed to be usable.
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

True
cuda total 8139374592
cuda reserved 0
cuda allocated 0


## Load documents

In [7]:
# Read the input documents from the JSONL file configured in input_path.
docs = load_docs_from_jsonl(input_path)
# Bare last expression: the notebook displays how many documents were loaded.
len(docs)

136

## Create splits

In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
# deprecated: splitter driven by the trained model stored at split_model_path.
# NOTE(review): split_model_path is a .pkl file, presumably unpickled by
# ModelTextSplitter — only point it at files from a trusted source.
text_splitter = ModelTextSplitter(
    split_model_path,
    split_threshold=0.55,
    chunk_size=500,
    anchor="anchor",
)
# `splits` is reassigned by the later splitter cells when they are run as well.
splits = text_splitter.split_documents(docs, verbose=True)
len(splits)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


  0%|          | 0/136 [00:00<?, ?it/s]

1146

In [11]:
# Splitter backed by OpenAI embeddings: 7x faster, but slightly lower quality.
# NOTE(review): the baseline of that comparison is not stated — presumably ModelTextSplitter.
import openai

# The API key is taken from the environment; a missing key fails here with a KeyError.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai_embedder = get_openai_embedder(openai)

mse_splitter = MarkdownSyntacticEmbeddingSplitter(
    embedder=openai_embedder,
    split_threshold=0.80,
)
splits = mse_splitter.split_documents(docs, verbose=True)
len(splits)

  0%|          | 0/136 [00:00<?, ?it/s]

1087

In [12]:
%%time
# Build the splitter with its default constructor arguments (no explicit embedder or
# threshold) and split again. This reassigns `splits`, so on a top-to-bottom run this
# is the result the save cell writes out.
mse_splitter = MarkdownSyntacticEmbeddingSplitter()
splits = mse_splitter.split_documents(docs, verbose=True)
# Bare last expression: the notebook displays the number of splits produced.
len(splits)

  0%|          | 0/136 [00:00<?, ?it/s]

CPU times: user 7.39 s, sys: 304 ms, total: 7.7 s
Wall time: 3min 36s


1149

## Save splits

In [13]:
# Write the splits to a date-stamped JSONL file inside output_dir.
filename = os.path.join(output_dir, f"{today}.jsonl")
# input_path lives in the same directory as the output and is date-named as well, so on
# a matching date the save would silently overwrite the file the documents came from.
if os.path.realpath(filename) == os.path.realpath(input_path):
    raise ValueError(f"refusing to overwrite input file {input_path!r} with the splits output")
save_docs_to_jsonl(splits, filename)

In [14]:
# Summary: number of input documents followed by the number of splits produced.
print(f"{len(docs)} {len(splits)}")

136 1149
