In [None]:
import json
import os
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load documents

In [None]:
# Locate the processed-data directory. ROOT is two levels above the current
# working directory, so this cell depends on where the kernel was started
# (presumably the notebook's own folder, two levels below the repo root -- confirm).
ROOT = Path.cwd().parent.parent.resolve()
processed_data_path = ROOT / "data" / "processed"

all_documents_filename = processed_data_path / "all_documents.jsonl"

# Parse the JSON Lines file: one JSON value per line. Whitespace-only lines
# (e.g. a trailing blank line) are skipped, because json.loads would raise
# JSONDecodeError on them.
with open(all_documents_filename, "r", encoding="utf-8") as f:
    all_documents = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(all_documents)} documents.")

In [None]:
# Spot-check one loaded record. The index is clamped to the last element so the
# cell still displays a record when fewer than 2901 documents were loaded, and
# it evaluates to None (no output) when the list is empty.
all_documents[min(2900, len(all_documents) - 1)] if all_documents else None

### Split data

In [None]:
# Splitter settings, named so they are easy to find and tune.
# Lengths are measured with the built-in len(), as passed via length_function.
CHUNK_SIZE = 1000     # value passed as chunk_size
CHUNK_OVERLAP = 200   # value passed as chunk_overlap

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

In [None]:
# Split every document into chunks and gather them in one flat list.
# Each document is passed on its own, with its metadata supplied alongside its text.
all_chunks = []
for document in all_documents:
    all_chunks.extend(
        text_splitter.create_documents(
            [document["text"]],
            metadatas=[document["metadata"]],
        )
    )

print(f"Split {len(all_documents)} documents into {len(all_chunks)} chunks.")


In [None]:
# Preview one chunk to sanity-check the split. The index is clamped to the last
# element so the cell also works when fewer than 1001 chunks were produced.
if all_chunks:
    sample_chunk = all_chunks[min(1000, len(all_chunks) - 1)]
    # Single-backslash "\n" yields a real line break; an escaped "\\n" would
    # print the two characters backslash + n instead.
    print(f"Content: \n{sample_chunk.page_content}")
    print(f"\nMetadata: \n{sample_chunk.metadata}")
else:
    print("No chunks available to preview.")

### Save file

In [None]:
# Reduce each chunk to a plain dict holding its text and metadata so that it
# can be passed to json.dumps.
serializable_chunks = [
    {"page_content": piece.page_content, "metadata": piece.metadata}
    for piece in all_chunks
]

# Write the chunks as JSON Lines: one JSON object per line.
chunks_file_path = processed_data_path / "all_chunks.jsonl"
with open(chunks_file_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in serializable_chunks)

print(f"Successfully saved {len(serializable_chunks)} chunks to {chunks_file_path}")