In [None]:
# Install the third-party packages used by this notebook into the kernel's
# environment (the %pip magic targets the running kernel's interpreter).
# NOTE(review): versions are unpinned, so a fresh install may pull different
# releases over time. `transformers` is imported below but not listed here;
# presumably it is installed as a dependency of trl -- confirm.
%pip install trl pydantic datasets peft bitsandbytes

In [7]:
from transformers import AutoTokenizer
from llm_training import chunk_texts

# Load the OLMo 2 tokenizer from the Hugging Face Hub.
# `trust_remote_code=True` is intentionally not passed: this checkpoint's
# tokenizer loads with the classes built into transformers, and enabling the
# flag would allow Python code shipped in the Hub repository to run locally.
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-7B")

# Three short documents used as input for the chunking demo.
sample_texts = [
    "This is the first document. It contains some text that we want to chunk based on token length. Let's see how the chunking function works with this content.",
    "Here's a second document with different content. This one is also going to be processed by our chunking function to demonstrate how it handles multiple texts.",
    "A third and final document to complete our test. This will help us verify that the function properly handles a list of texts and chunks them appropriately."
]

# Use a deliberately small context length so that each document has to be
# split into more than one chunk.
context_length = 20
# chunk_texts returns a pair: the chunk strings and a token count.
# NOTE(review): the exact meaning of `total_tokens` (e.g. whether it includes
# end-of-text markers) is defined in llm_training -- confirm there.
all_chunks, total_tokens = chunk_texts(sample_texts, tokenizer, context_length)

# Summary of the run.
print(f"Original texts: {len(sample_texts)}")
print(f"Total tokens: {total_tokens}")
print(f"Context length: {context_length}")
print(f"Generated chunks: {len(all_chunks)}")
print("\nChunks:")
for i, chunk in enumerate(all_chunks):
    # Re-tokenize the chunk text to report its length; add_special_tokens=False
    # keeps the tokenizer from adding tokens that are not in the chunk itself.
    chunk_tokens = len(tokenizer(chunk, add_special_tokens=False)["input_ids"])
    print(f"Chunk {i+1} ({chunk_tokens} tokens): {chunk}")


Original texts: 3
Total tokens: 95
Context length: 20
Generated chunks: 6

Chunks:
Chunk 1 (20 tokens): This is the first document. It contains some text that we want to chunk based on token length.
Chunk 2 (14 tokens):  Let's see how the chunking function works with this content.<|endoftext|>
Chunk 3 (20 tokens): Here's a second document with different content. This one is also going to be processed by our chunk
Chunk 4 (11 tokens): ing function to demonstrate how it handles multiple texts.<|endoftext|>
Chunk 5 (20 tokens): A third and final document to complete our test. This will help us verify that the function properly handles
Chunk 6 (10 tokens):  a list of texts and chunks them appropriately.<|endoftext|>
