feat: new splitByTokens function

johannschopplich · johannschopplich · commit b9db65eb74f3 · 2025-10-16T08:07:37.000+02:00
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ yarn add tokenx
 ## Usage
 
 ```ts
-import { estimateTokenCount, isWithinTokenLimit } from 'tokenx'
+import { estimateTokenCount, isWithinTokenLimit, splitByTokens } from 'tokenx'
 
 const text = 'Your text goes here.'
 
@@ -61,6 +61,10 @@ const tokenLimit = 1024
 const withinLimit = isWithinTokenLimit(text, tokenLimit)
 console.log(`Is within token limit: ${withinLimit}`)
 
+// Split text into token-based chunks
+const chunks = splitByTokens(text, 100)
+console.log(`Split into ${chunks.length} chunks`)
+
 // Use custom options for different languages or models
 const customOptions = {
   defaultCharsPerToken: 4, // More conservative estimation
@@ -183,6 +187,54 @@ function sliceByTokens(
 
 The sliced text portion corresponding to the specified token range.
 
+### `splitByTokens`
+
+Splits text into chunks based on token count. Useful for chunking documents for RAG, batch processing, or staying within context windows.
+
+**Usage:**
+
+```ts
+const text = 'Long text that needs to be split into smaller chunks...'
+
+// Basic splitting
+const chunks = splitByTokens(text, 100)
+console.log(`Split into ${chunks.length} chunks`)
+
+// With overlap for semantic continuity
+const overlappedChunks = splitByTokens(text, 100, { overlap: 10 })
+
+// With custom options
+const customChunks = splitByTokens(text, 50, {
+  defaultCharsPerToken: 4,
+  overlap: 5
+})
+```
+
+**Type Declaration:**
+
+```ts
+interface SplitByTokensOptions extends TokenEstimationOptions {
+  /** Number of tokens to overlap between consecutive chunks (default: 0) */
+  overlap?: number
+}
+
+function splitByTokens(
+  text: string,
+  tokensPerChunk: number,
+  options?: SplitByTokensOptions
+): string[]
+```
+
+**Parameters:**
+
+- `text` - The input text to split
+- `tokensPerChunk` - Target number of estimated tokens per chunk; a chunk is closed once its count reaches this value, so it may slightly exceed it
+- `options` - Token estimation options with optional overlap
+
+**Returns:**
+
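+An array of text chunks, each containing approximately `tokensPerChunk` tokens (the final chunk may be shorter). Returns an empty array if `text` is empty or `tokensPerChunk` is zero or negative.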
+
 ## License
 
 [MIT](./LICENSE) License © 2023-PRESENT [Johann Schopplich](https://github.com/johannschopplich)
diff --git a/src/index.ts b/src/index.ts
@@ -1,4 +1,4 @@
-import type { LanguageConfig, TokenEstimationOptions } from './types'
+import type { LanguageConfig, SplitByTokensOptions, TokenEstimationOptions } from './types'
 
 export * from './types'
 
@@ -109,6 +109,66 @@ export function sliceByTokens(
   return parts.join('')
 }
 
+/**
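+ * Splits text into consecutive chunks, closing each chunk once its estimated token count reaches `tokensPerChunk`; `options.overlap` repeats trailing segments of a chunk at the start of the next one.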
+ */
+export function splitByTokens(
+  text: string,
+  tokensPerChunk: number,
+  options: SplitByTokensOptions = {},
+): string[] {
+  if (!text || tokensPerChunk <= 0)
+    return []
+
+  const {
+    defaultCharsPerToken = DEFAULT_CHARS_PER_TOKEN,
+    languageConfigs = DEFAULT_LANGUAGE_CONFIGS,
+    overlap = 0,
+  } = options
+
+  const segments = text.split(TOKEN_SPLIT_PATTERN).filter(Boolean)
+  const chunks: string[] = []
+  let currentChunk: string[] = []
+  let currentTokenCount = 0
+
+  for (const segment of segments) {
+    const tokenCount = estimateSegmentTokens(segment, languageConfigs, defaultCharsPerToken)
+
+    currentChunk.push(segment)
+    currentTokenCount += tokenCount
+
+    if (currentTokenCount >= tokensPerChunk) {
+      chunks.push(currentChunk.join(''))
+
+      // Calculate overlap for next chunk
+      if (overlap > 0) {
+        const overlapSegments: string[] = []
+        let overlapTokenCount = 0
+
+        for (let i = currentChunk.length - 1; i >= 0 && overlapTokenCount < overlap; i--) {
+          const segmentValue = currentChunk[i]!
+          const tokCount = estimateSegmentTokens(segmentValue, languageConfigs, defaultCharsPerToken)
+          overlapSegments.unshift(segmentValue)
+          overlapTokenCount += tokCount
+        }
+
+        currentChunk = overlapSegments
+        currentTokenCount = overlapTokenCount
+      }
+      else {
+        currentChunk = []
+        currentTokenCount = 0
+      }
+    }
+  }
+
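+  // Add remaining content as last chunk (NOTE: with overlap > 0 this may consist solely of overlap segments carried over from the previous chunk)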
+  if (currentChunk.length > 0)
+    chunks.push(currentChunk.join(''))
+
+  return chunks
+}
+
 function estimateSegmentTokens(
   segment: string,
   languageConfigs: LanguageConfig[],
diff --git a/src/types.ts b/src/types.ts
@@ -17,3 +17,11 @@ export interface LanguageConfig {
   /** Average number of characters per token for this language */
   averageCharsPerToken: number
 }
+
+/**
+ * Configuration options for splitting text by tokens
+ */
+export interface SplitByTokensOptions extends TokenEstimationOptions {
+  /** Number of tokens to overlap between consecutive chunks (default: 0) */
+  overlap?: number
+}
diff --git a/test/index.test.ts b/test/index.test.ts
@@ -6,6 +6,7 @@ import {
   estimateTokenCount,
   isWithinTokenLimit,
   sliceByTokens,
+  splitByTokens,
 } from '../src/index'
 
 const fixturesDir = fileURLToPath(new URL('fixtures', import.meta.url))
@@ -139,4 +140,36 @@ describe('token-related functions', () => {
       expect(sliceByTokens(GERMAN_TEXT, -1000)).toBe(GERMAN_TEXT)
     })
   })
+
+  describe('splitByTokens', () => {
+    it('should split text into chunks', () => {
+      const chunks = splitByTokens(ENGLISH_TEXT, 5)
+      expect(chunks.length).toBeGreaterThan(1)
+      expect(chunks.join('')).toBe(ENGLISH_TEXT)
+    })
+
+    it('should handle overlap between chunks', () => {
+      const chunksNoOverlap = splitByTokens(ENGLISH_TEXT, 5)
+      const chunksWithOverlap = splitByTokens(ENGLISH_TEXT, 5, { overlap: 2 })
+
+      expect(chunksWithOverlap.length).toBeGreaterThanOrEqual(chunksNoOverlap.length)
+
+      // With overlap, total character count across chunks should be at least as large as without overlap
+      const totalWithOverlap = chunksWithOverlap.join('').length
+      const totalNoOverlap = chunksNoOverlap.join('').length
+      expect(totalWithOverlap).toBeGreaterThanOrEqual(totalNoOverlap)
+    })
+
+    it('should return empty array for empty input', () => {
+      expect(splitByTokens('', 5)).toEqual([])
+      expect(splitByTokens('text', 0)).toEqual([])
+      expect(splitByTokens('text', -5)).toEqual([])
+    })
+
+    it('should return single chunk when text is smaller than chunk size', () => {
+      const shortText = 'Hi there'
+      const chunks = splitByTokens(shortText, 100)
+      expect(chunks).toEqual([shortText])
+    })
+  })
 })