Skip to content

Commit b9db65e

Browse files
feat: new splitByTokens function
1 parent 2476ea3 commit b9db65e

File tree

4 files changed

+155
-2
lines changed

4 files changed

+155
-2
lines changed

README.md

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ yarn add tokenx
4848
## Usage
4949

5050
```ts
51-
import { estimateTokenCount, isWithinTokenLimit } from 'tokenx'
51+
import { estimateTokenCount, isWithinTokenLimit, splitByTokens } from 'tokenx'
5252

5353
const text = 'Your text goes here.'
5454

@@ -61,6 +61,10 @@ const tokenLimit = 1024
6161
const withinLimit = isWithinTokenLimit(text, tokenLimit)
6262
console.log(`Is within token limit: ${withinLimit}`)
6363

64+
// Split text into token-based chunks
65+
const chunks = splitByTokens(text, 100)
66+
console.log(`Split into ${chunks.length} chunks`)
67+
6468
// Use custom options for different languages or models
6569
const customOptions = {
6670
defaultCharsPerToken: 4, // More conservative estimation
@@ -183,6 +187,54 @@ function sliceByTokens(
183187

184188
The sliced text portion corresponding to the specified token range.
185189

190+
### `splitByTokens`
191+
192+
Splits text into chunks based on token count. Useful for chunking documents for RAG, batch processing, or staying within context windows.
193+
194+
**Usage:**
195+
196+
```ts
197+
const text = 'Long text that needs to be split into smaller chunks...'
198+
199+
// Basic splitting
200+
const chunks = splitByTokens(text, 100)
201+
console.log(`Split into ${chunks.length} chunks`)
202+
203+
// With overlap for semantic continuity
204+
const overlappedChunks = splitByTokens(text, 100, { overlap: 10 })
205+
206+
// With custom options
207+
const customChunks = splitByTokens(text, 50, {
208+
defaultCharsPerToken: 4,
209+
overlap: 5
210+
})
211+
```
212+
213+
**Type Declaration:**
214+
215+
```ts
216+
interface SplitByTokensOptions extends TokenEstimationOptions {
217+
/** Number of tokens to overlap between consecutive chunks (default: 0) */
218+
overlap?: number
219+
}
220+
221+
function splitByTokens(
222+
text: string,
223+
tokensPerChunk: number,
224+
options?: SplitByTokensOptions
225+
): string[]
226+
```
227+
228+
**Parameters:**
229+
230+
- `text` - The input text to split
231+
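- `tokensPerChunk` - Target number of tokens per chunk. A chunk is closed as soon as its estimated token count reaches this value, so a chunk can slightly exceed it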
232+
- `options` - Token estimation options with optional overlap
233+
234+
**Returns:**
235+
236+
An array of text chunks, each containing approximately `tokensPerChunk` tokens. Returns an empty array if `text` is empty or `tokensPerChunk` is not a positive number.
237+
186238
## License
187239

188240
[MIT](./LICENSE) License © 2023-PRESENT [Johann Schopplich](https://github.com/johannschopplich)

src/index.ts

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import type { LanguageConfig, TokenEstimationOptions } from './types'
1+
import type { LanguageConfig, SplitByTokensOptions, TokenEstimationOptions } from './types'
22

33
export * from './types'
44

@@ -109,6 +109,66 @@ export function sliceByTokens(
109109
return parts.join('')
110110
}
111111

112+
/**
 * Splits text into consecutive chunks of roughly `tokensPerChunk` estimated tokens.
 *
 * The text is split with `TOKEN_SPLIT_PATTERN` and the resulting segments are
 * accumulated until the running estimate reaches `tokensPerChunk`. The segment
 * that reaches the limit stays in the chunk, so a chunk may exceed the limit.
 *
 * @param text - The input text to split.
 * @param tokensPerChunk - Estimated token count at which a chunk is closed.
 * @param options - Token estimation options plus an optional `overlap`: the
 * number of estimated tokens from the end of a chunk that are repeated at the
 * start of the next one. The overlap is clamped below `tokensPerChunk`;
 * non-positive values disable it.
 * @returns The chunks in order. Empty when `text` is empty or `tokensPerChunk`
 * is not positive. Every chunk contains at least one segment that was not part
 * of the previous chunk.
 */
export function splitByTokens(
  text: string,
  tokensPerChunk: number,
  options: SplitByTokensOptions = {},
): string[] {
  if (!text || tokensPerChunk <= 0)
    return []

  const {
    defaultCharsPerToken = DEFAULT_CHARS_PER_TOKEN,
    languageConfigs = DEFAULT_LANGUAGE_CONFIGS,
    overlap = 0,
  } = options

  // An overlap at or above the chunk size would carry (at least) the whole
  // previous chunk forward, so chunks would keep growing and every following
  // segment would close a new chunk. Keep the overlap below the chunk size.
  const maxOverlap = Math.min(overlap, tokensPerChunk - 1)

  const segments = text.split(TOKEN_SPLIT_PATTERN).filter(Boolean)
  const chunks: string[] = []
  let currentChunk: string[] = []
  // Token estimate of each entry in `currentChunk` (same indices), kept so the
  // overlap tail can be measured without estimating segments a second time.
  let currentTokens: number[] = []
  let currentTokenCount = 0
  // True once `currentChunk` holds a segment that was not carried over as overlap.
  let hasNewContent = false

  for (const segment of segments) {
    const tokenCount = estimateSegmentTokens(segment, languageConfigs, defaultCharsPerToken)

    currentChunk.push(segment)
    currentTokens.push(tokenCount)
    currentTokenCount += tokenCount
    hasNewContent = true

    if (currentTokenCount >= tokensPerChunk) {
      chunks.push(currentChunk.join(''))

      // Walk backwards from the end of the chunk until the tail holds at least
      // `maxOverlap` tokens; that tail seeds the next chunk.
      let tailStart = currentChunk.length
      let carriedTokens = 0

      if (maxOverlap > 0) {
        while (tailStart > 0 && carriedTokens < maxOverlap) {
          tailStart--
          carriedTokens += currentTokens[tailStart] ?? 0
        }
      }

      currentChunk = currentChunk.slice(tailStart)
      currentTokens = currentTokens.slice(tailStart)
      currentTokenCount = carriedTokens
      hasNewContent = false
    }
  }

  // Emit the remainder only if it adds text beyond the carried-over overlap;
  // otherwise the last chunk would merely repeat the end of the previous one.
  if (hasNewContent)
    chunks.push(currentChunk.join(''))

  return chunks
}
171+
112172
function estimateSegmentTokens(
113173
segment: string,
114174
languageConfigs: LanguageConfig[],

src/types.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,11 @@ export interface LanguageConfig {
1717
/** Average number of characters per token for this language */
1818
averageCharsPerToken: number
1919
}
20+
21+
/**
22+
* Options for `splitByTokens`: every token estimation option plus chunk overlap
23+
*/
24+
export interface SplitByTokensOptions extends TokenEstimationOptions {
25+
/** Number of estimated tokens from the end of a chunk that are repeated at the start of the next chunk (default: 0) */
26+
overlap?: number
27+
}

test/index.test.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
estimateTokenCount,
77
isWithinTokenLimit,
88
sliceByTokens,
9+
splitByTokens,
910
} from '../src/index'
1011

1112
const fixturesDir = fileURLToPath(new URL('fixtures', import.meta.url))
@@ -139,4 +140,36 @@ describe('token-related functions', () => {
139140
expect(sliceByTokens(GERMAN_TEXT, -1000)).toBe(GERMAN_TEXT)
140141
})
141142
})
143+
144+
// Behavioural tests for splitByTokens: chunking, overlap, empty input and short input.
describe('splitByTokens', () => {
145+
// Without overlap, the chunks must concatenate back to the original text.
it('should split text into chunks', () => {
146+
const chunks = splitByTokens(ENGLISH_TEXT, 5)
147+
expect(chunks.length).toBeGreaterThan(1)
148+
expect(chunks.join('')).toBe(ENGLISH_TEXT)
149+
})
150+
151+
it('should handle overlap between chunks', () => {
152+
const chunksNoOverlap = splitByTokens(ENGLISH_TEXT, 5)
153+
const chunksWithOverlap = splitByTokens(ENGLISH_TEXT, 5, { overlap: 2 })
154+
155+
// Overlap must not reduce the number of chunks
expect(chunksWithOverlap.length).toBeGreaterThanOrEqual(chunksNoOverlap.length)
156+
157+
// With overlap, the total character count across chunks should be at least as large as without overlap
158+
const totalWithOverlap = chunksWithOverlap.join('').length
159+
const totalNoOverlap = chunksNoOverlap.join('').length
160+
expect(totalWithOverlap).toBeGreaterThanOrEqual(totalNoOverlap)
161+
})
162+
163+
// Empty text and non-positive chunk sizes all yield an empty array.
it('should return empty array for empty input', () => {
164+
expect(splitByTokens('', 5)).toEqual([])
165+
expect(splitByTokens('text', 0)).toEqual([])
166+
expect(splitByTokens('text', -5)).toEqual([])
167+
})
168+
169+
it('should return single chunk when text is smaller than chunk size', () => {
170+
const shortText = 'Hi there'
171+
const chunks = splitByTokens(shortText, 100)
172+
expect(chunks).toEqual([shortText])
173+
})
174+
})
142175
})

0 commit comments

Comments
 (0)