Skip to content

Commit c4133af

Browse files
feat: support accented characters
1 parent c54b382 commit c4133af

File tree

3 files changed

+6
-5
lines changed

3 files changed

+6
-5
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ For advanced use cases, please use a full tokenizer like [`gpt-tokenizer`](https
88

99
- 🌁 Estimate token count without a full tokenizer
1010
- 📐 Supports multiple model context sizes
11+
- 🗣️ Supports accented characters, like German umlauts or French accents
1112
- 🪽 Zero dependencies
1213

1314
## Installation

src/index.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,8 @@ export function getEmbeddingContextSize(modelName?: string): number {
4545
}
4646

4747
const WHITESPACE_RE = /^\s+$/
48-
const ALPHANUMERIC_RE = /^[a-zA-Z0-9]+$/
48+
// Include alphanumeric characters and accented characters
49+
const ALPHANUMERIC_RE = /^[a-zA-Z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]+$/
4950
const PUNCTUATION_RE = /[.,!?;'"\-(){}[\]<>:/\\|@#$%^&*+=`~]/
5051

5152
export function approximateTokenSize(input: string) {
@@ -54,7 +55,6 @@ export function approximateTokenSize(input: string) {
5455
.split(/(\s+|[.,!?;'"\-(){}[\]<>:/\\|@#$%^&*+=`~]+)/)
5556
.filter(Boolean)
5657

57-
// Approximate the size of tokens by considering common English patterns
5858
let tokenCount = 0
5959
for (const token of roughTokens) {
6060
if (WHITESPACE_RE.test(token)) {
@@ -67,7 +67,7 @@ export function approximateTokenSize(input: string) {
6767
}
6868
else if (ALPHANUMERIC_RE.test(token)) {
6969
// Increase the average token length for alphanumeric strings
70-
tokenCount += Math.ceil(token.length / 5)
70+
tokenCount += Math.ceil(token.length / 4)
7171
}
7272
else if (PUNCTUATION_RE.test(token)) {
7373
// Punctuation is often a single token, but multiple punctuations are often split

test/index.test.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,12 +10,12 @@ describe('token-related functions', () => {
1010
describe('approximateTokenSize', () => {
1111
it('should approximate the token size for English text', () => {
1212
const input = 'Hello, world! This is a test.'
13-
expect(approximateTokenSize(input)).toMatchInlineSnapshot('9')
13+
expect(approximateTokenSize(input)).toMatchInlineSnapshot('11')
1414
})
1515

1616
it('should approximate the token size for German text with special characters', () => {
1717
const input = 'Guten Tag! Wie geht’s dir?'
18-
expect(approximateTokenSize(input)).toMatchInlineSnapshot('9')
18+
expect(approximateTokenSize(input)).toMatchInlineSnapshot('10')
1919
})
2020
})
2121

0 commit comments

Comments (0)