feat: support accented characters

johannschopplich · johannschopplich · commit c4133af1f19d · 2023-11-27T11:49:11.000+01:00
diff --git a/README.md b/README.md
@@ -8,6 +8,7 @@ For advanced use cases, please use a full tokenizer like [`gpt-tokenizer`](https
 
 - 🌁 Estimate token count without a full tokenizer
 - 📐 Supports multiple model context sizes
+- 🗣️ Supports accented Latin characters, such as German umlauts and French diacritics
 - 🪽 Zero dependencies
 
 ## Installation
diff --git a/src/index.ts b/src/index.ts
@@ -45,7 +45,8 @@ export function getEmbeddingContextSize(modelName?: string): number {
 }
 
 const WHITESPACE_RE = /^\s+$/
-const ALPHANUMERIC_RE = /^[a-zA-Z0-9]+$/
+// Match ASCII alphanumerics plus accented Latin-1 letters (U+00C0–U+00FF, skipping the × U+00D7 and ÷ U+00F7 signs)
+const ALPHANUMERIC_RE = /^[a-zA-Z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]+$/
 const PUNCTUATION_RE = /[.,!?;'"„“”‘’\-(){}[\]<>:/\\|@#$%^&*+=`~]/
 
 export function approximateTokenSize(input: string) {
@@ -54,7 +55,6 @@ export function approximateTokenSize(input: string) {
     .split(/(\s+|[.,!?;'"„“”‘’\-(){}[\]<>:/\\|@#$%^&*+=`~]+)/)
     .filter(Boolean)
 
-  // Approximate the size of tokens by considering common English patterns
   let tokenCount = 0
   for (const token of roughTokens) {
     if (WHITESPACE_RE.test(token)) {
@@ -67,7 +67,7 @@ export function approximateTokenSize(input: string) {
     }
     else if (ALPHANUMERIC_RE.test(token)) {
       // Increase the average token length for alphanumeric strings
-      tokenCount += Math.ceil(token.length / 5)
+      tokenCount += Math.ceil(token.length / 4)
     }
     else if (PUNCTUATION_RE.test(token)) {
       // Punctuation is often a single token, but multiple punctuations are often split
diff --git a/test/index.test.ts b/test/index.test.ts
@@ -10,12 +10,12 @@ describe('token-related functions', () => {
   describe('approximateTokenSize', () => {
     it('should approximate the token size for English text', () => {
       const input = 'Hello, world! This is a test.'
-      expect(approximateTokenSize(input)).toMatchInlineSnapshot('9')
+      expect(approximateTokenSize(input)).toMatchInlineSnapshot('11')
     })
 
     it('should approximate the token size for German text with special characters', () => {
       const input = 'Guten Tag! Wie geht’s dir?'
-      expect(approximateTokenSize(input)).toMatchInlineSnapshot('9')
+      expect(approximateTokenSize(input)).toMatchInlineSnapshot('10')
     })
   })