File tree Expand file tree Collapse file tree 2 files changed +10
-5
lines changed Expand file tree Collapse file tree 2 files changed +10
-5
lines changed Original file line number Diff line number Diff line change @@ -10,10 +10,10 @@ The following table shows the accuracy of the token count approximation for diff
1010| ----------- | ---------------------- | --------------------- | ------------------------- |
1111| Short English text | 10 | 11 | 10.00% |
1212| German text with umlauts | 56 | 49 | 12.50% |
13- | Metamorphosis by Franz Kafka (English) | 31891 | 33930 | 6.39% |
14- | Die Verwandlung by Franz Kafka (German) | 40620 | 34909 | 14.06% |
15- | 道德經 by Laozi (Chinese) | 14386 | 11920 | 17.14 % |
16- | TypeScript ES5 Type Declarations (~ 4000 loc) | 47890 | 50898 | 6.28 % |
13+ | Metamorphosis by Franz Kafka (English) | 31891 | 33928 | 6.39% |
14+ | Die Verwandlung by Franz Kafka (German) | 40620 | 34908 | 14.06% |
15+ | 道德經 by Laozi (Chinese) | 14386 | 11919 | 17.15% |
16+ | TypeScript ES5 Type Declarations (~ 4000 loc) | 47890 | 50464 | 5.37% |
1717
1818## Features
1919
Original file line number Diff line number Diff line change @@ -56,6 +56,7 @@ const CJK_RE = /[\u4E00-\u9FFF\u3400-\u4DBF\u3000-\u303F\uFF00-\uFFEF\u30A0-\u30
/**
 * Matches a token made up entirely of ASCII letters, ASCII digits and
 * Latin-1 accented letters (U+00C0–U+00FF, excluding the `×` U+00D7 and
 * `÷` U+00F7 signs).
 */
const ALPHANUMERIC_RE = /^[a-zA-Z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]+$/

/** Matches a token that contains at least one punctuation or symbol character. */
const PUNCTUATION_RE = /[.,!?;'"„“”‘’\-(){}[\]<>:/\\|@#$%^&*+=`~]/

/** Matches a token that contains at least one German umlaut or sharp s. */
const GERMAN_RE = /[äöüßÄÖÜẞ]/

/**
 * Matches a token that consists *only* of digits, periods and commas
 * (e.g. `2024`, `1,234.56`).
 *
 * The pattern is anchored on both ends: without `^…$` any token that merely
 * contains a digit (such as `abc123`) would be classified as a numeric
 * sequence and counted as a single token regardless of its length.
 *
 * Note: a token made up solely of `.` and/or `,` still matches.
 */
const NUMERIC_SEQUENCE_RE = /^[\d.,]+$/
5960
6061/**
6162 * Estimate the number of tokens in a string.
@@ -76,6 +77,10 @@ export function approximateTokenSize(input: string) {
7677 // For CJK languages, each character is usually a separate token
7778 tokenCount += Array . from ( token ) . length
7879 }
80+ else if ( NUMERIC_SEQUENCE_RE . test ( token ) ) {
81+ // Numeric sequences are often a single token, regardless of length
82+ tokenCount += 1
83+ }
7984 else if ( token . length <= 3 ) {
8085 // Short tokens are often a single token
8186 tokenCount += 1
@@ -85,7 +90,7 @@ export function approximateTokenSize(input: string) {
8590 tokenCount += Math . ceil ( token . length / 3 )
8691 }
8792 else if ( ALPHANUMERIC_RE . test ( token ) ) {
88- // Increase the average token length for alphanumeric strings
93+ // Use an average of 6 characters per token for alphanumeric tokens
8994 tokenCount += Math . ceil ( token . length / 6 )
9095 }
9196 else if ( PUNCTUATION_RE . test ( token ) ) {
You can’t perform that action at this time.
0 commit comments