From dd7616acb9a71b77f39d2fa24b6f68a7caef87f1 Mon Sep 17 00:00:00 2001 From: Harttle Date: Sat, 4 Nov 2023 21:33:29 +0800 Subject: [PATCH] fix: allow unicode to be identifiers, fixes #655 --- bin/character-gen.js | 16 ++++++++++++---- bin/perf-diff.sh | 4 ++-- .../navy/layout/partial/all-contributors.swig | 7 +++++++ package.json | 1 + src/parser/tokenizer.ts | 8 ++++---- src/util/character.ts | 9 ++++++++- src/util/operator-trie.ts | 4 ++-- test/e2e/issues.spec.ts | 5 +++++ 8 files changed, 41 insertions(+), 13 deletions(-) mode change 100644 => 100755 bin/character-gen.js diff --git a/bin/character-gen.js b/bin/character-gen.js old mode 100644 new mode 100755 index 1615791ccd..183db3e872 --- a/bin/character-gen.js +++ b/bin/character-gen.js @@ -4,18 +4,19 @@ const isQuote = c => c === '"' || c === "'" const isOperator = c => '!=<>'.includes(c) const isNumber = c => c >= '0' && c <= '9' const isCharacter = c => (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') -const isIdentifier = c => '_-?'.includes(c) || isCharacter(c) || isNumber(c) -const isBlank = c => c === '\n' || c === '\t' || c === ' ' || c === '\r' || c === '\v' || c === '\f' +const isWord = c => '_-?'.includes(c) || isCharacter(c) || isNumber(c) +const isBlank = c => '\n\t \r\v\f'.includes(c) const isInlineBlank = c => c === '\t' || c === ' ' || c === '\r' const isSign = c => c === '-' || c === '+' // See https://developer.mozilla.org/zh-CN/docs/Web/JavaScript/Reference/Global_Objects/RegExp const unicodeBlanks = '\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000' +const unicodePunctuations = '“”' const types = [] for (let i = 0; i < 128; i++) { const c = String.fromCharCode(i) let n = 0 - if (isIdentifier(c)) n |= 1 + if (isWord(c)) n |= 1 if (isOperator(c)) n |= 2 if (isBlank(c)) n |= 4 if (isQuote(c)) n |= 8 @@ -31,13 +32,20 @@ console.log(` // This file is generated by bin/character-gen.js // bitmask character types to boost performance export const TYPES = [${types.join(', ')}] -export const IDENTIFIER = 1 +export const WORD = 1 export const OPERATOR = 2 export const BLANK = 4 export const QUOTE = 8 export const INLINE_BLANK = 16 export const NUMBER = 32 export const SIGN = 64 +export const PUNCTUATION = 128 + +export function isWord (char: string): boolean { + const code = char.charCodeAt(0) + return code >= 128 ? !TYPES[code] : !!(TYPES[code] & WORD) +} `.trim()) console.log([...unicodeBlanks].map(char => `TYPES[${char.charCodeAt(0)}]`).join(' = ') + ' = BLANK') +console.log([...unicodePunctuations].map(char => `TYPES[${char.charCodeAt(0)}]`).join(' = ') + ' = PUNCTUATION') diff --git a/bin/perf-diff.sh b/bin/perf-diff.sh index f2ce1aaf7d..25a1a9363a 100755 --- a/bin/perf-diff.sh +++ b/bin/perf-diff.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -VERSION_LATEST=$(cat package.json | grep '"version":' | awk -F'"' '{print $4}') +VERSION_LATEST=$(cat package.json | grep '"version":' | head -1 | awk -F'"' '{print $4}') FILE_LOCAL=dist/liquid.node.cjs.js FILE_LATEST=dist/liquid.node.cjs.$VERSION_LATEST.js URL_LATEST=https://unpkg.com/liquidjs@$VERSION_LATEST/dist/liquid.node.cjs.js -if [ ! -f $FILE_LATEST ]; then +if [ ! -f "$FILE_LATEST" ]; then curl $URL_LATEST > $FILE_LATEST fi diff --git a/docs/themes/navy/layout/partial/all-contributors.swig b/docs/themes/navy/layout/partial/all-contributors.swig index 1524163497..4af70907e7 100644 --- a/docs/themes/navy/layout/partial/all-contributors.swig +++ b/docs/themes/navy/layout/partial/all-contributors.swig @@ -68,6 +68,13 @@ Mahyar Pasarzangene Tomáš Hübelbauer Jason Garber + Checkout Blocks + Adam Darrah + Eleventy + Nick Reilingh + Francisco Soto + David LJ + Rasmus Wriedt Larsen diff --git a/package.json b/package.json index df0e24ef9f..1eb3c5c763 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "build:cjs": "BUNDLES=cjs rollup -c rollup.config.mjs", "build:min": "BUNDLES=min rollup -c rollup.config.mjs", "build:umd": "BUNDLES=umd rollup -c rollup.config.mjs", + "build:charmap": "./bin/character-gen.js > src/util/character.ts", "build:docs": "bin/build-docs.sh" }, "bin": { diff --git a/src/parser/tokenizer.ts b/src/parser/tokenizer.ts index 75af342469..5adc8d87f9 100644 --- a/src/parser/tokenizer.ts +++ b/src/parser/tokenizer.ts @@ -1,6 +1,6 @@ import { FilteredValueToken, TagToken, HTMLToken, HashToken, QuotedToken, LiquidTagToken, OutputToken, ValueToken, Token, RangeToken, FilterToken, TopLevelToken, PropertyAccessToken, OperatorToken, LiteralToken, IdentifierToken, NumberToken } from '../tokens' import { OperatorHandler } from '../render/operator' -import { TrieNode, LiteralValue, Trie, createTrie, ellipsis, literalValues, TokenizationError, TYPES, QUOTE, BLANK, IDENTIFIER, NUMBER, SIGN } from '../util' +import { TrieNode, LiteralValue, Trie, createTrie, ellipsis, literalValues, TokenizationError, TYPES, QUOTE, BLANK, NUMBER, SIGN, isWord } from '../util' import { Operators, Expression } from '../render' import { NormalizedFullOptions, defaultOptions } from '../liquid-options' import { FilterArg } from './filter-arg' @@ -59,7 +59,7 @@ export class Tokenizer { if (node['end']) info = node } if (!info) return -1 - if (info['needBoundary'] && (this.peekType(i - this.p) & IDENTIFIER)) return -1 + if (info['needBoundary'] && isWord(this.peek(i - this.p))) return -1 return i } readFilteredValue (): FilteredValueToken { @@ -245,7 +245,7 @@ export class Tokenizer { readIdentifier (): IdentifierToken { this.skipBlank() const begin = this.p - while (!this.end() && this.peekType() & IDENTIFIER) ++this.p + while (!this.end() && isWord(this.peek())) ++this.p return new IdentifierToken(this.input, begin, this.p, this.file) } @@ -351,7 +351,7 @@ export class Tokenizer { n++ } else break } - if (digitFound && !(this.peekType(n) & IDENTIFIER)) { + if (digitFound && !isWord(this.peek(n))) { const num = new NumberToken(this.input, this.p, this.p + n, this.file) this.advance(n) return num diff --git a/src/util/character.ts b/src/util/character.ts index cbec80b360..472a0fc5f5 100644 --- a/src/util/character.ts +++ b/src/util/character.ts @@ -3,11 +3,18 @@ // This file is generated by bin/character-gen.js // bitmask character types to boost performance export const TYPES = [0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 4, 4, 4, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 2, 8, 0, 0, 0, 0, 8, 0, 0, 0, 64, 0, 65, 0, 0, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 0, 0, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] -export const IDENTIFIER = 1 +export const WORD = 1 export const OPERATOR = 2 export const BLANK = 4 export const QUOTE = 8 export const INLINE_BLANK = 16 export const NUMBER = 32 export const SIGN = 64 +export const PUNCTUATION = 128 + +export function isWord (char: string): boolean { + const code = char.charCodeAt(0) + return code >= 128 ? !TYPES[code] : !!(TYPES[code] & WORD) +} TYPES[160] = TYPES[5760] = TYPES[6158] = TYPES[8192] = TYPES[8193] = TYPES[8194] = TYPES[8195] = TYPES[8196] = TYPES[8197] = TYPES[8198] = TYPES[8199] = TYPES[8200] = TYPES[8201] = TYPES[8202] = TYPES[8232] = TYPES[8233] = TYPES[8239] = TYPES[8287] = TYPES[12288] = BLANK +TYPES[8220] = TYPES[8221] = PUNCTUATION diff --git a/src/util/operator-trie.ts b/src/util/operator-trie.ts index 7ca32bdd9e..e09c90ef02 100644 --- a/src/util/operator-trie.ts +++ b/src/util/operator-trie.ts @@ -1,4 +1,4 @@ -import { IDENTIFIER, TYPES } from '../util/character' +import { isWord } from '../util/character' interface TrieInput { [key: string]: T @@ -25,7 +25,7 @@ export function createTrie (input: TrieInput): Trie { const c = name[i] node[c] = node[c] || {} - if (i === name.length - 1 && (TYPES[name.charCodeAt(i)] & IDENTIFIER)) { + if (i === name.length - 1 && isWord(name[i])) { node[c].needBoundary = true } diff --git a/test/e2e/issues.spec.ts b/test/e2e/issues.spec.ts index eee46ea3fb..a195b49194 100644 --- a/test/e2e/issues.spec.ts +++ b/test/e2e/issues.spec.ts @@ -464,4 +464,9 @@ describe('Issues', function () { } expect(engine.parseAndRenderSync(tpl, ctx)).toEqual('FOO') }) + it('#655 Error in the tokenization process due to an invalid value expression', () => { + const engine = new Liquid() + const result = engine.parseAndRenderSync('{{ÜLKE}}', { ÜLKE: 'Türkiye' }) + expect(result).toEqual('Türkiye') + }) })