Skip to content

Commit

Permalink
feat: do not use surrogate pair if u-flag is specified
Browse files Browse the repository at this point in the history
  • Loading branch information
ikatyang committed Jul 9, 2023
1 parent d337df6 commit 978e5e5
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 16 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ const dResult = 'd'.test(regex) //=> false
```ts
declare abstract class Base {
/** Reports whether there is nothing to output. */
isEmpty(): boolean
/**
 * Returns the regular-expression source text; throws an `Error` when the
 * instance is empty. The optional `flags` are the RegExp flags the text is
 * intended to be used with (for example `'u'`).
 */
toString(): string
toString(flags?: string): string
/** Builds a `RegExp` from `toString(flags)`, constructed with the same `flags`. */
toRegExp(flags?: string): RegExp
}
```
Expand Down
8 changes: 4 additions & 4 deletions src/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@ export abstract class Base {
/** Reports whether this instance has nothing to output; delegates to the subclass hook `_isEmpty`. */
public isEmpty() {
return this._isEmpty()
}
/**
 * Serializes this instance to a regular-expression source string.
 *
 * @param flags - Optional RegExp flags; forwarded unchanged to the subclass
 *   hook `_toString` so the output can depend on them.
 * @returns The string produced by `_toString`.
 * @throws Error with the message `Output is empty.` when `isEmpty()` is true.
 */
public toString() {
public toString(flags?: string) {
// Refuse to serialize an empty instance rather than hand back a pattern.
if (this.isEmpty()) {
throw new Error(`Output is empty.`)
}
return this._toString()
return this._toString(flags)
}
/**
 * Compiles the serialized pattern into a `RegExp`.
 *
 * @param flags - Optional RegExp flags. The same value is passed to
 *   `toString` (to shape the pattern text) and to the `RegExp` constructor.
 * @returns A new `RegExp` built from `toString(flags)`.
 * @throws Error propagated from `toString` when the instance is empty.
 */
public toRegExp(flags?: string) {
return new RegExp(this.toString(), flags)
return new RegExp(this.toString(flags), flags)
}

protected abstract _isEmpty(): boolean
protected abstract _toString(): string
protected abstract _toString(flags?: string): string
}
6 changes: 5 additions & 1 deletion src/charset.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ test('toString: normal syntax for <= 0xffff', () => {
expect(charset(0xffff).toString()).toEqual(`[\\uffff]`)
})

test('toString: surrogate pair for > 0xffff', () => {
test('toString: surrogate pair for > 0xffff without u-flag', () => {
expect(charset(0x10000).toString()).toEqual(`\\ud800[\\udc00]`)
expect(charset([0, 0x10000]).toString()).toEqual(
`[\\u0000-\\uffff]|\\ud800[\\udc00]`,
Expand All @@ -138,6 +138,10 @@ test('toString: surrogate pair for > 0xffff', () => {
)
})

// With the u-flag the serializer uses braced `\u{...}` escapes, so the whole
// range 0..0x10ffff is expected as one character class with no surrogate-pair
// alternation.
test('toString: no surrogate pair for > 0xffff with u-flag', () => {
expect(charset([0, 0x10ffff]).toString('u')).toEqual(`[\\u{0}-\\u{10ffff}]`)
})

// Guards the non-u path: a code point above 0xffff must still compile into a
// RegExp when no flags are supplied.
test('toRegExp: codepoint > 0xffff should not throw error without u-flag', () => {
expect(() => charset(0x10000).toRegExp()).not.toThrowError()
})
28 changes: 18 additions & 10 deletions src/charset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ export class Charset extends Base {
return this.data.length === 0
}

/**
 * Serialization hook for `Base.toString`: renders `this.data` through
 * `rangesToString`, forwarding the optional RegExp flags.
 */
protected _toString() {
return rangesToString(this.data)
protected _toString(flags?: string) {
return rangesToString(this.data, flags)
}

private _unique() {
Expand Down Expand Up @@ -159,23 +159,29 @@ interface Surrogate {
partial: Array<{ h: number; l: Charset }>
}

/**
 * Serializes code-point ranges into regular-expression source text.
 *
 * In Unicode mode every range is written into a single character class using
 * braced `\u{...}` escapes. Otherwise the ranges are partitioned by
 * `splitRanges`; the `normal` part becomes one character class and the
 * `surrogate` part is expanded by `surrogateToPatterns`, with all resulting
 * alternatives joined by `|`.
 *
 * @param ranges - Ranges to serialize.
 * @param flags - RegExp flags the pattern will be compiled with. Defaults to
 *   no flags.
 * @returns The pattern source text.
 */
function rangesToString(ranges: CharsetDataUnit[], flags = '') {
  // Both `u` (unicode) and `v` (unicodeSets) put the engine in Unicode mode,
  // where `\u{...}` is valid and a lone-surrogate escape no longer matches half
  // of a pair - so the surrogate-pair alternation below must not be used.
  if (flags.includes('u') || flags.includes('v')) {
    return normalToPattern(ranges, true)
  }

  const { normal, surrogate } = splitRanges(ranges)

  const patterns: string[] = []

  // Skip the character class entirely when there is nothing to put in it, to
  // avoid emitting an empty `[]` alternative.
  if (normal.length !== 0) {
    patterns.push(normalToPattern(normal, false))
  }

  patterns.push(...surrogateToPatterns(surrogate))

  return patterns.join('|')
}

/**
 * Renders ranges as a single bracketed character class.
 *
 * Each `[start, end]` pair contributes one escape when `start === end`, or
 * `start-end` otherwise; the pieces are concatenated inside `[` and `]`.
 *
 * @param normal - Ranges to render.
 * @param hasUnicodeFlag - Passed to `unicode` for every endpoint to choose
 *   between the braced `\u{...}` form and the four-digit `\uhhhh` form.
 * @returns The character-class source text.
 */
function normalToPattern(normal: CharsetDataUnit[]) {
function normalToPattern(normal: CharsetDataUnit[], hasUnicodeFlag: boolean) {
const ranges = normal.map(([start, end]) =>
start === end ? unicode(start) : `${unicode(start)}-${unicode(end)}`,
start === end
? unicode(start, hasUnicodeFlag)
: `${unicode(start, hasUnicodeFlag)}-${unicode(end, hasUnicodeFlag)}`,
)
return `[${ranges.join('')}]`
}
Expand All @@ -186,13 +192,13 @@ function surrogateToPatterns(surrogate: Surrogate) {
if (surrogate.entire.data.length !== 0) {
const h = surrogate.entire.toString()
const l = `[${[SurrogateLimit.MinL, SurrogateLimit.MaxL]
.map(unicode)
.map(_ => unicode(_, false))
.join('-')}]`
patterns.push(`${h}${l}`)
}

for (const { h: rawH, l: lCharset } of surrogate.partial) {
const h = unicode(rawH)
const h = unicode(rawH, false)
const l = lCharset.toString()
patterns.push(`${h}${l}`)
}
Expand Down Expand Up @@ -287,7 +293,9 @@ function surrogatePair(codepoint: number) {
}
}

/**
 * Formats a code point as a regular-expression `\u` escape.
 *
 * @param char - Code point to escape.
 * @param hasUnicodeFlag - When true, emit the braced form `\u{h...}`, which
 *   accepts any number of hex digits. When false, emit the fixed four-digit
 *   form `\uhhhh`, left-padded with zeros.
 * @returns The escape sequence as pattern source text.
 * @throws RangeError when `hasUnicodeFlag` is false and `char` needs more
 *   than four hex digits; such values must be split into a surrogate pair by
 *   the caller.
 */
function unicode(char: number, hasUnicodeFlag: boolean) {
  const hex = char.toString(16)

  if (hasUnicodeFlag) {
    return `\\u{${hex}}`
  }

  // `'0'.repeat()` would throw an opaque "Invalid count value" RangeError for
  // a negative count; fail with a message that names the actual problem.
  if (hex.length > 4) {
    throw new RangeError(
      `Code point 0x${hex} does not fit a 4-digit \\u escape; use a surrogate pair or the u-flag.`,
    )
  }

  return `\\u${'0'.repeat(4 - hex.length)}${hex}`
}

0 comments on commit 978e5e5

Please sign in to comment.