diff --git a/src/json-schema/types.ts b/src/json-schema/types.ts index a19bef7d..19382c1d 100644 --- a/src/json-schema/types.ts +++ b/src/json-schema/types.ts @@ -16,6 +16,7 @@ export interface JsonSchemaString extends JsonSchemaGenericKeywords { type: 'string'; const?: string; format?: string; + pattern?: string; minLength?: number; maxLength?: number; } diff --git a/src/schema/schema.ts b/src/schema/schema.ts index 69f56d6b..f206373e 100644 --- a/src/schema/schema.ts +++ b/src/schema/schema.ts @@ -116,10 +116,21 @@ export interface NumberSchema extends TType, WithValidator { export interface StringSchema extends TType, WithValidator { kind: 'str'; + /** + * String format specification. When set, the string value will be validated + * according to the specified format for maximum performance. + * + * - "ascii" - Only ASCII characters (0-127) are allowed + * - "utf8" - Valid UTF-8 encoded strings are allowed + */ + format?: 'ascii' | 'utf8'; + /** * When set to true, means that the string can contain only ASCII characters. * This enables a range of optimizations, such as using a faster JSON * serialization, faster binary serialization. + * + * @deprecated Use `format: 'ascii'` instead. 
*/ ascii?: boolean; diff --git a/src/type/__tests__/getJsonSchema.spec.ts b/src/type/__tests__/getJsonSchema.spec.ts index 1fd7789e..f55d1f9a 100644 --- a/src/type/__tests__/getJsonSchema.spec.ts +++ b/src/type/__tests__/getJsonSchema.spec.ts @@ -41,103 +41,86 @@ test('can print a type', () => { .options({unknownFields: true}); // console.log(JSON.stringify(type.toJsonSchema(), null, 2)); expect(type.toJsonSchema()).toMatchInlineSnapshot(` - { - "properties": { - "arrayProperty": { - "items": { - "type": [ - "string", - "number", - "boolean", - "null", - "array", - "object", - ], - }, - "type": "array", - }, - "binaryOperation": { - "type": "binary", +{ + "properties": { + "arrayProperty": { + "items": { + "type": [ + "string", + "number", + "boolean", + "null", + "array", + "object", + ], + }, + "type": "array", + }, + "binaryOperation": { + "type": "binary", + }, + "binaryProperty": { + "type": "binary", + }, + "booleanProperty": { + "type": "boolean", + }, + "enumAsConst": { + "anyOf": [ + { + "const": "a", + "type": "string", }, - "binaryProperty": { - "type": "binary", + { + "const": "b", + "type": "string", }, - "booleanProperty": { - "type": "boolean", + { + "const": "c", + "type": "string", }, - "enumAsConst": { - "anyOf": [ - { - "const": "a", - "type": "string", - }, - { - "const": "b", - "type": "string", - }, - { - "const": "c", - "type": "string", - }, - ], + ], + }, + "id": { + "type": "string", + }, + "map": { + "patternProperties": { + ".*": { + "type": "string", }, + }, + "type": "object", + }, + "numberProperty": { + "exclusiveMinimum": 3.14, + "type": "number", + }, + "objectProperty": { + "properties": { "id": { + "maxLength": 128, + "minLength": 3, + "pattern": "^[\\x00-\\x7F]*$", "type": "string", }, - "map": { - "patternProperties": { - ".*": { - "type": "string", - }, - }, - "type": "object", - }, - "numberProperty": { - "exclusiveMinimum": 3.14, - "type": "number", - }, - "objectProperty": { - "properties": { - "id": { - "maxLength": 128, - 
"minLength": 3, - "type": "string", - }, - }, - "required": [ - "id", - ], - "type": "object", + }, + "required": [ + "id", + ], + "type": "object", + }, + "operation": { + "properties": { + "path": { + "type": "string", }, - "operation": { - "properties": { - "path": { - "type": "string", - }, - "type": { - "const": "replace", - "title": "Always use replace", - "type": "string", - }, - "value": { - "type": [ - "string", - "number", - "boolean", - "null", - "array", - "object", - ], - }, - }, - "required": [ - "type", - "path", - "value", - ], - "type": "object", + "type": { + "const": "replace", + "title": "Always use replace", + "type": "string", }, - "optional": { + "value": { "type": [ "string", "number", @@ -147,51 +130,69 @@ test('can print a type', () => { "object", ], }, - "refField": { - "$ref": "#/$defs/refId", - }, - "tags": { - "items": { - "type": "string", - }, - "title": "Tags", - "type": "array", - }, - "und": { - "const": undefined, - "type": "undefined", - }, - "unionProperty": { - "anyOf": [ - { - "type": "string", - }, - { - "type": "number", - }, - { - "const": null, - "type": "object", - }, - ], - }, }, "required": [ - "id", - "tags", - "booleanProperty", - "numberProperty", - "binaryProperty", - "arrayProperty", - "objectProperty", - "unionProperty", - "operation", - "binaryOperation", - "map", + "type", + "path", + "value", ], "type": "object", - } - `); + }, + "optional": { + "type": [ + "string", + "number", + "boolean", + "null", + "array", + "object", + ], + }, + "refField": { + "$ref": "#/$defs/refId", + }, + "tags": { + "items": { + "type": "string", + }, + "title": "Tags", + "type": "array", + }, + "und": { + "const": undefined, + "type": "undefined", + }, + "unionProperty": { + "anyOf": [ + { + "type": "string", + }, + { + "type": "number", + }, + { + "const": null, + "type": "object", + }, + ], + }, + }, + "required": [ + "id", + "tags", + "booleanProperty", + "numberProperty", + "binaryProperty", + "arrayProperty", + 
"objectProperty", + "unionProperty", + "operation", + "binaryOperation", + "map", + ], + "type": "object", +} +`); }); test('exports "ref" type to JSON Schema "$defs"', () => { diff --git a/src/type/classes/StringType.ts b/src/type/classes/StringType.ts index 96979d84..2931d036 100644 --- a/src/type/classes/StringType.ts +++ b/src/type/classes/StringType.ts @@ -21,6 +21,7 @@ import type {json_string} from '@jsonjoy.com/util/lib/json-brand'; import type * as ts from '../../typescript/types'; import type {TypeExportContext} from '../../system/TypeExportContext'; import type * as jtd from '../../jtd/types'; +import {isAscii, isUtf8} from '../../util/stringFormats'; export class StringType extends AbstractType { constructor(protected schema: schema.StringSchema) { @@ -35,6 +36,18 @@ export class StringType extends AbstractType { }; if (schema.min !== undefined) jsonSchema.minLength = schema.min; if (schema.max !== undefined) jsonSchema.maxLength = schema.max; + // Add format to JSON Schema if specified + if (schema.format) { + if (schema.format === 'ascii') { + // JSON Schema doesn't have an "ascii" format, but we can use a pattern + // ASCII characters are from 0x00 to 0x7F (0-127) + jsonSchema.pattern = '^[\\x00-\\x7F]*$'; + } + // UTF-8 is the default for JSON Schema strings, so we don't need to add anything special + } else if (schema.ascii) { + // Backward compatibility: if ascii=true, add pattern + jsonSchema.pattern = '^[\\x00-\\x7F]*$'; + } return jsonSchema; } @@ -42,7 +55,7 @@ export class StringType extends AbstractType { const schema = this.getSchema(); validateTType(schema, 'str'); validateWithValidator(schema); - const {min, max, ascii, noJsonEscape} = schema; + const {min, max, ascii, noJsonEscape, format} = schema; validateMinMax(min, max); if (ascii !== undefined) { if (typeof ascii !== 'boolean') throw new Error('ASCII'); @@ -50,12 +63,21 @@ export class StringType extends AbstractType { if (noJsonEscape !== undefined) { if (typeof noJsonEscape !== 
'boolean') throw new Error('NO_JSON_ESCAPE_TYPE'); } + if (format !== undefined) { + if (format !== 'ascii' && format !== 'utf8') { + throw new Error('INVALID_STRING_FORMAT'); + } + // If both format and ascii are specified, they should be consistent + if (ascii !== undefined && format === 'ascii' && !ascii) { + throw new Error('FORMAT_ASCII_MISMATCH'); + } + } } public codegenValidator(ctx: ValidatorCodegenContext, path: ValidationPath, r: string): void { const error = ctx.err(ValidationError.STR, path); ctx.js(/* js */ `if(typeof ${r} !== "string") return ${error};`); - const {min, max} = this.schema; + const {min, max, format, ascii} = this.schema; if (typeof min === 'number' && min === max) { const err = ctx.err(ValidationError.STR_LEN, path); ctx.js(/* js */ `if(${r}.length !== ${min}) return ${err};`); @@ -69,6 +91,22 @@ export class StringType extends AbstractType { ctx.js(/* js */ `if(${r}.length > ${max}) return ${err};`); } } + + if (format) { + const formatErr = ctx.err(ValidationError.STR, path); + if (format === 'ascii') { + const validateFn = ctx.codegen.linkDependency(isAscii); + ctx.js(/* js */ `if(!${validateFn}(${r})) return ${formatErr};`); + } else if (format === 'utf8') { + const validateFn = ctx.codegen.linkDependency(isUtf8); + ctx.js(/* js */ `if(!${validateFn}(${r})) return ${formatErr};`); + } + } else if (ascii) { + const asciiErr = ctx.err(ValidationError.STR, path); + const validateFn = ctx.codegen.linkDependency(isAscii); + ctx.js(/* js */ `if(!${validateFn}(${r})) return ${asciiErr};`); + } + ctx.emitCustomValidators(this, path, r); } @@ -81,9 +119,11 @@ export class StringType extends AbstractType { } private codegenBinaryEncoder(ctx: BinaryEncoderCodegenContext, value: JsExpression): void { - const ascii = this.schema.ascii; + const {ascii, format} = this.schema; const v = value.use(); - if (ascii) ctx.js(/* js */ `encoder.writeAsciiStr(${v});`); + // Use ASCII encoding if format is 'ascii' or ascii=true (backward compatibility) + 
const useAscii = format === 'ascii' || ascii; + if (useAscii) ctx.js(/* js */ `encoder.writeAsciiStr(${v});`); else ctx.js(/* js */ `encoder.writeStr(${v});`); } diff --git a/src/type/classes/__tests__/StringType.format.spec.ts b/src/type/classes/__tests__/StringType.format.spec.ts new file mode 100644 index 00000000..f678a043 --- /dev/null +++ b/src/type/classes/__tests__/StringType.format.spec.ts @@ -0,0 +1,122 @@ +import {t} from '../../..'; + +describe('StringType format validation', () => { + describe('ASCII format', () => { + const asciiType = t.String({format: 'ascii'}); + + test('accepts valid ASCII strings', () => { + const validator = asciiType.validator('boolean'); + expect(validator('hello world')).toBe(false); + expect(validator('123')).toBe(false); + expect(validator('!@#$%^&*()')).toBe(false); + expect(validator('')).toBe(false); + expect(validator('A')).toBe(false); + expect(validator(' ')).toBe(false); + }); + + test('rejects non-ASCII strings', () => { + const validator = asciiType.validator('boolean'); + expect(validator('héllo')).toBe(true); // é is not ASCII + expect(validator('🚀')).toBe(true); // Emoji + expect(validator('中文')).toBe(true); // Chinese characters + expect(validator('русский')).toBe(true); // Cyrillic + }); + + test('works with min/max length', () => { + const type = t.String({format: 'ascii', min: 2, max: 5}); + const validator = type.validator('boolean'); + + expect(validator('ab')).toBe(false); // Valid ASCII, correct length + expect(validator('abcde')).toBe(false); // Valid ASCII, correct length + expect(validator('a')).toBe(true); // Too short + expect(validator('abcdef')).toBe(true); // Too long + expect(validator('ñ')).toBe(true); // Non-ASCII (but would also be too short) + expect(validator('ñoño')).toBe(true); // Good length, but not ASCII + }); + }); + + describe('UTF-8 format', () => { + const utf8Type = t.String({format: 'utf8'}); + + test('accepts valid UTF-8 strings', () => { + const validator = 
utf8Type.validator('boolean'); + expect(validator('hello world')).toBe(false); + expect(validator('héllo')).toBe(false); + expect(validator('🚀')).toBe(false); + expect(validator('中文')).toBe(false); + expect(validator('русский')).toBe(false); + expect(validator('')).toBe(false); + }); + + test('rejects strings with unpaired surrogates', () => { + const validator = utf8Type.validator('boolean'); + // Create strings with unpaired surrogates + const highSurrogate = String.fromCharCode(0xd800); // High surrogate without low + const lowSurrogate = String.fromCharCode(0xdc00); // Low surrogate without high + + expect(validator(highSurrogate)).toBe(true); // Unpaired high surrogate + expect(validator(lowSurrogate)).toBe(true); // Orphaned low surrogate + expect(validator('hello' + highSurrogate)).toBe(true); // High surrogate at end + expect(validator(highSurrogate + lowSurrogate + highSurrogate)).toBe(true); // Unpaired at end + }); + + test('accepts valid surrogate pairs', () => { + const validator = utf8Type.validator('boolean'); + // Valid emoji with surrogate pairs + expect(validator('👍')).toBe(false); // Valid surrogate pair + expect(validator('💖')).toBe(false); // Valid surrogate pair + }); + }); + + describe('Backward compatibility with ascii boolean', () => { + test('ascii: true behaves like format: "ascii"', () => { + const asciiType = t.String({ascii: true}); + const validator = asciiType.validator('boolean'); + + expect(validator('hello')).toBe(false); // Valid ASCII + expect(validator('héllo')).toBe(true); // Non-ASCII + }); + + test('format takes precedence over ascii boolean', () => { + const type = t.String({format: 'utf8', ascii: true}); + const validator = type.validator('boolean'); + + // Should behave as UTF-8 validation, allowing non-ASCII + expect(validator('héllo')).toBe(false); // Should pass UTF-8 validation + }); + }); + + describe('Schema validation', () => { + test('validates format values', () => { + expect(() => t.String({format: 
'ascii'}).validateSchema()).not.toThrow(); + expect(() => t.String({format: 'utf8'}).validateSchema()).not.toThrow(); + expect(() => t.String({format: 'invalid' as any}).validateSchema()).toThrow('INVALID_STRING_FORMAT'); + }); + + test('validates format and ascii consistency', () => { + expect(() => t.String({format: 'ascii', ascii: false}).validateSchema()).toThrow('FORMAT_ASCII_MISMATCH'); + expect(() => t.String({format: 'ascii', ascii: true}).validateSchema()).not.toThrow(); + expect(() => t.String({format: 'utf8', ascii: true}).validateSchema()).not.toThrow(); // UTF-8 can have ascii=true + }); + }); + + describe('JSON Schema export', () => { + test('ASCII format adds pattern', () => { + const type = t.String({format: 'ascii'}); + const jsonSchema = type.toJsonSchema(); + expect(jsonSchema.pattern).toBe('^[\\x00-\\x7F]*$'); + }); + + test('UTF-8 format does not add pattern', () => { + const type = t.String({format: 'utf8'}); + const jsonSchema = type.toJsonSchema(); + expect(jsonSchema.pattern).toBeUndefined(); + }); + + test('backward compatibility with ascii boolean', () => { + const type = t.String({ascii: true}); + const jsonSchema = type.toJsonSchema(); + expect(jsonSchema.pattern).toBe('^[\\x00-\\x7F]*$'); + }); + }); +}); diff --git a/src/util/__tests__/stringFormats.spec.ts b/src/util/__tests__/stringFormats.spec.ts new file mode 100644 index 00000000..22dfecaf --- /dev/null +++ b/src/util/__tests__/stringFormats.spec.ts @@ -0,0 +1,103 @@ +import {isAscii, isUtf8, validateStringFormat} from '../stringFormats'; + +describe('String format validation utilities', () => { + describe('isAscii', () => { + test('returns true for ASCII strings', () => { + expect(isAscii('')).toBe(true); + expect(isAscii('hello')).toBe(true); + expect(isAscii('Hello World!')).toBe(true); + expect(isAscii('123456789')).toBe(true); + expect(isAscii('!@#$%^&*()')).toBe(true); + expect(isAscii(' \t\n\r')).toBe(true); + expect(isAscii(String.fromCharCode(0))).toBe(true); // NULL 
character + expect(isAscii(String.fromCharCode(127))).toBe(true); // DEL character + }); + + test('returns false for non-ASCII strings', () => { + expect(isAscii('héllo')).toBe(false); // é = U+00E9 = 233 + expect(isAscii('café')).toBe(false); // é = U+00E9 = 233 + expect(isAscii('naïve')).toBe(false); // ï = U+00EF = 239 + expect(isAscii('🚀')).toBe(false); // Emoji + expect(isAscii('中文')).toBe(false); // Chinese characters + expect(isAscii('русский')).toBe(false); // Cyrillic + expect(isAscii(String.fromCharCode(128))).toBe(false); // First non-ASCII + expect(isAscii(String.fromCharCode(255))).toBe(false); // Latin-1 Supplement + }); + + test('handles edge cases', () => { + expect(isAscii('hello' + String.fromCharCode(128))).toBe(false); + expect(isAscii(String.fromCharCode(127) + 'hello')).toBe(true); + }); + }); + + describe('isUtf8', () => { + test('returns true for valid UTF-8 strings', () => { + expect(isUtf8('')).toBe(true); + expect(isUtf8('hello')).toBe(true); + expect(isUtf8('héllo')).toBe(true); + expect(isUtf8('🚀')).toBe(true); + expect(isUtf8('中文')).toBe(true); + expect(isUtf8('русский')).toBe(true); + expect(isUtf8('👍💖🎉')).toBe(true); // Multiple emojis with surrogate pairs + }); + + test('returns false for unpaired high surrogates', () => { + const highSurrogate = String.fromCharCode(0xd800); + expect(isUtf8(highSurrogate)).toBe(false); + expect(isUtf8('hello' + highSurrogate)).toBe(false); + expect(isUtf8(highSurrogate + 'world')).toBe(false); + }); + + test('returns false for orphaned low surrogates', () => { + const lowSurrogate = String.fromCharCode(0xdc00); + expect(isUtf8(lowSurrogate)).toBe(false); + expect(isUtf8('hello' + lowSurrogate)).toBe(false); + expect(isUtf8(lowSurrogate + 'world')).toBe(false); + }); + + test('returns false for high surrogate not followed by low surrogate', () => { + const highSurrogate = String.fromCharCode(0xd800); + const notLowSurrogate = String.fromCharCode(0xe000); // Outside surrogate range + 
expect(isUtf8(highSurrogate + notLowSurrogate)).toBe(false); + expect(isUtf8(highSurrogate + 'a')).toBe(false); + }); + + test('returns true for valid surrogate pairs', () => { + // Create a valid surrogate pair manually + const highSurrogate = String.fromCharCode(0xd800); + const lowSurrogate = String.fromCharCode(0xdc00); + expect(isUtf8(highSurrogate + lowSurrogate)).toBe(true); + + // Test with real emoji + expect(isUtf8('👨‍💻')).toBe(true); // Complex emoji with ZWJ + expect(isUtf8('🏳️‍🌈')).toBe(true); // Rainbow flag emoji + }); + + test('handles sequences correctly', () => { + const highSurrogate = String.fromCharCode(0xd800); + const lowSurrogate = String.fromCharCode(0xdc00); + const validPair = highSurrogate + lowSurrogate; + + expect(isUtf8(validPair + validPair)).toBe(true); // Two valid pairs + expect(isUtf8(validPair + highSurrogate)).toBe(false); // Valid pair + unpaired high + expect(isUtf8('hello' + validPair + 'world')).toBe(true); // Valid pair in middle + }); + }); + + describe('validateStringFormat', () => { + test('delegates to isAscii for ascii format', () => { + expect(validateStringFormat('hello', 'ascii')).toBe(true); + expect(validateStringFormat('héllo', 'ascii')).toBe(false); + }); + + test('delegates to isUtf8 for utf8 format', () => { + expect(validateStringFormat('hello', 'utf8')).toBe(true); + expect(validateStringFormat('héllo', 'utf8')).toBe(true); + expect(validateStringFormat(String.fromCharCode(0xd800), 'utf8')).toBe(false); + }); + + test('returns true for invalid format (defensive)', () => { + expect(validateStringFormat('hello', 'invalid' as any)).toBe(true); + }); + }); +}); diff --git a/src/util/stringFormats.ts b/src/util/stringFormats.ts new file mode 100644 index 00000000..7a44b6b7 --- /dev/null +++ b/src/util/stringFormats.ts @@ -0,0 +1,66 @@ +/** + * High-performance string format validation utilities. + * These functions are optimized for maximum performance. 
+ */ + +/** + * Returns true when every UTF-16 code unit of the string is at most 127; the empty string returns true. + * Scans with charCodeAt and returns false at the first code unit above 127. + */ +export const isAscii = (str: string): boolean => { + const length = str.length; + for (let i = 0; i < length; i++) { + if (str.charCodeAt(i) > 127) { + return false; + } + } + return true; +}; + +/** + * Returns true when the string is well-formed UTF-16, i.e. it contains no lone surrogates. + * JavaScript strings are sequences of UTF-16 code units; a lone surrogate has no valid + * UTF-8 encoding, so a string containing one cannot be encoded to UTF-8 without loss. + * + * Returns false for: + * - A high surrogate (0xD800-0xDBFF) that is last or is not followed by a low surrogate + * - A low surrogate (0xDC00-0xDFFF) that is not preceded by a high surrogate + */ +export const isUtf8 = (str: string): boolean => { + const length = str.length; + for (let i = 0; i < length; i++) { + const code = str.charCodeAt(i); + + // Check for high surrogate + if (code >= 0xd800 && code <= 0xdbff) { + // High surrogate must be followed by low surrogate + if (i + 1 >= length) { + return false; // Unpaired high surrogate at end + } + const nextCode = str.charCodeAt(i + 1); + if (nextCode < 0xdc00 || nextCode > 0xdfff) { + return false; // High surrogate not followed by low surrogate + } + i++; // Skip the low surrogate + } else if (code >= 0xdc00 && code <= 0xdfff) { + // Low surrogate without preceding high surrogate + return false; + } + // All other characters (0x0000-0xD7FF and 0xE000-0xFFFF) are valid + } + return true; +}; + +/** + * Validates a string against the given format: ascii uses isAscii, utf8 uses isUtf8, any other value returns true. + */ +export const validateStringFormat = (str: string, format: 'ascii' | 'utf8'): boolean => { + switch (format) { + case 'ascii': + return isAscii(str); + case 'utf8': + return isUtf8(str); + default: + return true; + } +};