From 373b9d862401162e832ce77305e49b859e110f9c Mon Sep 17 00:00:00 2001 From: Josh Goebel Date: Thu, 3 Dec 2020 12:25:42 -0500 Subject: [PATCH] Merge pull request from GHSA-7wwv-vh3v-89cq * enh(tests) analyze regex for catastrophic backtracking * allow testing individual languages * fix(routeros) resolve potential exponential backtracking issue * fix(powershell) resolve potential exponential backtracking issue * fix(erlang-repl) resolve backtracking issue * fix(r) resolve backtracking issue * fix(jboss-cli) resolve backtracking issue * (lint) perl * fix(perl) resolve exponential backtracking issue * fix(gams) resolve exponential backtracking issue * (lint) c-like * fix(handlebars) resolve exponential backtracking issue * fix(cpp) resolve exponential backtracking issue * fix(sqf) fix poly backtracking issue - The `_` case should already be handled by `_+` * fix(xquery) fix poly backtracking issue * fix(ruleslanguage) fix poly backtracking issue The extra expression here does not matter since it was purely optional (`*`). The important thing is gobbling up the `#` to prevent it from stealing relevance. And `\s+` gets that done on its own. * fix(markdown) fix poly backtracking issue - Fix poly backtracking with code blocks * fix(dsconfig) fix poly backtracking issue Not 100% sure what the original intent of \S was since it seems that valueless properties wouldn't have a trailing `:` and why should we include the letter after as part of the match? That means that now there is an edge case that we handle slightly differently: property:"quoted value" The `:` will not be highlighted (as it was before). This is a problem for another day when someone who knows about dsconfig shows up. 
* fix(x86asm) fix poly backtracking issue * fix(yaml) fix poly backtracking issue * fix(livecodeserver) fix poly backtracking issue * fix(fortran) fix poly backtracking issue * fix(fortran/irpf90) fix poly backtracking issue * fix(ruby) fix poly backtracking issues - Fix poly backtracking issue with RVM_PROMPT. - Fix poly backtracking issue with heredocs. * fix(ebnf) fix poly backtracking issue * fix(basic) fix poly backtracking issue * fix(elixir) fix poly backtracking issue * fix(crystal) fix poly backtracking issue * fix(scilab) fix poly backtracking issue * fix(csharp) fix poly backtracking issue * fix(coffee/livescript) fix poly backtracking issue - Fix issue with optional params for anonymous functions * fix(moonscript) fix poly backtracking issue * fix(aspectj) fix poly backtracking issue * fix(d) fix poly backtracking issue * fix(gcode) fix poly backtracking issue * fix(kotlin) fix poly backtracking issue There is no explanation for what this `illegal` is trying to accomplish, so without that data, just remove it. 
* fix(kotlin) fix poly backtracking issue - Use same numeric mode rules as for Java * fix(asciidoc) fix poly backtracking issue * fix(javascript/typescript) fix poly backtracking issue - Fix poly backtracking issue in gnarly `()` counting regex * fix(latex) fix poly backtracking issue * fix(reasonml) fix poly backtracking issue - fix typo/bug with using `s` vs `\s` (string vs regex mistake) - simplify `[pattern]?[pattern]?` to just `[pattern]{0,2}` - fix ambiguous `\s*` poly issues * enh(ci): Add tests for polynomial regex issues --- CHANGES.md | 47 +++ package-lock.json | 9 + package.json | 1 + src/languages/asciidoc.js | 2 +- src/languages/aspectj.js | 2 +- src/languages/basic.js | 2 +- src/languages/c-like.js | 179 +++++++---- src/languages/coffeescript.js | 6 +- src/languages/crystal.js | 6 +- src/languages/csharp.js | 4 +- src/languages/d.js | 2 +- src/languages/dsconfig.js | 4 +- src/languages/ebnf.js | 2 +- src/languages/elixir.js | 2 +- src/languages/erlang-repl.js | 8 +- src/languages/fortran.js | 18 +- src/languages/gams.js | 11 +- src/languages/gcode.js | 12 +- src/languages/handlebars.js | 27 +- src/languages/irpf90.js | 29 +- src/languages/java.js | 38 +-- src/languages/javascript.js | 8 +- src/languages/jboss-cli.js | 2 +- src/languages/kotlin.js | 23 +- src/languages/latex.js | 2 +- src/languages/lib/java.js | 35 ++ src/languages/livecodeserver.js | 2 +- src/languages/livescript.js | 6 +- src/languages/markdown.js | 4 +- src/languages/moonscript.js | 6 +- src/languages/perl.js | 113 ++++--- src/languages/powershell.js | 2 +- src/languages/r.js | 2 +- src/languages/reasonml.js | 4 +- src/languages/routeros.js | 2 +- src/languages/ruby.js | 4 +- src/languages/ruleslanguage.js | 2 +- src/languages/scilab.js | 7 +- src/languages/sqf.js | 2 +- src/languages/x86asm.js | 2 +- src/languages/xquery.js | 2 +- src/languages/yaml.js | 2 +- src/lib/regex.js | 8 + test/index.js | 3 + test/markup/dsconfig/default.expect.txt | 10 +- 
test/markup/gcode/default.expect.txt | 4 +- test/regex/index.js | 407 ++++++++++++++++++++++++ test/regex/lib/analysis.js | 87 +++++ test/regex/lib/util.js | 107 +++++++ 49 files changed, 1034 insertions(+), 235 deletions(-) create mode 100644 src/languages/lib/java.js create mode 100644 test/regex/index.js create mode 100644 test/regex/lib/analysis.js create mode 100644 test/regex/lib/util.js diff --git a/CHANGES.md b/CHANGES.md index 43897a9f94..eab24c41a3 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -29,6 +29,53 @@ Grammar improvements: [tripleee]: https://github.com/tripleee +## Version 10.4.1 (tentative) + +Security + +- (fix) Exponential backtracking fixes for: [Josh Goebel][] + - cpp + - handlebars + - gams + - perl + - jboss-cli + - r + - erlang-repl + - powershell + - routeros +- (fix) Polynomial backtracking fixes for: [Josh Goebel][] + - asciidoc + - reasonml + - latex + - kotlin + - gcode + - d + - aspectj + - moonscript + - coffeescript/livescript + - csharp + - scilab + - crystal + - elixir + - basic + - ebnf + - ruby + - fortran/irpf90 + - livecodeserver + - yaml + - x86asm + - dsconfig + - markdown + - ruleslanguage + - xquery + - sqf + +Very grateful to [Michael Schmidt][] for all the help. + +[Michael Schmidt]: https://github.com/RunDevelopment +[Josh Goebel]: https://github.com/joshgoebel + + ## Version 10.4.0 A largish release with many improvements and fixes from quite a few different contributors. Enjoy! 
diff --git a/package-lock.json b/package-lock.json index ac10403957..f8e215cf11 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2792,6 +2792,15 @@ } } }, + "refa": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/refa/-/refa-0.4.1.tgz", + "integrity": "sha512-rdnT/f2p8aMPSTG7d7RCjKdymAnFXiByt2ULDjGdX7eFQaHgnwqfk0o2aSxUBxgkrh0DUk7wkQkujQPTgXqHpw==", + "dev": true, + "requires": { + "regexpp": "^3.1.0" + } + }, "regexpp": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-3.1.0.tgz", diff --git a/package.json b/package.json index ae9512e143..bcc275298e 100644 --- a/package.json +++ b/package.json @@ -64,6 +64,7 @@ "jsdom": "^16.4.0", "lodash": "^4.17.20", "mocha": "^8.2.1", + "refa": "^0.4.1", "rollup": "^2.33.1", "should": "^13.2.3", "terser": "^5.3.8", diff --git a/src/languages/asciidoc.js b/src/languages/asciidoc.js index e24b5194a0..5d4df1a450 100644 --- a/src/languages/asciidoc.js +++ b/src/languages/asciidoc.js @@ -172,7 +172,7 @@ export default function(hljs) { }, // images and links { - begin: '(link:)?(http|https|ftp|file|irc|image:?):\\S+\\[.*?\\]', + begin: '(link:)?(http|https|ftp|file|irc|image:?):\\S+?\\[[^[]*?\\]', returnBegin: true, contains: [ { diff --git a/src/languages/aspectj.js b/src/languages/aspectj.js index 56c0547f38..00d8f8f82c 100644 --- a/src/languages/aspectj.js +++ b/src/languages/aspectj.js @@ -115,7 +115,7 @@ export default function(hljs) { { // the function class is a bit different for AspectJ compared to the Java language className: 'function', - begin: /\w+ +\w+(\.)?\w+\s*\([^\)]*\)\s*((throws)[\w\s,]+)?[\{;]/, + begin: /\w+ +\w+(\.\w+)?\s*\([^\)]*\)\s*((throws)[\w\s,]+)?[\{;]/, returnBegin: true, end: /[{;=]/, keywords: KEYWORDS, diff --git a/src/languages/basic.js b/src/languages/basic.js index 1689236341..59606d7555 100644 --- a/src/languages/basic.js +++ b/src/languages/basic.js @@ -45,7 +45,7 @@ export default function(hljs) { { // Match typed numeric constants (1000, 
12.34!, 1.2e5, 1.5#, 1.2D2) className: 'number', - begin: '\\b([0-9]+[0-9edED\.]*[#\!]?)', + begin: '\\b\\d+(\\.\\d+)?([edED]\\d+)?[#\!]?', relevance: 0 }, { diff --git a/src/languages/c-like.js b/src/languages/c-like.js index 50bd0632ff..5363076eb1 100644 --- a/src/languages/c-like.js +++ b/src/languages/c-like.js @@ -12,64 +12,77 @@ change in v10 and don't have to change the requirements again later. See: https://github.com/highlightjs/highlight.js/issues/2146 */ +import * as regex from '../lib/regex.js'; + /** @type LanguageFn */ export default function(hljs) { - function optional(s) { - return '(?:' + s + ')?'; - } // added for historic reasons because `hljs.C_LINE_COMMENT_MODE` does // not include such support nor can we be sure all the grammars depending // on it would desire this behavior - var C_LINE_COMMENT_MODE = hljs.COMMENT('//', '$', { - contains: [{begin: /\\\n/}] + const C_LINE_COMMENT_MODE = hljs.COMMENT('//', '$', { + contains: [ + { + begin: /\\\n/ + } + ] }); - var DECLTYPE_AUTO_RE = 'decltype\\(auto\\)' - var NAMESPACE_RE = '[a-zA-Z_]\\w*::' - var TEMPLATE_ARGUMENT_RE = '<.*?>'; - var FUNCTION_TYPE_RE = '(' + + const DECLTYPE_AUTO_RE = 'decltype\\(auto\\)'; + const NAMESPACE_RE = '[a-zA-Z_]\\w*::'; + const TEMPLATE_ARGUMENT_RE = '<[^<>]+>'; + const FUNCTION_TYPE_RE = '(' + DECLTYPE_AUTO_RE + '|' + - optional(NAMESPACE_RE) +'[a-zA-Z_]\\w*' + optional(TEMPLATE_ARGUMENT_RE) + + regex.optional(NAMESPACE_RE) + + '[a-zA-Z_]\\w*' + regex.optional(TEMPLATE_ARGUMENT_RE) + ')'; - var CPP_PRIMITIVE_TYPES = { + const CPP_PRIMITIVE_TYPES = { className: 'keyword', begin: '\\b[a-z\\d_]*_t\\b' }; // https://en.cppreference.com/w/cpp/language/escape // \\ \x \xFF \u2837 \u00323747 \374 - var CHARACTER_ESCAPES = '\\\\(x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4,8}|[0-7]{3}|\\S)' - var STRINGS = { + const CHARACTER_ESCAPES = '\\\\(x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4,8}|[0-7]{3}|\\S)'; + const STRINGS = { className: 'string', variants: [ { - begin: '(u8?|U|L)?"', end: '"', + begin: 
'(u8?|U|L)?"', + end: '"', illegal: '\\n', - contains: [hljs.BACKSLASH_ESCAPE] + contains: [ hljs.BACKSLASH_ESCAPE ] }, { - begin: '(u8?|U|L)?\'(' + CHARACTER_ESCAPES + "|.)", end: '\'', + begin: '(u8?|U|L)?\'(' + CHARACTER_ESCAPES + "|.)", + end: '\'', illegal: '.' }, hljs.END_SAME_AS_BEGIN({ begin: /(?:u8?|U|L)?R"([^()\\ ]{0,16})\(/, - end: /\)([^()\\ ]{0,16})"/, + end: /\)([^()\\ ]{0,16})"/ }) ] }; - var NUMBERS = { + const NUMBERS = { className: 'number', variants: [ - { begin: '\\b(0b[01\']+)' }, - { begin: '(-?)\\b([\\d\']+(\\.[\\d\']*)?|\\.[\\d\']+)(u|U|l|L|ul|UL|f|F|b|B)' }, - { begin: '(-?)(\\b0[xX][a-fA-F0-9\']+|(\\b[\\d\']+(\\.[\\d\']*)?|\\.[\\d\']+)([eE][-+]?[\\d\']+)?)' } + { + begin: '\\b(0b[01\']+)' + }, + { + begin: '(-?)\\b([\\d\']+(\\.[\\d\']*)?|\\.[\\d\']+)(u|U|l|L|ul|UL|f|F|b|B)' + }, + { + begin: '(-?)(\\b0[xX][a-fA-F0-9\']+|(\\b[\\d\']+(\\.[\\d\']*)?|\\.[\\d\']+)([eE][-+]?[\\d\']+)?)' + } ], relevance: 0 }; - var PREPROCESSOR = { + const PREPROCESSOR = { className: 'meta', - begin: /#\s*[a-z]+\b/, end: /$/, + begin: /#\s*[a-z]+\b/, + end: /$/, keywords: { 'meta-keyword': 'if else elif endif define undef warning error line ' + @@ -77,28 +90,32 @@ export default function(hljs) { }, contains: [ { - begin: /\\\n/, relevance: 0 + begin: /\\\n/, + relevance: 0 }, - hljs.inherit(STRINGS, {className: 'meta-string'}), + hljs.inherit(STRINGS, { + className: 'meta-string' + }), { className: 'meta-string', - begin: /<.*?>/, end: /$/, - illegal: '\\n', + begin: /<.*?>/, + end: /$/, + illegal: '\\n' }, C_LINE_COMMENT_MODE, hljs.C_BLOCK_COMMENT_MODE ] }; - var TITLE_MODE = { + const TITLE_MODE = { className: 'title', - begin: optional(NAMESPACE_RE) + hljs.IDENT_RE, + begin: regex.optional(NAMESPACE_RE) + hljs.IDENT_RE, relevance: 0 }; - var FUNCTION_TITLE = optional(NAMESPACE_RE) + hljs.IDENT_RE + '\\s*\\('; + const FUNCTION_TITLE = regex.optional(NAMESPACE_RE) + hljs.IDENT_RE + '\\s*\\('; - var CPP_KEYWORDS = { + const CPP_KEYWORDS = { keyword: 'int float 
while private char char8_t char16_t char32_t catch import module export virtual operator sizeof ' + 'dynamic_cast|10 typedef const_cast|10 const for static_cast|10 union namespace ' + 'unsigned long volatile static protected bool template mutable if public friend ' + @@ -123,7 +140,7 @@ export default function(hljs) { literal: 'true false nullptr NULL' }; - var EXPRESSION_CONTAINS = [ + const EXPRESSION_CONTAINS = [ PREPROCESSOR, CPP_PRIMITIVE_TYPES, C_LINE_COMMENT_MODE, @@ -132,49 +149,61 @@ export default function(hljs) { STRINGS ]; - var EXPRESSION_CONTEXT = { + const EXPRESSION_CONTEXT = { // This mode covers expression context where we can't expect a function // definition and shouldn't highlight anything that looks like one: // `return some()`, `else if()`, `(x*sum(1, 2))` variants: [ - {begin: /=/, end: /;/}, - {begin: /\(/, end: /\)/}, - {beginKeywords: 'new throw return else', end: /;/} + { + begin: /=/, + end: /;/ + }, + { + begin: /\(/, + end: /\)/ + }, + { + beginKeywords: 'new throw return else', + end: /;/ + } ], keywords: CPP_KEYWORDS, contains: EXPRESSION_CONTAINS.concat([ { - begin: /\(/, end: /\)/, + begin: /\(/, + end: /\)/, keywords: CPP_KEYWORDS, - contains: EXPRESSION_CONTAINS.concat(['self']), + contains: EXPRESSION_CONTAINS.concat([ 'self' ]), relevance: 0 } ]), relevance: 0 }; - var FUNCTION_DECLARATION = { + const FUNCTION_DECLARATION = { className: 'function', begin: '(' + FUNCTION_TYPE_RE + '[\\*&\\s]+)+' + FUNCTION_TITLE, - returnBegin: true, end: /[{;=]/, + returnBegin: true, + end: /[{;=]/, excludeEnd: true, keywords: CPP_KEYWORDS, illegal: /[^\w\s\*&:<>]/, contains: [ - { // to prevent it from being confused as the function title begin: DECLTYPE_AUTO_RE, keywords: CPP_KEYWORDS, - relevance: 0, + relevance: 0 }, { - begin: FUNCTION_TITLE, returnBegin: true, - contains: [TITLE_MODE], + begin: FUNCTION_TITLE, + returnBegin: true, + contains: [ TITLE_MODE ], relevance: 0 }, { className: 'params', - begin: /\(/, end: /\)/, + begin: /\(/, 
+ end: /\)/, keywords: CPP_KEYWORDS, relevance: 0, contains: [ @@ -185,7 +214,8 @@ export default function(hljs) { CPP_PRIMITIVE_TYPES, // Count matching parentheses. { - begin: /\(/, end: /\)/, + begin: /\(/, + end: /\)/, keywords: CPP_KEYWORDS, relevance: 0, contains: [ @@ -207,7 +237,17 @@ export default function(hljs) { }; return { - aliases: ['c', 'cc', 'h', 'c++', 'h++', 'hpp', 'hh', 'hxx', 'cxx'], + aliases: [ + 'c', + 'cc', + 'h', + 'c++', + 'h++', + 'hpp', + 'hh', + 'hxx', + 'cxx' + ], keywords: CPP_KEYWORDS, // the base c-like language will NEVER be auto-detected, rather the // derivitives: c, c++, arduino turn auto-detect back on for themselves @@ -218,25 +258,32 @@ export default function(hljs) { FUNCTION_DECLARATION, EXPRESSION_CONTAINS, [ - PREPROCESSOR, - { // containers: ie, `vector rooms (9);` - begin: '\\b(deque|list|queue|priority_queue|pair|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array)\\s*<', end: '>', - keywords: CPP_KEYWORDS, - contains: ['self', CPP_PRIMITIVE_TYPES] - }, - { - begin: hljs.IDENT_RE + '::', - keywords: CPP_KEYWORDS - }, - { - className: 'class', - beginKeywords: 'enum class struct union', end: /[{;:<>=]/, - contains: [ - { beginKeywords: "final class struct" }, - hljs.TITLE_MODE - ] - } - ]), + PREPROCESSOR, + { // containers: ie, `vector rooms (9);` + begin: '\\b(deque|list|queue|priority_queue|pair|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array)\\s*<', + end: '>', + keywords: CPP_KEYWORDS, + contains: [ + 'self', + CPP_PRIMITIVE_TYPES + ] + }, + { + begin: hljs.IDENT_RE + '::', + keywords: CPP_KEYWORDS + }, + { + className: 'class', + beginKeywords: 'enum class struct union', + end: /[{;:<>=]/, + contains: [ + { + beginKeywords: "final class struct" + }, + hljs.TITLE_MODE + ] + } + ]), exports: { preprocessor: PREPROCESSOR, strings: STRINGS, diff --git a/src/languages/coffeescript.js 
b/src/languages/coffeescript.js index e5b4e67735..1fc1e1d513 100644 --- a/src/languages/coffeescript.js +++ b/src/languages/coffeescript.js @@ -140,7 +140,7 @@ export default function(hljs) { const TITLE = hljs.inherit(hljs.TITLE_MODE, { begin: JS_IDENT_RE }); - const PARAMS_RE = '(\\(.*\\))?\\s*\\B[-=]>'; + const POSSIBLE_PARAMS_RE = '(\\(.*\\)\\s*)?\\B[-=]>'; const PARAMS = { className: 'params', begin: '\\([^\\(]', @@ -169,7 +169,7 @@ export default function(hljs) { hljs.HASH_COMMENT_MODE, { className: 'function', - begin: '^\\s*' + JS_IDENT_RE + '\\s*=\\s*' + PARAMS_RE, + begin: '^\\s*' + JS_IDENT_RE + '\\s*=\\s*' + POSSIBLE_PARAMS_RE, end: '[-=]>', returnBegin: true, contains: [ @@ -183,7 +183,7 @@ export default function(hljs) { relevance: 0, contains: [{ className: 'function', - begin: PARAMS_RE, + begin: POSSIBLE_PARAMS_RE, end: '[-=]>', returnBegin: true, contains: [PARAMS] diff --git a/src/languages/crystal.js b/src/languages/crystal.js index 7f4c5b0834..1e5cbdabbf 100644 --- a/src/languages/crystal.js +++ b/src/languages/crystal.js @@ -6,8 +6,8 @@ Website: https://crystal-lang.org /** @type LanguageFn */ export default function(hljs) { - var INT_SUFFIX = '(_*[ui](8|16|32|64|128))?'; - var FLOAT_SUFFIX = '(_*f(32|64))?'; + var INT_SUFFIX = '(_?[ui](8|16|32|64|128))?'; + var FLOAT_SUFFIX = '(_?f(32|64))?'; var CRYSTAL_IDENT_RE = '[a-zA-Z_]\\w*[!?=]?'; var CRYSTAL_METHOD_RE = '[a-zA-Z_]\\w*[!?=]?|[-+~]@|<<|>>|[=!]~|===?|<=>|[<>]=?|\\*\\*|[-/+%^&*~|]|//|//=|&[-+*]=?|&\\*\\*|\\[\\][=?]?'; var CRYSTAL_PATH_RE = '[A-Za-z_]\\w*(::\\w+)*(\\?|!)?'; @@ -177,7 +177,7 @@ export default function(hljs) { { begin: '\\b0b([01_]+)' + INT_SUFFIX }, { begin: '\\b0o([0-7_]+)' + INT_SUFFIX }, { begin: '\\b0x([A-Fa-f0-9_]+)' + INT_SUFFIX }, - { begin: '\\b([1-9][0-9_]*[0-9]|[0-9])(\\.[0-9][0-9_]*)?([eE]_*[-+]?[0-9_]*)?' + FLOAT_SUFFIX + '(?!_)' }, + { begin: '\\b([1-9][0-9_]*[0-9]|[0-9])(\\.[0-9][0-9_]*)?([eE]_?[-+]?[0-9_]*)?' 
+ FLOAT_SUFFIX + '(?!_)' }, { begin: '\\b([1-9][0-9_]*|0)' + INT_SUFFIX } ], relevance: 0 diff --git a/src/languages/csharp.js b/src/languages/csharp.js index 7e002ee832..7c12f93445 100644 --- a/src/languages/csharp.js +++ b/src/languages/csharp.js @@ -325,7 +325,7 @@ export default function(hljs) { }, { className: 'function', - begin: '(' + TYPE_IDENT_RE + '\\s+)+' + hljs.IDENT_RE + '\\s*(<.+>)?\\s*\\(', returnBegin: true, + begin: '(' + TYPE_IDENT_RE + '\\s+)+' + hljs.IDENT_RE + '\\s*(<.+>\\s*)?\\(', returnBegin: true, end: /\s*[{;=]/, excludeEnd: true, keywords: KEYWORDS, contains: [ @@ -335,7 +335,7 @@ export default function(hljs) { relevance: 0 }, { - begin: hljs.IDENT_RE + '\\s*(<.+>)?\\s*\\(', returnBegin: true, + begin: hljs.IDENT_RE + '\\s*(<.+>\\s*)?\\(', returnBegin: true, contains: [ hljs.TITLE_MODE, GENERIC_MODIFIER diff --git a/src/languages/d.js b/src/languages/d.js index 239fb59a41..c691632efc 100644 --- a/src/languages/d.js +++ b/src/languages/d.js @@ -62,7 +62,7 @@ export default function(hljs) { const decimal_exponent_re = '([eE][+-]?' + decimal_integer_nosus_re + ')'; const decimal_float_re = '(' + decimal_integer_nosus_re + '(\\.\\d*|' + decimal_exponent_re + ')|' + - '\\d+\\.' + decimal_integer_nosus_re + decimal_integer_nosus_re + '|' + + '\\d+\\.' + decimal_integer_nosus_re + '|' + '\\.' + decimal_integer_re + decimal_exponent_re + '?' 
+ ')'; const hexadecimal_float_re = '(0[xX](' + diff --git a/src/languages/dsconfig.js b/src/languages/dsconfig.js index 666c31076c..eeff8d7edb 100644 --- a/src/languages/dsconfig.js +++ b/src/languages/dsconfig.js @@ -25,8 +25,8 @@ export default function(hljs) { }; const VALUELESS_PROPERTY = { className: 'string', - begin: /\w+-?\w+/, - end: /\W/, + begin: /\w+(\-\w+)*/, + end: /(?=\W)/, relevance: 0 }; diff --git a/src/languages/ebnf.js b/src/languages/ebnf.js index b5816e96b6..78550ac8b3 100644 --- a/src/languages/ebnf.js +++ b/src/languages/ebnf.js @@ -10,7 +10,7 @@ export default function(hljs) { const nonTerminalMode = { className: "attribute", - begin: /^[ ]*[a-zA-Z][a-zA-Z_-]*([\s_-]+[a-zA-Z][a-zA-Z]*)*/ + begin: /^[ ]*[a-zA-Z]+([\s_-]+[a-zA-Z]+)*/ }; const specialSequenceMode = { diff --git a/src/languages/elixir.js b/src/languages/elixir.js index 76a1c9a92c..a584b37a18 100644 --- a/src/languages/elixir.js +++ b/src/languages/elixir.js @@ -24,7 +24,7 @@ export default function(hljs) { }; const NUMBER = { className: 'number', - begin: '(\\b0o[0-7_]+)|(\\b0b[01_]+)|(\\b0x[0-9a-fA-F_]+)|(-?\\b[1-9][0-9_]*(.[0-9_]+([eE][-+]?[0-9]+)?)?)', + begin: '(\\b0o[0-7_]+)|(\\b0b[01_]+)|(\\b0x[0-9a-fA-F_]+)|(-?\\b[1-9][0-9_]*(\\.[0-9_]+([eE][-+]?[0-9]+)?)?)', relevance: 0 }; const SIGIL_DELIMITERS = '[/|([{<"\']'; diff --git a/src/languages/erlang-repl.js b/src/languages/erlang-repl.js index df3eecf83b..97f9642e14 100644 --- a/src/languages/erlang-repl.js +++ b/src/languages/erlang-repl.js @@ -5,6 +5,8 @@ Website: https://www.erlang.org Category: functional */ +import * as regex from '../lib/regex.js'; + /** @type LanguageFn */ export default function(hljs) { return { @@ -31,7 +33,11 @@ export default function(hljs) { hljs.APOS_STRING_MODE, hljs.QUOTE_STRING_MODE, { - begin: '\\?(::)?([A-Z]\\w*(::)?)+' + begin: regex.concat( + /\?(::)?/, + /([A-Z]\w*)/, // at least one identifier + /((::)[A-Z]\w*)*/ // perhaps more + ) }, { begin: '->' diff --git 
a/src/languages/fortran.js b/src/languages/fortran.js index 3a1c3b74bf..0bd5d0012e 100644 --- a/src/languages/fortran.js +++ b/src/languages/fortran.js @@ -5,6 +5,8 @@ Website: https://en.wikipedia.org/wiki/Fortran Category: scientific */ +import * as regex from '../lib/regex.js'; + /** @type LanguageFn */ export default function(hljs) { const PARAMS = { @@ -28,10 +30,22 @@ export default function(hljs) { ] }; + // regex in both fortran and irpf90 should match + const OPTIONAL_NUMBER_SUFFIX = /(_[a-z_\d]+)?/; + const OPTIONAL_NUMBER_EXP = /([de][+-]?\d+)?/; const NUMBER = { className: 'number', - // regex in both fortran and irpf90 should match - begin: '(?=\\b|\\+|-|\\.)(?:\\.|\\d+\\.?)\\d*([de][+-]?\\d+)?(_[a-z_\\d]+)?', + variants: [ + { + begin: regex.concat(/\b\d+/, /\.(\d*)/, OPTIONAL_NUMBER_EXP, OPTIONAL_NUMBER_SUFFIX) + }, + { + begin: regex.concat(/\b\d+/, OPTIONAL_NUMBER_EXP, OPTIONAL_NUMBER_SUFFIX) + }, + { + begin: regex.concat(/\.\d+/, OPTIONAL_NUMBER_EXP, OPTIONAL_NUMBER_SUFFIX) + } + ], relevance: 0 }; diff --git a/src/languages/gams.js b/src/languages/gams.js index 2fceeff5a7..fc926c4b8f 100644 --- a/src/languages/gams.js +++ b/src/languages/gams.js @@ -8,6 +8,9 @@ Category: scientific */ +import * as regex from '../lib/regex.js'; + +/** @type LanguageFn */ export default function(hljs) { const KEYWORDS = { keyword: @@ -80,6 +83,7 @@ export default function(hljs) { hljs.C_NUMBER_MODE ] }; + const COMMENT_WORD = /[a-z0-9&#*=?@\\><:,()$[\]_.{}!+%^-]+/; const DESCTEXT = { // Parameter/set/variable description text begin: /[a-z][a-z0-9_]*(\([a-z0-9_, ]*\))?[ \t]+/, excludeBegin: true, @@ -90,7 +94,12 @@ export default function(hljs) { ASSIGNMENT, { className: 'comment', - begin: /([ ]*[a-z0-9&#*=?@\\><:,()$[\]_.{}!+%^-]+)+/, + // one comment word, then possibly more + begin: regex.concat( + COMMENT_WORD, + // [ ] because \s would be too broad (matching newlines) + regex.anyNumberOfTimes(regex.concat(/[ ]+/, COMMENT_WORD)) + ), relevance: 0 } ] diff 
--git a/src/languages/gcode.js b/src/languages/gcode.js index cdac312c64..0e39a596ff 100644 --- a/src/languages/gcode.js +++ b/src/languages/gcode.js @@ -17,13 +17,14 @@ export default function(hljs) { className: 'meta', begin: '([O])([0-9]+)' }; + const NUMBER = hljs.inherit(hljs.C_NUMBER_MODE, { + begin: '([-+]?((\\.\\d+)|(\\d+)(\\.\\d*)?))|' + hljs.C_NUMBER_RE + }); const GCODE_CODE = [ hljs.C_LINE_COMMENT_MODE, hljs.C_BLOCK_COMMENT_MODE, hljs.COMMENT(/\(/, /\)/), - hljs.inherit(hljs.C_NUMBER_MODE, { - begin: '([-+]?([0-9]*\\.?[0-9]+\\.?))|' + hljs.C_NUMBER_RE - }), + NUMBER, hljs.inherit(hljs.APOS_STRING_MODE, { illegal: null }), @@ -50,7 +51,10 @@ export default function(hljs) { { className: 'built_in', begin: '(ATAN|ABS|ACOS|ASIN|SIN|COS|EXP|FIX|FUP|ROUND|LN|TAN)(\\[)', - end: '([-+]?([0-9]*\\.?[0-9]+\\.?))(\\])' + contains: [ + NUMBER + ], + end: '\\]' }, { className: 'symbol', diff --git a/src/languages/handlebars.js b/src/languages/handlebars.js index 092662504d..16218111ac 100644 --- a/src/languages/handlebars.js +++ b/src/languages/handlebars.js @@ -57,20 +57,25 @@ export default function(hljs) { // this regex matches literal segments like ' abc ' or [ abc ] as well as helpers and paths // like a/b, ./abc/cde, and abc.bcd - const DOUBLE_QUOTED_ID_REGEX = /".*?"/; - const SINGLE_QUOTED_ID_REGEX = /'.*?'/; - const BRACKET_QUOTED_ID_REGEX = /\[.*?\]/; + const DOUBLE_QUOTED_ID_REGEX = /""|"[^"]+"/; + const SINGLE_QUOTED_ID_REGEX = /''|'[^']+'/; + const BRACKET_QUOTED_ID_REGEX = /\[\]|\[[^\]]+\]/; const PLAIN_ID_REGEX = /[^\s!"#%&'()*+,.\/;<=>@\[\\\]^`{|}~]+/; - const PATH_DELIMITER_REGEX = /\.|\//; + const PATH_DELIMITER_REGEX = /(\.|\/)/; + const ANY_ID = regex.either( + DOUBLE_QUOTED_ID_REGEX, + SINGLE_QUOTED_ID_REGEX, + BRACKET_QUOTED_ID_REGEX, + PLAIN_ID_REGEX + ); const IDENTIFIER_REGEX = regex.concat( - '(', - SINGLE_QUOTED_ID_REGEX, '|', - DOUBLE_QUOTED_ID_REGEX, '|', - BRACKET_QUOTED_ID_REGEX, '|', - PLAIN_ID_REGEX, '|', - PATH_DELIMITER_REGEX, - 
')+' + regex.optional(/\.|\.\/|\//), // relative or absolute path + ANY_ID, + regex.anyNumberOfTimes(regex.concat( + PATH_DELIMITER_REGEX, + ANY_ID + )) ); // identifier followed by a equal-sign (without the equal sign) diff --git a/src/languages/irpf90.js b/src/languages/irpf90.js index 76ef4bcb1f..914c58dd39 100644 --- a/src/languages/irpf90.js +++ b/src/languages/irpf90.js @@ -6,6 +6,9 @@ Website: http://irpf90.ups-tlse.fr Category: scientific */ +import * as regex from '../lib/regex.js'; + +/** @type LanguageFn */ export default function(hljs) { const PARAMS = { className: 'params', @@ -13,6 +16,25 @@ export default function(hljs) { end: '\\)' }; + // regex in both fortran and irpf90 should match + const OPTIONAL_NUMBER_SUFFIX = /(_[a-z_\d]+)?/; + const OPTIONAL_NUMBER_EXP = /([de][+-]?\d+)?/; + const NUMBER = { + className: 'number', + variants: [ + { + begin: regex.concat(/\b\d+/, /\.(\d*)/, OPTIONAL_NUMBER_EXP, OPTIONAL_NUMBER_SUFFIX) + }, + { + begin: regex.concat(/\b\d+/, OPTIONAL_NUMBER_EXP, OPTIONAL_NUMBER_SUFFIX) + }, + { + begin: regex.concat(/\.\d+/, OPTIONAL_NUMBER_EXP, OPTIONAL_NUMBER_SUFFIX) + } + ], + relevance: 0 + }; + const F_KEYWORDS = { literal: '.False. 
.True.', keyword: 'kind do while private call intrinsic where elsewhere ' + @@ -88,12 +110,7 @@ export default function(hljs) { hljs.COMMENT('begin_doc', 'end_doc', { relevance: 10 }), - { - className: 'number', - // regex in both fortran and irpf90 should match - begin: '(?=\\b|\\+|-|\\.)(?:\\.|\\d+\\.?)\\d*([de][+-]?\\d+)?(_[a-z_\\d]+)?', - relevance: 0 - } + NUMBER ] }; } diff --git a/src/languages/java.js b/src/languages/java.js index 9b7c83fb7a..8cf86c43cc 100644 --- a/src/languages/java.js +++ b/src/languages/java.js @@ -5,6 +5,8 @@ Category: common, enterprise Website: https://www.java.com/ */ +import { NUMERIC } from "./lib/java.js"; + export default function(hljs) { var JAVA_IDENT_RE = '[\u00C0-\u02B8a-zA-Z_$][\u00C0-\u02B8a-zA-Z_$0-9]*'; var GENERIC_IDENT_RE = JAVA_IDENT_RE + '(<' + JAVA_IDENT_RE + '(\\s*,\\s*' + JAVA_IDENT_RE + ')*>)?'; @@ -25,41 +27,7 @@ export default function(hljs) { }, ] }; - - // https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10 - var decimalDigits = '[0-9](_*[0-9])*'; - var frac = `\\.(${decimalDigits})`; - var hexDigits = '[0-9a-fA-F](_*[0-9a-fA-F])*'; - var NUMBER = { - className: 'number', - variants: [ - // DecimalFloatingPointLiteral - // including ExponentPart - { begin: `(\\b(${decimalDigits})((${frac})|\\.)?|(${frac}))` + - `[eE][+-]?(${decimalDigits})[fFdD]?\\b` }, - // excluding ExponentPart - { begin: `\\b(${decimalDigits})((${frac})[fFdD]?\\b|\\.([fFdD]\\b)?)` }, - { begin: `(${frac})[fFdD]?\\b` }, - { begin: `\\b(${decimalDigits})[fFdD]\\b` }, - - // HexadecimalFloatingPointLiteral - { begin: `\\b0[xX]((${hexDigits})\\.?|(${hexDigits})?\\.(${hexDigits}))` + - `[pP][+-]?(${decimalDigits})[fFdD]?\\b` }, - - // DecimalIntegerLiteral - { begin: '\\b(0|[1-9](_*[0-9])*)[lL]?\\b' }, - - // HexIntegerLiteral - { begin: `\\b0[xX](${hexDigits})[lL]?\\b` }, - - // OctalIntegerLiteral - { begin: '\\b0(_*[0-7])*[lL]?\\b' }, - - // BinaryIntegerLiteral - { begin: '\\b0[bB][01](_*[01])*[lL]?\\b' }, - ], - 
relevance: 0 - }; + const NUMBER = NUMERIC; return { name: 'Java', diff --git a/src/languages/javascript.js b/src/languages/javascript.js index be4080ffe0..eda72868d9 100644 --- a/src/languages/javascript.js +++ b/src/languages/javascript.js @@ -282,8 +282,8 @@ export default function(hljs) { '[^()]*(\\(' + '[^()]*(\\(' + '[^()]*' + - '\\))*[^()]*' + - '\\))*[^()]*' + + '\\)[^()]*)*' + + '\\)[^()]*)*' + '\\)|' + hljs.UNDERSCORE_IDENT_RE + ')\\s*=>', returnBegin: true, end: '\\s*=>', @@ -373,8 +373,8 @@ export default function(hljs) { '[^()]*(\\(' + '[^()]*(\\(' + '[^()]*' + - '\\))*[^()]*' + - '\\))*[^()]*' + + '\\)[^()]*)*' + + '\\)[^()]*)*' + '\\)\\s*\\{', // end parens returnBegin:true, contains: [ diff --git a/src/languages/jboss-cli.js b/src/languages/jboss-cli.js index 488740cd3b..e2ca0d25bc 100644 --- a/src/languages/jboss-cli.js +++ b/src/languages/jboss-cli.js @@ -32,7 +32,7 @@ export default function(hljs) { }; const PATH = { className: 'string', - begin: /\B(([\/.])[\w\-.\/=]+)+/ + begin: /\B([\/.])[\w\-.\/=]+/ }; const COMMAND_PARAMS = { className: 'params', diff --git a/src/languages/kotlin.js b/src/languages/kotlin.js index 78eabf6eed..16b8ef785e 100644 --- a/src/languages/kotlin.js +++ b/src/languages/kotlin.js @@ -6,6 +6,8 @@ Category: common */ +import { NUMERIC } from "./lib/java.js"; + export default function(hljs) { const KEYWORDS = { keyword: @@ -104,25 +106,7 @@ export default function(hljs) { // https://kotlinlang.org/docs/reference/whatsnew11.html#underscores-in-numeric-literals // According to the doc above, the number mode of kotlin is the same as java 8, // so the code below is copied from java.js - const KOTLIN_NUMBER_RE = '\\b' + - '(' + - '0[bB]([01]+[01_]+[01]+|[01]+)' + // 0b... - '|' + - '0[xX]([a-fA-F0-9]+[a-fA-F0-9_]+[a-fA-F0-9]+|[a-fA-F0-9]+)' + // 0x... - '|' + - '(' + - '([\\d]+[\\d_]+[\\d]+|[\\d]+)(\\.([\\d]+[\\d_]+[\\d]+|[\\d]+))?' + - '|' + - '\\.([\\d]+[\\d_]+[\\d]+|[\\d]+)' + - ')' + - '([eE][-+]?\\d+)?' 
+ // octal, decimal, float - ')' + - '[lLfF]?'; - const KOTLIN_NUMBER_MODE = { - className: 'number', - begin: KOTLIN_NUMBER_RE, - relevance: 0 - }; + const KOTLIN_NUMBER_MODE = NUMERIC; const KOTLIN_NESTED_COMMENT = hljs.COMMENT( '/\\*', '\\*/', { @@ -177,7 +161,6 @@ export default function(hljs) { returnBegin: true, excludeEnd: true, keywords: KEYWORDS, - illegal: /fun\s+(<.*>)?[^\s\(]+(\s+[^\s\(]+)\s*=/, relevance: 5, contains: [ { diff --git a/src/languages/latex.js b/src/languages/latex.js index 0cadd1a09e..47b0636de8 100644 --- a/src/languages/latex.js +++ b/src/languages/latex.js @@ -172,7 +172,7 @@ export default function(hljs) { const BEGIN_ENV = function(envname, starts_mode) { return hljs.inherit( { - begin: '\\\\begin(?=\\s*\\r?\\n?\\s*\\{' + envname + '\\})', + begin: '\\\\begin(?=[ \t]*(\\r?\\n[ \t]*)?\\{' + envname + '\\})', keywords: {$pattern: /\\[a-zA-Z]+/, keyword: '\\begin'}, relevance: 0, }, diff --git a/src/languages/lib/java.js b/src/languages/lib/java.js new file mode 100644 index 0000000000..f855643461 --- /dev/null +++ b/src/languages/lib/java.js @@ -0,0 +1,35 @@ + +// https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10 +var decimalDigits = '[0-9](_*[0-9])*'; +var frac = `\\.(${decimalDigits})`; +var hexDigits = '[0-9a-fA-F](_*[0-9a-fA-F])*'; +export var NUMERIC = { + className: 'number', + variants: [ + // DecimalFloatingPointLiteral + // including ExponentPart + { begin: `(\\b(${decimalDigits})((${frac})|\\.)?|(${frac}))` + + `[eE][+-]?(${decimalDigits})[fFdD]?\\b` }, + // excluding ExponentPart + { begin: `\\b(${decimalDigits})((${frac})[fFdD]?\\b|\\.([fFdD]\\b)?)` }, + { begin: `(${frac})[fFdD]?\\b` }, + { begin: `\\b(${decimalDigits})[fFdD]\\b` }, + + // HexadecimalFloatingPointLiteral + { begin: `\\b0[xX]((${hexDigits})\\.?|(${hexDigits})?\\.(${hexDigits}))` + + `[pP][+-]?(${decimalDigits})[fFdD]?\\b` }, + + // DecimalIntegerLiteral + { begin: '\\b(0|[1-9](_*[0-9])*)[lL]?\\b' }, + + // HexIntegerLiteral + { begin: 
`\\b0[xX](${hexDigits})[lL]?\\b` }, + + // OctalIntegerLiteral + { begin: '\\b0(_*[0-7])*[lL]?\\b' }, + + // BinaryIntegerLiteral + { begin: '\\b0[bB][01](_*[01])*[lL]?\\b' }, + ], + relevance: 0 +}; diff --git a/src/languages/livecodeserver.js b/src/languages/livecodeserver.js index f273a6b73e..f897dad994 100644 --- a/src/languages/livecodeserver.js +++ b/src/languages/livecodeserver.js @@ -29,7 +29,7 @@ export default function(hljs) { const TITLE1 = hljs.inherit(hljs.TITLE_MODE, { variants: [ { - begin: '\\b_*rig[A-Z]+[A-Za-z0-9_\\-]*' + begin: '\\b_*rig[A-Z][A-Za-z0-9_\\-]*' }, { begin: '\\b_[a-z0-9\\-]+' diff --git a/src/languages/livescript.js b/src/languages/livescript.js index 4e86129237..0567172d8e 100644 --- a/src/languages/livescript.js +++ b/src/languages/livescript.js @@ -194,15 +194,15 @@ export default function(hljs) { returnBegin: true, variants: [ { - begin: '(' + JS_IDENT_RE + '\\s*(?:=|:=)\\s*)?(\\(.*\\))?\\s*\\B->\\*?', + begin: '(' + JS_IDENT_RE + '\\s*(?:=|:=)\\s*)?(\\(.*\\)\\s*)?\\B->\\*?', end: '->\\*?' }, { - begin: '(' + JS_IDENT_RE + '\\s*(?:=|:=)\\s*)?!?(\\(.*\\))?\\s*\\B[-~]{1,2}>\\*?', + begin: '(' + JS_IDENT_RE + '\\s*(?:=|:=)\\s*)?!?(\\(.*\\)\\s*)?\\B[-~]{1,2}>\\*?', end: '[-~]{1,2}>\\*?' }, { - begin: '(' + JS_IDENT_RE + '\\s*(?:=|:=)\\s*)?(\\(.*\\))?\\s*\\B!?[-~]{1,2}>\\*?', + begin: '(' + JS_IDENT_RE + '\\s*(?:=|:=)\\s*)?(\\(.*\\)\\s*)?\\B!?[-~]{1,2}>\\*?', end: '!?[-~]{1,2}>\\*?' 
} ] diff --git a/src/languages/markdown.js b/src/languages/markdown.js index daa3611b96..99e90aea86 100644 --- a/src/languages/markdown.js +++ b/src/languages/markdown.js @@ -24,10 +24,10 @@ export default function(hljs) { variants: [ // TODO: fix to allow these to work with sublanguage also { - begin: '(`{3,})(.|\\n)*?\\1`*[ ]*' + begin: '(`{3,})[^`](.|\\n)*?\\1`*[ ]*' }, { - begin: '(~{3,})(.|\\n)*?\\1~*[ ]*' + begin: '(~{3,})[^~](.|\\n)*?\\1~*[ ]*' }, // needed to allow markdown as a sublanguage to work { diff --git a/src/languages/moonscript.js b/src/languages/moonscript.js index edb6947209..1713e375fc 100644 --- a/src/languages/moonscript.js +++ b/src/languages/moonscript.js @@ -70,7 +70,7 @@ export default function(hljs) { const TITLE = hljs.inherit(hljs.TITLE_MODE, { begin: JS_IDENT_RE }); - const PARAMS_RE = '(\\(.*\\))?\\s*\\B[-=]>'; + const POSSIBLE_PARAMS_RE = '(\\(.*\\)\\s*)?\\B[-=]>'; const PARAMS = { className: 'params', begin: '\\([^\\(]', @@ -96,7 +96,7 @@ export default function(hljs) { hljs.COMMENT('--', '$'), { className: 'function', // function: -> => - begin: '^\\s*' + JS_IDENT_RE + '\\s*=\\s*' + PARAMS_RE, + begin: '^\\s*' + JS_IDENT_RE + '\\s*=\\s*' + POSSIBLE_PARAMS_RE, end: '[-=]>', returnBegin: true, contains: [ @@ -110,7 +110,7 @@ export default function(hljs) { contains: [ { className: 'function', - begin: PARAMS_RE, + begin: POSSIBLE_PARAMS_RE, end: '[-=]>', returnBegin: true, contains: [ PARAMS ] diff --git a/src/languages/perl.js b/src/languages/perl.js index 881e5fbcd1..c688199789 100644 --- a/src/languages/perl.js +++ b/src/languages/perl.js @@ -9,7 +9,9 @@ import * as regex from '../lib/regex.js'; /** @type LanguageFn */ export default function(hljs) { - var PERL_KEYWORDS = { + // https://perldoc.perl.org/perlre#Modifiers + const REGEX_MODIFIERS = /[dualxmsipn]{0,12}/; // aa and xx are valid, making max length 12 + const PERL_KEYWORDS = { $pattern: /[\w.]+/, keyword: 'getpwent getservent quotemeta msgrcv scalar kill dbmclose undef 
lc ' + 'ma syswrite tr send umask sysopen shmwrite vec qx utime local oct semctl localtime ' + @@ -31,29 +33,42 @@ export default function(hljs) { 'ioctl socket readlink eval xor readline binmode setservent eof ord bind alarm pipe ' + 'atan2 getgrent exp time push setgrent gt lt or ne m|0 break given say state when' }; - var SUBST = { + const SUBST = { className: 'subst', - begin: '[$@]\\{', end: '\\}', + begin: '[$@]\\{', + end: '\\}', keywords: PERL_KEYWORDS }; - var METHOD = { - begin: /->\{/, end: /\}/ + const METHOD = { + begin: /->\{/, + end: /\}/ // contains defined later }; - var VAR = { + const VAR = { variants: [ - {begin: /\$\d/}, - {begin: regex.concat( - /[$%@](\^\w\b|#\w+(::\w+)*|\{\w+\}|\w+(::\w*)*)/, - // negative look-ahead tries to avoid matching patterns that are not - // Perl at all like $ident$, @ident@, etc. - `(?![A-Za-z])(?![@$%])` - )}, - {begin: /[$%@][^\s\w{]/, relevance: 0} + { + begin: /\$\d/ + }, + { + begin: regex.concat( + /[$%@](\^\w\b|#\w+(::\w+)*|\{\w+\}|\w+(::\w*)*)/, + // negative look-ahead tries to avoid matching patterns that are not + // Perl at all like $ident$, @ident@, etc. 
+ `(?![A-Za-z])(?![@$%])` + ) + }, + { + begin: /[$%@][^\s\w{]/, + relevance: 0 + } ] }; - var STRING_CONTAINS = [hljs.BACKSLASH_ESCAPE, SUBST, VAR]; - var PERL_DEFAULT_CONTAINS = [ + const STRING_CONTAINS = [ + hljs.BACKSLASH_ESCAPE, + SUBST, + VAR + ]; + const PERL_DEFAULT_CONTAINS = [ VAR, hljs.HASH_COMMENT_MODE, hljs.COMMENT( @@ -69,39 +84,48 @@ export default function(hljs) { contains: STRING_CONTAINS, variants: [ { - begin: 'q[qwxr]?\\s*\\(', end: '\\)', + begin: 'q[qwxr]?\\s*\\(', + end: '\\)', relevance: 5 }, { - begin: 'q[qwxr]?\\s*\\[', end: '\\]', + begin: 'q[qwxr]?\\s*\\[', + end: '\\]', relevance: 5 }, { - begin: 'q[qwxr]?\\s*\\{', end: '\\}', + begin: 'q[qwxr]?\\s*\\{', + end: '\\}', relevance: 5 }, { - begin: 'q[qwxr]?\\s*\\|', end: '\\|', + begin: 'q[qwxr]?\\s*\\|', + end: '\\|', relevance: 5 }, { - begin: 'q[qwxr]?\\s*<', end: '>', + begin: 'q[qwxr]?\\s*<', + end: '>', relevance: 5 }, { - begin: 'qw\\s+q', end: 'q', + begin: 'qw\\s+q', + end: 'q', relevance: 5 }, { - begin: '\'', end: '\'', - contains: [hljs.BACKSLASH_ESCAPE] + begin: '\'', + end: '\'', + contains: [ hljs.BACKSLASH_ESCAPE ] }, { - begin: '"', end: '"' + begin: '"', + end: '"' }, { - begin: '`', end: '`', - contains: [hljs.BACKSLASH_ESCAPE] + begin: '`', + end: '`', + contains: [ hljs.BACKSLASH_ESCAPE ] }, { begin: /\{\w+\}/, @@ -128,22 +152,36 @@ export default function(hljs) { hljs.HASH_COMMENT_MODE, { className: 'regexp', - begin: '(s|tr|y)/(\\\\.|[^/])*/(\\\\.|[^/])*/[a-z]*', + begin: regex.concat( + /(s|tr|y)/, + /\//, + /(\\.|[^\\\/])*/, + /\//, + /(\\.|[^\\\/])*/, + /\//, + REGEX_MODIFIERS, + ), relevance: 10 }, { className: 'regexp', - begin: '(m|qr)?/', end: '/[a-z]*', - contains: [hljs.BACKSLASH_ESCAPE], + begin: /(m|qr)?\//, + end: regex.concat( + /\//, + REGEX_MODIFIERS + ), + contains: [ hljs.BACKSLASH_ESCAPE ], relevance: 0 // allows empty "//" which is a common comment delimiter in other languages } ] }, { className: 'function', - beginKeywords: 'sub', end: 
'(\\s*\\(.*?\\))?[;{]', excludeEnd: true, + beginKeywords: 'sub', + end: '(\\s*\\(.*?\\))?[;{]', + excludeEnd: true, relevance: 5, - contains: [hljs.TITLE_MODE] + contains: [ hljs.TITLE_MODE ] }, { begin: '-\\w\\b', @@ -155,9 +193,9 @@ export default function(hljs) { subLanguage: 'mojolicious', contains: [ { - begin: "^@@.*", - end: "$", - className: "comment" + begin: "^@@.*", + end: "$", + className: "comment" } ] } @@ -167,7 +205,10 @@ export default function(hljs) { return { name: 'Perl', - aliases: ['pl', 'pm'], + aliases: [ + 'pl', + 'pm' + ], keywords: PERL_KEYWORDS, contains: PERL_DEFAULT_CONTAINS }; diff --git a/src/languages/powershell.js b/src/languages/powershell.js index 3d10a7404f..dd7b4a64d4 100644 --- a/src/languages/powershell.js +++ b/src/languages/powershell.js @@ -31,7 +31,7 @@ export default function(hljs) { 'Search|Select|Set|Show|Skip|Split|Step|Switch|Undo|Unlock|' + 'Watch|Backup|Checkpoint|Compare|Compress|Convert|ConvertFrom|' + 'ConvertTo|Dismount|Edit|Expand|Export|Group|Import|Initialize|' + - 'Limit|Merge|New|Out|Publish|Restore|Save|Sync|Unpublish|Update|' + + 'Limit|Merge|Out|Publish|Restore|Save|Sync|Unpublish|Update|' + 'Approve|Assert|Complete|Confirm|Deny|Disable|Enable|Install|Invoke|Register|' + 'Request|Restart|Resume|Start|Stop|Submit|Suspend|Uninstall|' + 'Unregister|Wait|Debug|Measure|Ping|Repair|Resolve|Test|Trace|Connect|' + diff --git a/src/languages/r.js b/src/languages/r.js index 1c22f62ba4..2b7f8e07da 100644 --- a/src/languages/r.js +++ b/src/languages/r.js @@ -97,7 +97,7 @@ export default function(hljs) { className: 'variable', variants: [ { begin: IDENT_RE }, - { begin: /`(?:\\.|[^`])+`/ } + { begin: /`(?:\\.|[^`\\])+`/ } ], endsParent: true } diff --git a/src/languages/reasonml.js b/src/languages/reasonml.js index b67c568712..648d621cb0 100644 --- a/src/languages/reasonml.js +++ b/src/languages/reasonml.js @@ -23,8 +23,8 @@ export default function(hljs) { const RE_MODULE_IDENT = '`?[A-Z$_][0-9a-zA-Z$_]*'; const 
RE_PARAM_TYPEPARAM = '\'?[a-z$_][0-9a-z$_]*'; - const RE_PARAM_TYPE = '\s*:\s*[a-z$_][0-9a-z$_]*(\(\s*(' + RE_PARAM_TYPEPARAM + '\s*(,' + RE_PARAM_TYPEPARAM + ')*)?\s*\))?'; - const RE_PARAM = RE_IDENT + '(' + RE_PARAM_TYPE + ')?(' + RE_PARAM_TYPE + ')?'; + const RE_PARAM_TYPE = '\\s*:\\s*[a-z$_][0-9a-z$_]*(\\(\\s*(' + RE_PARAM_TYPEPARAM + '\\s*(,' + RE_PARAM_TYPEPARAM + '\\s*)*)?\\))?'; + const RE_PARAM = RE_IDENT + '(' + RE_PARAM_TYPE + '){0,2}'; const RE_OPERATOR = "(" + orReValues([ '||', '++', diff --git a/src/languages/routeros.js b/src/languages/routeros.js index bbe5c3ac90..372d96cd8d 100644 --- a/src/languages/routeros.js +++ b/src/languages/routeros.js @@ -22,7 +22,7 @@ export default function(hljs) { const LITERALS = 'true false yes no nothing nil null'; - const OBJECTS = 'traffic-flow traffic-generator firewall scheduler aaa accounting address-list address align area bandwidth-server bfd bgp bridge client clock community config connection console customer default dhcp-client dhcp-server discovery dns e-mail ethernet filter firewall firmware gps graphing group hardware health hotspot identity igmp-proxy incoming instance interface ip ipsec ipv6 irq l2tp-server lcd ldp logging mac-server mac-winbox mangle manual mirror mme mpls nat nd neighbor network note ntp ospf ospf-v3 ovpn-server page peer pim ping policy pool port ppp pppoe-client pptp-server prefix profile proposal proxy queue radius resource rip ripng route routing screen script security-profiles server service service-port settings shares smb sms sniffer snmp snooper socks sstp-server system tool tracking type upgrade upnp user-manager users user vlan secret vrrp watchdog web-access wireless pptp pppoe lan wan layer7-protocol lease simple raw'; + const OBJECTS = 'traffic-flow traffic-generator firewall scheduler aaa accounting address-list address align area bandwidth-server bfd bgp bridge client clock community config connection console customer default dhcp-client dhcp-server discovery dns 
e-mail ethernet filter firmware gps graphing group hardware health hotspot identity igmp-proxy incoming instance interface ip ipsec ipv6 irq l2tp-server lcd ldp logging mac-server mac-winbox mangle manual mirror mme mpls nat nd neighbor network note ntp ospf ospf-v3 ovpn-server page peer pim ping policy pool port ppp pppoe-client pptp-server prefix profile proposal proxy queue radius resource rip ripng route routing screen script security-profiles server service service-port settings shares smb sms sniffer snmp snooper socks sstp-server system tool tracking type upgrade upnp user-manager users user vlan secret vrrp watchdog web-access wireless pptp pppoe lan wan layer7-protocol lease simple raw'; // print parameters // Several parameters are available for print command: diff --git a/src/languages/ruby.js b/src/languages/ruby.js index 47763bf9e3..cf6dc8c88d 100644 --- a/src/languages/ruby.js +++ b/src/languages/ruby.js @@ -72,7 +72,7 @@ export default function(hljs) { begin: /\B\?(\\\d{1,3}|\\x[A-Fa-f0-9]{1,2}|\\u[A-Fa-f0-9]{4}|\\?\S)\b/ }, { // heredocs - begin: /<<[-~]?'?(\w+)(?:.|\n)*?\n\s*\1\b/, + begin: /<<[-~]?'?(\w+)\n(?:[^\n]*\n)*?\s*\1\b/, returnBegin: true, contains: [ { begin: /<<[-~]?'?/ }, @@ -201,7 +201,7 @@ export default function(hljs) { var SIMPLE_PROMPT = "[>?]>"; // irb(main):001:0> var DEFAULT_PROMPT = "[\\w#]+\\(\\w+\\):\\d+:\\d+>"; - var RVM_PROMPT = "(\\w+-)?\\d+\\.\\d+\\.\\d(p\\d+)?[^>]+>"; + var RVM_PROMPT = "(\\w+-)?\\d+\\.\\d+\\.\\d+(p\\d+)?[^\\d][^>]+>"; var IRB_DEFAULT = [ { diff --git a/src/languages/ruleslanguage.js b/src/languages/ruleslanguage.js index 664d8bdef5..7a7007c3c0 100644 --- a/src/languages/ruleslanguage.js +++ b/src/languages/ruleslanguage.js @@ -63,7 +63,7 @@ export default function(hljs) { className: 'literal', variants: [ { // looks like #-comment - begin: '#\\s+[a-zA-Z .]*', + begin: '#\\s+', relevance: 0 }, { diff --git a/src/languages/scilab.js b/src/languages/scilab.js index 3703b424d2..b45ce3929d 100644 --- 
a/src/languages/scilab.js +++ b/src/languages/scilab.js @@ -53,14 +53,15 @@ export default function(hljs) { } ] }, + // seems to be a guard against [ident]' or [ident]. + // perhaps to prevent attributes from flagging as keywords? { - begin: '[a-zA-Z_][a-zA-Z_0-9]*(\'+[\\.\']*|[\\.\']+)', - end: '', + begin: '[a-zA-Z_][a-zA-Z_0-9]*[\\.\']+', relevance: 0 }, { begin: '\\[', - end: '\\]\'*[\\.\']*', + end: '\\][\\.\']*', relevance: 0, contains: COMMON_CONTAINS }, diff --git a/src/languages/sqf.js b/src/languages/sqf.js index 8db3c34ccc..e8712a598a 100644 --- a/src/languages/sqf.js +++ b/src/languages/sqf.js @@ -11,7 +11,7 @@ export default function(hljs) { // In SQF, a variable start with _ const VARIABLE = { className: 'variable', - begin: /\b_+[a-zA-Z_]\w*/ + begin: /\b_+[a-zA-Z]\w*/ }; // In SQF, a function should fit myTag_fnc_myFunction pattern diff --git a/src/languages/x86asm.js b/src/languages/x86asm.js index 80cd074477..8d7fd886dc 100644 --- a/src/languages/x86asm.js +++ b/src/languages/x86asm.js @@ -87,7 +87,7 @@ export default function(hljs) { // Float number and x87 BCD { begin: '\\b(?:([0-9][0-9_]*)?\\.[0-9_]*(?:[eE][+-]?[0-9_]+)?|' + - '(0[Xx])?[0-9][0-9_]*\\.?[0-9_]*(?:[pP](?:[+-]?[0-9_]+)?)?)\\b', + '(0[Xx])?[0-9][0-9_]*(\\.[0-9_]*)?(?:[pP](?:[+-]?[0-9_]+)?)?)\\b', relevance: 0 }, diff --git a/src/languages/xquery.js b/src/languages/xquery.js index c34adcf255..f3eae61cb1 100644 --- a/src/languages/xquery.js +++ b/src/languages/xquery.js @@ -149,7 +149,7 @@ export default function(hljs) { // mocha: direct_method const DIRECT = { - begin: /<([\w\._:\-]+)((\s*.*)=('|").*('|"))?>/, + begin: /<([\w\._:\-]+)(\s+\S*=('|").*('|"))?>/, end: /(\/[\w\._:\-]+>)/, subLanguage: 'xml', contains: [ diff --git a/src/languages/yaml.js b/src/languages/yaml.js index 35f9105670..d490528677 100644 --- a/src/languages/yaml.js +++ b/src/languages/yaml.js @@ -102,7 +102,7 @@ export default function(hljs) { // Indentation of subsequent lines must be the same to // be 
considered part of the block className: 'string', - begin: '[\\|>]([1-9]?[+-])?[ ]*\\n( +)[^\\n]+\\n(\\2[^\\n]+\\n?)*' + begin: '[\\|>]([1-9]?[+-])?[ ]*\\n( +)[^ ][^\\n]*\\n(\\2[^\\n]+\\n?)*' }, { // Ruby/Rails erb begin: '<%[%=-]?', diff --git a/src/lib/regex.js b/src/lib/regex.js index 1667752918..db62fcf12b 100644 --- a/src/lib/regex.js +++ b/src/lib/regex.js @@ -25,6 +25,14 @@ export function lookahead(re) { return concat('(?=', re, ')'); } +/** + * @param {RegExp | string } re + * @returns {string} + */ +export function anyNumberOfTimes(re) { + return concat('(', re, ')*'); +} + /** * @param {RegExp | string } re * @returns {string} diff --git a/test/index.js b/test/index.js index 57923ed69c..62a78cdad6 100644 --- a/test/index.js +++ b/test/index.js @@ -19,6 +19,9 @@ require('./detect'); // theses highlighting errors from cropping up again. require('./markup'); +// check regex for fatal issues like exponential backtracking, etc +require('./regex'); + // Tests meant for the browser only. 
Using the `test/fixtures/index.html` file // along with `jsdom` these tests check for things like: custom markup already // existing in the code being highlighted, blocks that disable highlighting, diff --git a/test/markup/dsconfig/default.expect.txt b/test/markup/dsconfig/default.expect.txt index 2ba24df5e9..bcb6aad8ba 100644 --- a/test/markup/dsconfig/default.expect.txt +++ b/test/markup/dsconfig/default.expect.txt @@ -14,11 +14,11 @@ --set enabled:true --set evaluation-order-index:100 \ --set 'connection-criteria:User.1 Connection Criteria' \ # Property without value - --reset maximum-concurrent-connections -# Unquoted property, quoted property value + --reset maximum-concurrent-connections +# Unquoted property, quoted property value dsconfig set-access-control-handler-prop \ - --add global-aci:'(target="ldap:///cn=config")(targetattr="*")(version 3.0; acl "Allow access to the config tree by cn=admin,c=us"; allow(all) groupdn="ldap:///cn=directory administrators,ou=groups,c=us";)' \ - --add global-aci:'(target="ldap:///cn=monitor")(targetattr="*")(version 3.0; acl "Allow access to the monitor tree by cn=admin,c=us"; allow(all) groupdn="ldap:///cn=directory administrators,ou=groups,c=us";)' \ - --remove global-aci:'(target="ldap:///cn=alerts")(targetattr="*")(version 3.0; acl "Allow access to the alerts tree by cn=admin,c=us"; allow(all) groupdn="ldap:///cn=directory administrators,ou=groups,c=us";)' + --add global-aci:'(target="ldap:///cn=config")(targetattr="*")(version 3.0; acl "Allow access to the config tree by cn=admin,c=us"; allow(all) groupdn="ldap:///cn=directory administrators,ou=groups,c=us";)' \ + --add global-aci:'(target="ldap:///cn=monitor")(targetattr="*")(version 3.0; acl "Allow access to the monitor tree by cn=admin,c=us"; allow(all) groupdn="ldap:///cn=directory administrators,ou=groups,c=us";)' \ + --remove global-aci:'(target="ldap:///cn=alerts")(targetattr="*")(version 3.0; acl "Allow access to the alerts tree by cn=admin,c=us"; allow(all) 
groupdn="ldap:///cn=directory administrators,ou=groups,c=us";)' # No continuation dsconfig delete-log-publisher --publisher-name "File-Based Error Logger" diff --git a/test/markup/gcode/default.expect.txt b/test/markup/gcode/default.expect.txt index ad4d148f8c..500e8514c4 100644 --- a/test/markup/gcode/default.expect.txt +++ b/test/markup/gcode/default.expect.txt @@ -25,7 +25,7 @@ N23 IF [#1 LT 0.370] GOTO 49 N24 X-0.678 Y+.990 N25 G84.3 X-0.1 -N26 #4=#5*COS[45] -N27 #4=#5*SIN[45] +N26 #4=#5*COS[45] +N27 #4=#5*SIN[45] N28 VZOFZ=652.9658 % diff --git a/test/regex/index.js b/test/regex/index.js new file mode 100644 index 0000000000..426568ba4f --- /dev/null +++ b/test/regex/index.js @@ -0,0 +1,407 @@ +'use strict'; + +const hljs = require('../../build'); +const { BFS, parseRegex, regexFor } = require('./lib/util.js'); +const { visitRegExpAST } = require('regexpp'); +const { JS, Words, NFA, CharSet } = require('refa'); +const { firstOf, underAStar, isFirstMatch, isAlwaysZeroWidth} = require('./lib/analysis.js'); + +hljs.debugMode(); + +/** + * A map for a regex pattern to whether or not it it vulnerable to exponential backtracking. + * + * @type {Record} + */ +const expBacktrackingCache = {}; + +/** + * A map for a regex pattern to whether or not it it vulnerable to polynomial backtracking. 
+ * + * @type {Record} + */ +const polyBacktrackingCache = {}; + +function retrieveRules(language, { name }) { + // first we need to get the language compiled so we have + // access to the raw regex + hljs.highlight(name, ""); + return regexFor(language, { context: name, depth: 0 }); +} + +function forEachPattern(list, fn) { + const errors = []; + for (const rule of list) { + // console.log(rule) + const ast = parseRegex(rule.re); + fn({ + ast, + pattern: rule.re, + rulePath: rule.path, + reportError: message => errors.push(message) + }); + }; + if (errors.length > 0) { + throw new Error(errors.map(e => String(e.message || e)).join('\n\n')); + } +} + +function testLanguage(languageName) { + const language = hljs.getLanguage(languageName); + const rules = retrieveRules(language, { name: languageName }); + count += rules.length; + describe(languageName, function() { + it("have a name", function() { + language.name.should.not.equal(undefined); + }); + + // it('should not match the empty string', function () { + // forEachPattern(rules, ({ pattern, rulePath }) => { + // ''.should.not.match(pattern, `${rulePath}: ${pattern} should not match the empty string.\n\n` + + // `Patterns that do match the empty string can potentially cause infinitely many empty tokens. ` + + // `Make sure that all patterns always consume at least one character.`); + // }); + // }); + + it(`have ${rules.length} regex matchers`, () => {} ); + + it('should not use octal escapes', function() { + forEachPattern(rules, ({ ast, rulePath, reportError }) => { + visitRegExpAST(ast.pattern, { + onCharacterEnter(node) { + if (/^\\(?:[1-9]|\d{2,})$/.test(node.raw)) { + reportError(`${rulePath}: Octal escape ${node.raw}.\n\n` + + `Octal escapes can be confused with backreferences, so please do not use them.\n` + + `To fix this, use a different escape method. 
` + + `Note that this could also be an invalid backreference, so be sure to carefully analyse the pattern.`); + } + } + }); + }); + }); + + it('should not cause exponential backtracking', function () { + forEachPattern(rules, ({ pattern, ast, rulePath, reportError }) => { + const patternStr = String(pattern); + if (expBacktrackingCache[patternStr] === false) { + // we know that the pattern won't cause exp backtracking because we checked before + return; + } + + const parser = JS.Parser.fromAst(ast); + /** + * Parses the given element and creates its NFA. + * + * @param {import("refa").JS.ParsableElement} element + * @returns {NFA} + */ + function toNFA(element, debug = false) { + const { expression, maxCharacter } = parser.parseElement(element, { + backreferences: "resolve", + lookarounds: "disable", + }); + return NFA.fromRegex(expression, { maxCharacter }); + } + + /** + * Checks whether the alternatives of the given node are disjoint. If the alternatives are not disjoint + * and the give node is a descendant of an effective Kleene star, then an error will be thrown. 
+ * + * @param {CapturingGroup | Group | LookaroundAssertion} node + * @returns {void} + */ + function checkDisjointAlternatives(node) { + if (!underAStar(node) || node.alternatives.length < 2) { + return; + } + + const alternatives = node.alternatives; + + const total = toNFA(alternatives[0]); + total.removeEmptyWord(); + for (let i = 1, l = alternatives.length; i < l; i++) { + const a = alternatives[i]; + const current = toNFA(a); + current.removeEmptyWord(); + + if (!total.isDisjointWith(current)) { + reportError(`${rulePath}: The alternative \`${a.raw}\` is not disjoint with at least one previous alternative.` + + ` This will cause exponential backtracking.` + + `\n\nTo fix this issue, you have to rewrite the ${node.type} \`${node.raw}\`.` + + ` The goal is that all of its alternatives are disjoint.` + + ` This means that if a (sub-)string is matched by the ${node.type}, then only one of its alternatives can match the (sub-)string.` + + `\n\nExample: \`(?:[ab]|\\w|::)+\`` + + `\nThe alternatives of the group are not disjoint because the string "a" can be matched by both \`[ab]\` and \`\\w\`.` + + ` In this example, the pattern by easily fixed because the \`[ab]\` is a subset of the \`\\w\`, so its enough to remove the \`[ab]\` alternative to get \`(?:\\w|::)+\` as the fixed pattern.` + + `\nIn the real world, patterns can be a lot harder to fix.` + + ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.` + + ` A maintainer will help you.` + + `\n\nFull pattern:\n${pattern}`); + } else if (i !== l - 1) { + total.union(current); + } + } + } + + visitRegExpAST(ast.pattern, { + onCapturingGroupLeave: checkDisjointAlternatives, + onGroupLeave: checkDisjointAlternatives, + onAssertionLeave(node) { + if (node.kind === "lookahead" || node.kind === "lookbehind") { + checkDisjointAlternatives(node); + } + }, + + onQuantifierLeave(node) { + if (node.max < 10) { + return; // not a 
star + } + if (node.element.type !== "CapturingGroup" && node.element.type !== "Group") { + return; // not a group + } + + // The idea here is the following: + // + // We have found a part `A*` of the regex (`A` is assumed to not accept the empty word). Let `I` be + // the intersection of `A` and `A{2,}`. If `I` is not empty, then there exists a non-empty word `w` + // that is accepted by both `A` and `A{2,}`. That means that there exists some `m>1` for which `w` + // is accepted by `A{m}`. + // This means that there are at least two ways `A*` can accept `w`. It can be accepted as `A` or as + // `A{m}`. Hence there are at least 2^n ways for `A*` to accept the word `w{n}`. This is the main + // requirement for exponential backtracking. + // + // This is actually only a crude approximation for the real analysis that would have to be done. We + // would actually have to check the intersection `A{p}` and `A{p+1,}` for all p>0. However, in most + // cases, the approximation is good enough. + + const nfa = toNFA(node.element, true); + nfa.removeEmptyWord(); + const twoStar = nfa.copy(); + twoStar.quantify(2, Infinity); + + if (!nfa.isDisjointWith(twoStar)) { + const example = Words.fromUnicodeToString(firstOf(NFA.intersectionWords(nfa, twoStar))); + + reportError(`${rulePath}: The quantifier \`${node.raw}\` ambiguous for all words ${JSON.stringify(example)}.repeat(n) for any n>1.` + + ` This will cause exponential backtracking.` + + `\n\nTo fix this issue, you have to rewrite the element (let's call it E) of the quantifier.` + + ` The goal is modify E such that it is disjoint with repetitions of itself.` + + ` This means that if a (sub-)string is matched by E, then it must not be possible for E{2}, E{3}, E{4}, etc. 
to match that (sub-)string.` + + `\n\nExample: \`(?:\\w+|::)+\`` + + `\nThe problem lies in \`\\w+\` because \`\\w+\` and \`(?:\\w+){2}\` are not disjoint as the string "aa" is fully matched by both.` + + ` In this example, the pattern by easily fixed by changing \`\\w+\` to \`\\w\`.` + + `\nIn the real world, patterns can be a lot harder to fix.` + + ` If you are trying to make the tests pass for a pull request but can\'t fix the issue yourself, then make the pull request (or commit) anyway.` + + ` A maintainer will help you.` + + `\n\nFull pattern:\n${pattern}`); + } + }, + }); + + expBacktrackingCache[patternStr] = false; + }); + }); + it('should not cause polynomial backtracking', function () { + forEachPattern(rules, ({ pattern, ast, rulePath, reportError }) => { + const patternStr = String(pattern); + if (polyBacktrackingCache[patternStr] === false) { + // we know that the pattern won't cause poly backtracking because we checked before + return; + } + + const EMPTY = ast.flags.unicode ? 
CharSet.empty(0x10FFFF) : CharSet.empty(0xFFFF); + + /** + * @param {Node} node + * @returns {CharSet} + */ + function toCharSet(node) { + switch (node.type) { + case "Alternative": { + if (node.elements.length === 1) { + return toCharSet(node.elements[0]); + } + return EMPTY; + } + case "CapturingGroup": + case "Group": { + let total = EMPTY; + for (const item of node.alternatives) { + total = total.union(toCharSet(item)); + } + return total; + } + case "Character": + return JS.createCharSet([node.value], ast.flags); + case "CharacterClass": { + const value = JS.createCharSet(node.elements.map(x => { + if (x.type === "CharacterSet") { + return x; + } else if (x.type === "Character") { + return x.value; + } else { + return { min: x.min.value, max: x.max.value }; + } + }), ast.flags); + if (node.negate) { + return value.negate(); + } else { + return value; + } + } + case "CharacterSet": + return JS.createCharSet([node], ast.flags); + + default: + return EMPTY; + } + } + + /** + * @param {Element} from + * @returns {Element | null} + */ + function getAfter(from) { + const parent = from.parent; + if (parent.type === "Quantifier") { + return getAfter(parent); + } else if (parent.type === "Alternative") { + const index = parent.elements.indexOf(from); + const after = parent.elements[index + 1]; + if (after) { + return after; + } else { + const grandParent = parent.parent; + if (grandParent.type === "Pattern") { + return null; + } else { + return getAfter(grandParent); + } + } + } else { + throw Error("Unreachable"); + } + } + + visitRegExpAST(ast.pattern, { + onQuantifierLeave(node) { + if (node.max !== Infinity) { + return; + } + const char = toCharSet(node.element); + tryReachUntil(getAfter(node), char, null); + + /** + * @param {Quantifier} quantifier + * @param {CharSet} char + */ + function assertNoPoly(quantifier, char) { + if (quantifier.max === Infinity) { + const qChar = toCharSet(quantifier.element); + if (qChar && !qChar.isDisjointWith(char)) { + const 
intersection = qChar.intersect(char); + const literal = JS.toLiteral({ + type: "Concatenation", + elements: [ + { type: "CharacterClass", characters: intersection } + ] + }) + const lang = `/${literal.source}/${literal.flags}`; + + const rangeStr = patternStr.substring(node.start + 1, quantifier.end + 1); + const rangeHighlight = `^${"~".repeat(node.end - node.start - 1)}${" ".repeat(quantifier.start - node.end)}^${"~".repeat(quantifier.end - quantifier.start - 1)}`; + + reportError(`${rulePath}: Polynomial backtracking. By repeating any character that matches ${lang}, an attack string can be created.\n\n ${rangeStr}\n ${rangeHighlight}\n\nFull pattern:\n${patternStr}\n${" ".repeat(node.start + 1)}${rangeHighlight}`); + } + } + } + + /** + * @param {Element | null | undefined} element + * @param {CharSet} char + * @param {Element | null | undefined} until + * @returns {CharSet} + */ + function tryReachUntil(element, char, until) { + if (!element || element == until || char.isEmpty) { + return char; + } + + const after = getAfter(element); + + if (element.type === "Quantifier") { + assertNoPoly(element, char); + } + + return tryReachUntil(after, goInto(element, after, char), until); + } + + /** + * @param {Element} element + * @param {Element} after + * @param {CharSet} char + * @returns {CharSet} + */ + function goInto(element, after, char) { + switch (element.type) { + case "Assertion": { + if (element.kind === "lookahead" || element.kind === "lookbehind") { + for (const alt of element.alternatives) { + if (alt.elements.length > 0) { + tryReachUntil(alt.elements[0], char, after); + } + } + } + return EMPTY; + } + case "Group": + case "CapturingGroup": { + let total = EMPTY; + for (const alt of element.alternatives) { + if (alt.elements.length > 0) { + total = total.union(tryReachUntil(alt.elements[0], char, after)); + } else { + total = char; + } + } + return total; + } + case "Character": + case "CharacterClass": + case "CharacterSet": { + return 
char.intersect(toCharSet(element)); + } + case "Quantifier": { + if (element.min === 0) { + goInto(element.element, after, char); + return char; + } else { + return goInto(element.element, after, char); + } + } + default: + return EMPTY; + } + } + }, + }); + + polyBacktrackingCache[patternStr] = false; + }); + }); + }); +} + +let count = 0; +let languages = hljs.listLanguages(); +if (process.env.ONLY_LANG) { + languages = [process.env.ONLY_LANG]; +} + +for (const language of languages) { + testLanguage(language); +} + +describe("COMBINED: All grammars", () => { + it(`have ${count} total regex`, () => {}); +}); diff --git a/test/regex/lib/analysis.js b/test/regex/lib/analysis.js new file mode 100644 index 0000000000..3b14fbcb4a --- /dev/null +++ b/test/regex/lib/analysis.js @@ -0,0 +1,87 @@ +/** + * Returns whether the given element will always have zero width meaning that it doesn't consume characters. + * + * @param {Element} element + * @returns {boolean} + */ +function isAlwaysZeroWidth(element) { + switch (element.type) { + case 'Assertion': + // assertions == ^, $, \b, lookarounds + return true; + case 'Quantifier': + return element.max === 0 || isAlwaysZeroWidth(element.element); + case 'CapturingGroup': + case 'Group': + // every element in every alternative has to be of zero length + return element.alternatives.every(alt => alt.elements.every(isAlwaysZeroWidth)); + case 'Backreference': + // on if the group referred to is of zero length + return isAlwaysZeroWidth(element.resolved); + default: + return false; // what's left are characters + } +} + +/** + * Returns whether the given element will always at the start of the whole match. 
+ * + * @param {Element} element + * @returns {boolean} + */ +function isFirstMatch(element) { + const parent = element.parent; + switch (parent.type) { + case 'Alternative': + // all elements before this element have to be of zero length + if (!parent.elements.slice(0, parent.elements.indexOf(element)).every(isAlwaysZeroWidth)) { + return false; + } + const grandParent = parent.parent; + if (grandParent.type === 'Pattern') { + return true; + } else { + return isFirstMatch(grandParent); + } + + case 'Quantifier': + if (parent.max >= 2) { + return false; + } else { + return isFirstMatch(parent); + } + + default: + throw new Error(`Internal error: The given node should not be a '${element.type}'.`); + } +} + +/** + * Returns whether the given node either is or is a child of what is effectively a Kleene star. + * + * @param {import("regexpp/ast").Node} node + * @returns {boolean} + */ +function underAStar(node) { + if (node.type === "Quantifier" && node.max > 10) { + return true; + } else if (node.parent) { + return underAStar(node.parent); + } else { + return false; + } +} + +/** + * @param {Iterable<T>} iter + * @returns {T | undefined} + * @template T + */ +function firstOf(iter) { + for (const item of iter) { + return item; + } + return undefined; +} + +module.exports = { firstOf, underAStar, isFirstMatch, isAlwaysZeroWidth}; diff --git a/test/regex/lib/util.js b/test/regex/lib/util.js new file mode 100644 index 0000000000..f323a37264 --- /dev/null +++ b/test/regex/lib/util.js @@ -0,0 +1,107 @@ +/* eslint-disable no-undefined */ + +const { RegExpParser } = require('regexpp'); + +/** + * @typedef {import("regexpp/ast").Pattern} Pattern + * @typedef {import("regexpp/ast").Flags} Flags + * @typedef {{ pattern: Pattern, flags: Flags }} LiteralAST + */ + +const parser = new RegExpParser({ strict: false, ecmaVersion: 6 }); +/** @type {Map<string, LiteralAST>} */ +const astCache = new Map(); + +// exclude our common "match anything" matchers +function matchAny(re) { + return re.source === 
"\\B|\\b"; +} + +function regexFor(mode, { context, depth }) { + if (mode.analyzed) return []; + mode.analyzed = true; + + let list = []; + if (mode.beginRe && !matchAny(mode.beginRe)) list.push({ path: `${context}/begin`, re: mode.beginRe }); + if (mode.endRe && !matchAny(mode.endRe)) list.push({ path: `${context}/end`, re: mode.endRe }); + if (mode.illegalRe) list.push({ path: `${context}/illegal`, re: mode.illegalRe }); + if (mode.keywordPatternRe && mode.keywordPatternRe.source !== "\\w+") { + list.push({ path: `${context}/$keyword_pattern`, re: mode.keywordPatternRe }); + } + if (mode.contains.length) { + mode.contains.forEach((mode, i) => { + const nodeName = `[${i}]${mode.className || ""}`; + const modes = regexFor(mode, { context: `${context}/${nodeName}`, depth: depth + 1 }); + list = [...list, ...modes]; + }); + } + if (mode.starts) { + const nodeName = "$starts"; + const modes = regexFor(mode.starts, { context: `${context}/${nodeName}`, depth: depth + 1 }); + list = [...list, ...modes]; + } + return list; +} + +/** + * Performs a breadth-first search on the given start element, invoking `callback` with the path (root entry first, its key is null) for the start value and for every property reached. + * + * @param {any} start + * @param {(path: { key: string | null, value: any }[]) => void} callback + */ +const BFS = (start, callback) => { + const visited = new Set(); + /** @type {{ key: string | null, value: any }[][]} */ + let toVisit = [ + [{ key: null, value: start }] + ]; + + callback(toVisit[0]); + + while (toVisit.length > 0) { + /** @type {{ key: string | null, value: any }[][]} */ + const newToVisit = []; + + for (const path of toVisit) { + const obj = path[path.length - 1].value; + if (!visited.has(obj)) { + visited.add(obj); + + for (const key in obj) { + const value = obj[key]; + + path.push({ key, value }); + callback(path); + + if (Array.isArray(value) || Object.prototype.toString.call(value) === '[object Object]') { + newToVisit.push([...path]); + } + + path.pop(); + } + } + } + + toVisit = newToVisit; + } +}; + +/** + * Returns the AST of a given regex, cached in astCache under the regex's string form. 
+ * + * @param {RegExp} regex + * @returns {LiteralAST} + */ +const parseRegex = (regex) => { + const key = regex.toString(); + let literal = astCache.get(key); + if (literal === undefined) { + const flags = parser.parseFlags(regex.flags, undefined); + const pattern = parser.parsePattern(regex.source, undefined, undefined, flags.unicode); + literal = { pattern, flags }; + astCache.set(key, literal); + } + return literal; +}; + +module.exports = { BFS, regexFor, parseRegex };