Use list of legal chars in URL from the WHATWG standard

Notably, this excludes some ASCII characters: <>{}[]`| (see https://url.spec.whatwg.org/#url-code-points). Fixes #7095
kovidgoyal · Feb 5, 2024 · 8cc2cad · 8cc2cad
1 parent 5f8e5b0
commit 8cc2cad
Show file tree

Hide file tree

Showing 11 changed files with 187 additions and 16 deletions.
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -58,6 +58,12 @@ Detailed list of changes
   row/column boxes by resizing them using linear instead of nearest neighbor
   interpolation on the GPU (:iss:`7070`)
 
+- When matching URLs use the definition of legal characters in URLs from the
+  `WHATWG spec <https://url.spec.whatwg.org/#url-code-points>`__ rather than older standards (:iss:`7095`)
+
+- hints kitten: Respect the kitty :opt:`url_excluded_characters` option
+  (:iss:`7075`)
+
 0.32.1 [2024-01-26]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/gen/go_code.py b/gen/go_code.py
@@ -609,13 +609,13 @@ def generate_constants() -> str:
 var DocTitleMap = map[string]string{serialize_go_dict(ref_map['doc'])}
 var AllowedShellIntegrationValues = []string{{ {str(sorted(allowed_shell_integration_values))[1:-1].replace("'", '"')} }}
 var KittyConfigDefaults = struct {{
-Term, Shell_integration, Select_by_word_characters, Shell string
+Term, Shell_integration, Select_by_word_characters, Url_excluded_characters, Shell string
 Wheel_scroll_multiplier int
 Url_prefixes []string
 }}{{
 Term: "{Options.term}", Shell_integration: "{' '.join(Options.shell_integration)}", Url_prefixes: []string{{ {url_prefixes} }},
 Select_by_word_characters: `{Options.select_by_word_characters}`, Wheel_scroll_multiplier: {Options.wheel_scroll_multiplier},
-Shell: "{Options.shell}",
+Shell: "{Options.shell}", Url_excluded_characters: "{Options.url_excluded_characters}",
 }}
 '''  # }}}
 

diff --git a/gen/wcwidth.py b/gen/wcwidth.py
@@ -404,6 +404,7 @@ def gen_ucd() -> None:
     cz = {c for c in class_maps if c[0] in 'CZ'}
     with create_header('kitty/unicode-data.c') as p:
         p('#include "unicode-data.h"')
+        p('START_ALLOW_CASE_RANGE')
         category_test(
                 'is_combining_char', p,
                 (),
@@ -553,7 +554,6 @@ def gen_rowcolumn_diacritics() -> None:
 
     go_file = 'tools/utils/images/rowcolumn_diacritics.go'
     with create_header('kitty/rowcolumn-diacritics.c') as p, create_header(go_file, include_data_types=False) as g:
-        p('#include "unicode-data.h"')
         p('int diacritic_to_num(char_type code) {')
         p('\tswitch (code) {')
         g('package images')

diff --git a/kittens/hints/main.py b/kittens/hints/main.py
@@ -148,6 +148,13 @@ def custom_marking() -> None:
 prefixes defined by the :opt:`url_prefixes` option in :file:`kitty.conf`.
 
 
+--url-excluded-characters
+default=default
+Characters to exclude when matching URLs. Defaults to the list of characters
+defined by the :opt:`url_excluded_characters` option in :file:`kitty.conf`.
+The syntax for this option is the same as for :opt:`url_excluded_characters`.
+
+
 --word-characters
 Characters to consider as part of a word. In addition, all characters marked as
 alphanumeric in the Unicode database will be considered as word characters.

diff --git a/kittens/hints/marks.go b/kittens/hints/marks.go
@@ -207,17 +207,24 @@ var PostProcessorMap = sync.OnceValue(func() map[string]PostProcessorFunc {
 
+// KittyOpts holds the kitty configuration values that the hints kitten uses
+// when building its match patterns.
 type KittyOpts struct {
 	Url_prefixes              *utils.Set[string]
+	// Url_excluded_characters is the parsed value of the
+	// url_excluded_characters option.
+	Url_excluded_characters   string
 	Select_by_word_characters string
 }
 
 func read_relevant_kitty_opts(path string) KittyOpts {
-	ans := KittyOpts{Select_by_word_characters: kitty.KittyConfigDefaults.Select_by_word_characters}
+	ans := KittyOpts{
+		Select_by_word_characters: kitty.KittyConfigDefaults.Select_by_word_characters,
+		Url_excluded_characters:   kitty.KittyConfigDefaults.Url_excluded_characters}
 	handle_line := func(key, val string) error {
 		switch key {
 		case "url_prefixes":
 			ans.Url_prefixes = utils.NewSetWithItems(strings.Split(val, " ")...)
 		case "select_by_word_characters":
 			ans.Select_by_word_characters = strings.TrimSpace(val)
+		case "url_excluded_characters":
+			if s, err := config.StringLiteral(val); err == nil {
+				ans.Url_excluded_characters = s
+			}
 		}
 		return nil
 	}
@@ -236,7 +243,111 @@ var RelevantKittyOpts = sync.OnceValue(func() KittyOpts {
 var debugprintln = tty.DebugPrintln
 var _ = debugprintln
 
-func functions_for(opts *Options) (pattern string, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc) {
+// url_excluded_characters_as_ranges_for_regex builds the body of a regex
+// character class listing every character that must not appear in a URL.
+// The caller wraps the result as `[^...]`.
+//
+// The legal set follows https://url.spec.whatwg.org/#url-code-points, with
+// '#' and '%' also accepted. '\n' and '\r' are treated as legal unless
+// extra_excluded contains them. Everything else below U+00A0, the surrogate
+// range and the Unicode non-characters are excluded, followed by the
+// characters in extra_excluded.
+func url_excluded_characters_as_ranges_for_regex(extra_excluded string) string {
+	ans := strings.Builder{}
+	ans.Grow(4096)
+	type cr struct{ start, end rune }
+	// Legal characters below U+00A0. The gaps between them are what gets
+	// written to the character class.
+	ranges := []cr{}
+	r := func(start rune, end ...rune) {
+		if len(end) == 0 {
+			ranges = append(ranges, cr{start, start})
+		} else {
+			ranges = append(ranges, cr{start, end[0]})
+		}
+	}
+	if !strings.Contains(extra_excluded, "\n") {
+		r('\n')
+	}
+	if !strings.Contains(extra_excluded, "\r") {
+		r('\r')
+	}
+	for _, ch := range "!$&#'/:;@_~()*+,-.=?%" {
+		r(ch)
+	}
+	r('a', 'z')
+	r('A', 'Z')
+	r('0', '9')
+	slices.SortFunc(ranges, func(a, b cr) int { return int(a.start - b.start) })
+	var prev rune = -1
+	for _, cr := range ranges {
+		first, last := prev+1, cr.start-1
+		switch {
+		case first == last:
+			// A gap of exactly one character ('"', '<', '>' and '`' are such
+			// gaps) must be excluded too; it is written as a lone character.
+			ans.WriteString(regexp.QuoteMeta(string(first)))
+		case first < last:
+			ans.WriteString(regexp.QuoteMeta(string(first)))
+			ans.WriteRune('-')
+			ans.WriteString(regexp.QuoteMeta(string(last)))
+		}
+		prev = cr.end
+	}
+	// Everything after the last legal character up to U+009F.
+	ans.WriteString(regexp.QuoteMeta(string(prev + 1)))
+	ans.WriteRune('-')
+	ans.WriteRune(0x9f)
+	ans.WriteString(`\x{d800}-\x{dfff}`)
+	ans.WriteString(`\x{fdd0}-\x{fdef}`)
+	// The last two code points of every plane are non-characters, including
+	// U+10FFFE and U+10FFFF in plane 16.
+	for plane := rune(0); plane <= 0x10; plane++ {
+		ans.WriteRune(plane<<16 | 0xFFFE)
+		ans.WriteRune(plane<<16 | 0xFFFF)
+	}
+	// QuoteMeta does not escape '-', so a user supplied '-' is moved to the
+	// very end of the class where it cannot form a range.
+	if strings.Contains(extra_excluded, "-") {
+		extra_excluded = strings.ReplaceAll(extra_excluded, "-", "")
+		extra_excluded = regexp.QuoteMeta(extra_excluded) + "-"
+	} else {
+		extra_excluded = regexp.QuoteMeta(extra_excluded)
+	}
+	ans.WriteString(extra_excluded)
+	return ans.String()
+}
+
+func functions_for(opts *Options) (pattern string, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc, err error) {
 	switch opts.Type {
 	case "url":
 		var url_prefixes *utils.Set[string]
@@ -245,7 +356,14 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess
 		} else {
 			url_prefixes = utils.NewSetWithItems(strings.Split(opts.UrlPrefixes, ",")...)
 		}
-		pattern = fmt.Sprintf(`(?:%s)://[^%s]{3,}`, strings.Join(url_prefixes.AsSlice(), "|"), URL_DELIMITERS)
+		url_excluded_characters := RelevantKittyOpts().Url_excluded_characters
+		if opts.UrlExcludedCharacters != "default" {
+			if url_excluded_characters, err = config.StringLiteral(opts.UrlExcludedCharacters); err != nil {
+				err = fmt.Errorf("Failed to parse --url-excluded-characters value: %#v with error: %w", opts.UrlExcludedCharacters, err)
+				return
+			}
+		}
+		pattern = fmt.Sprintf(`(?:%s)://[^%s]{3,}`, strings.Join(url_prefixes.AsSlice(), "|"), url_excluded_characters_as_ranges_for_regex(url_excluded_characters))
 		post_processors = append(post_processors, PostProcessorMap()["url"])
 	case "path":
 		pattern = path_regex()
@@ -530,7 +648,10 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
 	sanitized_text, hyperlinks := process_escape_codes(text)
 
 	run_basic_matching := func() error {
-		pattern, post_processors, group_processors := functions_for(opts)
+		pattern, post_processors, group_processors, err := functions_for(opts)
+		if err != nil {
+			return err
+		}
 		r, err := regexp2.Compile(pattern, regexp2.RE2)
 		if err != nil {
 			return fmt.Errorf("Failed to compile the regex pattern: %#v with error: %w", pattern, err)

diff --git a/kittens/hints/marks_test.go b/kittens/hints/marks_test.go
@@ -56,6 +56,7 @@ func TestHintMarking(t *testing.T) {
 	reset()
 	u := `http://test.me/`
 	r(u, u)
+	r(u+"#fragme", u+"#fragme")
 	r(`"`+u+`"`, u)
 	r("("+u+")", u)
 	cols = len(u)

diff --git a/kitty/rowcolumn-diacritics.c b/kitty/rowcolumn-diacritics.c
diff --git a/kitty/unicode-data.c b/kitty/unicode-data.c
diff --git a/kitty/unicode-data.h b/kitty/unicode-data.h
@@ -27,9 +27,43 @@ is_excluded_from_url(uint32_t ch) {
     return false;
 }
 
+// Return true if ch is allowed to appear in a URL. The accepted set is the
+// WHATWG list of URL code points, with '%' and '#' accepted as well.
+static inline bool
+is_url_legal_char(uint32_t ch) {
+    // NOTE(review): START/END_ALLOW_CASE_RANGE presumably suppress compiler
+    // diagnostics for the `a ... b` case ranges used below -- confirm.
+    START_ALLOW_CASE_RANGE
+    // See https://url.spec.whatwg.org/#url-code-points
+    // Below U+00A0 only the explicitly listed punctuation and the ASCII
+    // alphanumerics are legal; everything else in that range is rejected.
+    if (ch < 0xa0) {
+        switch (ch) {
+            case '!': case '$': case '&': case '\'': case '/': case ':': case ';': case '@': case '_': case '~':
+            case '(': case ')': case '*': case '+': case ',': case '-': case '.': case '=': case '?': case '%': case '#':
+            case 'a' ... 'z':
+            case 'A' ... 'Z':
+            case '0' ... '9':
+                return true;
+            default:
+                return false;
+        }
+    }
+    // From U+00A0 upwards everything is legal except the cases rejected below.
+    if (ch > 0x10fffd) return false;  // outside valid unicode range
+    if (0xd800 <= ch && ch <= 0xdfff) return false; // leading or trailing surrogate
+    // non-characters
+    switch (ch) {
+        case 0xfdd0 ... 0xfdef:
+        case 0xFFFE: case 0xFFFF: case 0x1FFFE: case 0x1FFFF: case 0x2FFFE: case 0x2FFFF:
+        case 0x3FFFE: case 0x3FFFF: case 0x4FFFE: case 0x4FFFF: case 0x5FFFE: case 0x5FFFF:
+        case 0x6FFFE: case 0x6FFFF: case 0x7FFFE: case 0x7FFFF: case 0x8FFFE: case 0x8FFFF:
+        case 0x9FFFE: case 0x9FFFF: case 0xAFFFE: case 0xAFFFF: case 0xBFFFE: case 0xBFFFF:
+        case 0xCFFFE: case 0xCFFFF: case 0xDFFFE: case 0xDFFFF: case 0xEFFFE: case 0xEFFFF:
+        case 0xFFFFE: case 0xFFFFF:
+            return false;
+        default:
+            return true;
+    }
+    END_ALLOW_CASE_RANGE
+}
+
+// A character counts as part of a URL when is_url_legal_char() accepts it and
+// is_excluded_from_url() does not reject it.
 static inline bool
 is_url_char(uint32_t ch) {
-    return ch && !is_CZ_category(ch) && !is_excluded_from_url(ch);
+    return is_url_legal_char(ch) && !is_excluded_from_url(ch);
 }
 
 static inline bool

diff --git a/kitty_tests/datatypes.py b/kitty_tests/datatypes.py
@@ -266,16 +266,17 @@ def create(t):
         l0 = create('file:///etc/test')
         self.ae(l0.url_start_at(0), 0)
 
-        for trail in '.,\\':
+        for trail in '.,\\}]>':
             lx = create("http://xyz.com" + trail)
             self.ae(lx.url_end_at(0), len(lx) - 2)
-        for trail in ')}]>':
-            lx = create("http://xyz.com" + trail)
-            self.ae(lx.url_end_at(0), len(lx) - 1)
+        for trail in ')':
+            turl = "http://xyz.com" + trail
+            lx = create(turl)
+            self.ae(len(lx) - 1, lx.url_end_at(0), repr(turl))
         l0 = create("ftp://abc/")
         self.ae(l0.url_end_at(0), len(l0) - 1)
         l2 = create("http://-abcd] ")
-        self.ae(l2.url_end_at(0), len(l2) - 2)
+        self.ae(l2.url_end_at(0), len(l2) - 3)
         l3 = create("http://ab.de           ")
         self.ae(l3.url_start_at(4), 0)
         self.ae(l3.url_start_at(5), 0)

diff --git a/kitty_tests/screen.py b/kitty_tests/screen.py
@@ -1017,11 +1017,12 @@ def t(url, x=0, y=0, before='', after=''):
 
         t('http://moo.com')
         t('http://moo.com/something?else=+&what-')
+        t('http://moo.com#fragme')
         for (st, e) in '() {} [] <>'.split():
             t('http://moo.com', before=st, after=e)
-        for trailer in ')-=]}':
+        for trailer in ')-=':
             t('http://moo.com' + trailer)
-        for trailer in '{([':
+        for trailer in '{([]}<>':
             t('http://moo.com', after=trailer)
         t('http://moo.com', x=s.columns - 9)
         t('https://wraps-by-one-char.com', before='[', after=']')