Skip to content

Commit

Permalink
Use list of legal chars in URL from the WHATWG standard
Browse files Browse the repository at this point in the history
Notably this excludes some ASCII chars: <>{}[]`|
See https://url.spec.whatwg.org/#url-code-points

Fixes #7095
  • Loading branch information
kovidgoyal committed Feb 5, 2024
1 parent 5f8e5b0 commit 8cc2cad
Show file tree
Hide file tree
Showing 11 changed files with 187 additions and 16 deletions.
6 changes: 6 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ Detailed list of changes
row/column boxes by resizing them using linear instead of nearest neighbor
interpolation on the GPU (:iss:`7070`)

- When matching URLs use the definition of legal characters in URLs from the
`WHATWG spec <https://url.spec.whatwg.org/#url-code-points>`__ rather than older standards (:iss:`7095`)

- hints kitten: Respect the kitty :opt:`url_excluded_characters` option
(:iss:`7075`)

0.32.1 [2024-01-26]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
4 changes: 2 additions & 2 deletions gen/go_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,13 +609,13 @@ def generate_constants() -> str:
var DocTitleMap = map[string]string{serialize_go_dict(ref_map['doc'])}
var AllowedShellIntegrationValues = []string{{ {str(sorted(allowed_shell_integration_values))[1:-1].replace("'", '"')} }}
var KittyConfigDefaults = struct {{
Term, Shell_integration, Select_by_word_characters, Shell string
Term, Shell_integration, Select_by_word_characters, Url_excluded_characters, Shell string
Wheel_scroll_multiplier int
Url_prefixes []string
}}{{
Term: "{Options.term}", Shell_integration: "{' '.join(Options.shell_integration)}", Url_prefixes: []string{{ {url_prefixes} }},
Select_by_word_characters: `{Options.select_by_word_characters}`, Wheel_scroll_multiplier: {Options.wheel_scroll_multiplier},
Shell: "{Options.shell}",
Shell: "{Options.shell}", Url_excluded_characters: "{Options.url_excluded_characters}",
}}
''' # }}}

Expand Down
2 changes: 1 addition & 1 deletion gen/wcwidth.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,7 @@ def gen_ucd() -> None:
cz = {c for c in class_maps if c[0] in 'CZ'}
with create_header('kitty/unicode-data.c') as p:
p('#include "unicode-data.h"')
p('START_ALLOW_CASE_RANGE')
category_test(
'is_combining_char', p,
(),
Expand Down Expand Up @@ -553,7 +554,6 @@ def gen_rowcolumn_diacritics() -> None:

go_file = 'tools/utils/images/rowcolumn_diacritics.go'
with create_header('kitty/rowcolumn-diacritics.c') as p, create_header(go_file, include_data_types=False) as g:
p('#include "unicode-data.h"')
p('int diacritic_to_num(char_type code) {')
p('\tswitch (code) {')
g('package images')
Expand Down
7 changes: 7 additions & 0 deletions kittens/hints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,13 @@ def custom_marking() -> None:
prefixes defined by the :opt:`url_prefixes` option in :file:`kitty.conf`.
--url-excluded-characters
default=default
Characters to exclude when matching URLs. Defaults to the list of characters
defined by the :opt:`url_excluded_characters` option in :file:`kitty.conf`.
The syntax for this option is the same as for :opt:`url_excluded_characters`.
--word-characters
Characters to consider as part of a word. In addition, all characters marked as
alphanumeric in the Unicode database will be considered as word characters.
Expand Down
129 changes: 125 additions & 4 deletions kittens/hints/marks.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,17 +207,24 @@ var PostProcessorMap = sync.OnceValue(func() map[string]PostProcessorFunc {

type KittyOpts struct {
Url_prefixes *utils.Set[string]
Url_excluded_characters string
Select_by_word_characters string
}

func read_relevant_kitty_opts(path string) KittyOpts {
ans := KittyOpts{Select_by_word_characters: kitty.KittyConfigDefaults.Select_by_word_characters}
ans := KittyOpts{
Select_by_word_characters: kitty.KittyConfigDefaults.Select_by_word_characters,
Url_excluded_characters: kitty.KittyConfigDefaults.Url_excluded_characters}
handle_line := func(key, val string) error {
switch key {
case "url_prefixes":
ans.Url_prefixes = utils.NewSetWithItems(strings.Split(val, " ")...)
case "select_by_word_characters":
ans.Select_by_word_characters = strings.TrimSpace(val)
case "url_excluded_characters":
if s, err := config.StringLiteral(val); err == nil {
ans.Url_excluded_characters = s
}
}
return nil
}
Expand All @@ -236,7 +243,111 @@ var RelevantKittyOpts = sync.OnceValue(func() KittyOpts {
var debugprintln = tty.DebugPrintln
var _ = debugprintln

func functions_for(opts *Options) (pattern string, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc) {
// url_excluded_characters_as_ranges_for_regex returns the body of a regular
// expression character class (the text placed between "[^" and "]") that
// lists every code point which must not be treated as part of a URL.
//
// The excluded set is the complement of the URL code points defined at
// https://url.spec.whatwg.org/#url-code-points, with '%' and '#' additionally
// treated as legal, plus every character in extra_excluded. '\n' and '\r' are
// left out of the excluded set unless the caller lists them in extra_excluded
// (presumably so that URLs broken across lines still match; confirm against
// the callers).
//
// Range end points and the extra characters are escaped with regexp.QuoteMeta.
// A '-' in extra_excluded is moved to the very end of the result so that it is
// read as a literal rather than as a range operator.
func url_excluded_characters_as_ranges_for_regex(extra_excluded string) string {
	ans := strings.Builder{}
	ans.Grow(4096)

	// Collect the ranges of legal characters below U+00A0.
	type cr struct{ start, end rune }
	ranges := make([]cr, 0, 32)
	r := func(start rune, end ...rune) {
		if len(end) == 0 {
			ranges = append(ranges, cr{start, start})
		} else {
			ranges = append(ranges, cr{start, end[0]})
		}
	}
	if !strings.Contains(extra_excluded, "\n") {
		r('\n')
	}
	if !strings.Contains(extra_excluded, "\r") {
		r('\r')
	}
	for _, ch := range "!$&#'/:;@_~()*+,-.=?%" {
		r(ch)
	}
	r('a', 'z')
	r('A', 'Z')
	r('0', '9')
	slices.SortFunc(ranges, func(a, b cr) int { return int(a.start - b.start) })

	// write_excluded emits the inclusive range first..last, collapsing it to
	// a single character when both end points are the same.
	write_excluded := func(first, last rune) {
		ans.WriteString(regexp.QuoteMeta(string(first)))
		if last > first {
			ans.WriteRune('-')
			ans.WriteString(regexp.QuoteMeta(string(last)))
		}
	}

	// Everything between two consecutive legal ranges is excluded. The test
	// must also accept gaps that are exactly one code point wide, otherwise
	// '"', '<', '>' and '`' are silently left out of the excluded set.
	var prev rune = -1
	for _, legal := range ranges {
		if legal.start > prev+1 {
			write_excluded(prev+1, legal.start-1)
		}
		prev = legal.end
	}
	// From the code point after the last legal ASCII character up to U+009F.
	ans.WriteString(regexp.QuoteMeta(string(prev + 1)))
	ans.WriteRune('-')
	ans.WriteRune(0x9f)

	// Surrogates and the contiguous block of non-characters.
	ans.WriteString(`\x{d800}-\x{dfff}`)
	ans.WriteString(`\x{fdd0}-\x{fdef}`)
	// The last two code points of every plane are non-characters. Plane 16
	// is included because URL code points end at U+10FFFD.
	for plane := rune(0); plane <= 0x10; plane++ {
		ans.WriteRune(plane<<16 | 0xFFFE)
		ans.WriteRune(plane<<16 | 0xFFFF)
	}

	if strings.Contains(extra_excluded, "-") {
		// A literal '-' is only unambiguous at the end of a character class.
		extra_excluded = strings.ReplaceAll(extra_excluded, "-", "")
		extra_excluded = regexp.QuoteMeta(extra_excluded) + "-"
	} else {
		extra_excluded = regexp.QuoteMeta(extra_excluded)
	}
	ans.WriteString(extra_excluded)
	return ans.String()
}

func functions_for(opts *Options) (pattern string, post_processors []PostProcessorFunc, group_processors []GroupProcessorFunc, err error) {
switch opts.Type {
case "url":
var url_prefixes *utils.Set[string]
Expand All @@ -245,7 +356,14 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess
} else {
url_prefixes = utils.NewSetWithItems(strings.Split(opts.UrlPrefixes, ",")...)
}
pattern = fmt.Sprintf(`(?:%s)://[^%s]{3,}`, strings.Join(url_prefixes.AsSlice(), "|"), URL_DELIMITERS)
url_excluded_characters := RelevantKittyOpts().Url_excluded_characters
if opts.UrlExcludedCharacters != "default" {
if url_excluded_characters, err = config.StringLiteral(opts.UrlExcludedCharacters); err != nil {
err = fmt.Errorf("Failed to parse --url-excluded-characters value: %#v with error: %w", opts.UrlExcludedCharacters, err)
return
}
}
pattern = fmt.Sprintf(`(?:%s)://[^%s]{3,}`, strings.Join(url_prefixes.AsSlice(), "|"), url_excluded_characters_as_ranges_for_regex(url_excluded_characters))
post_processors = append(post_processors, PostProcessorMap()["url"])
case "path":
pattern = path_regex()
Expand Down Expand Up @@ -530,7 +648,10 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
sanitized_text, hyperlinks := process_escape_codes(text)

run_basic_matching := func() error {
pattern, post_processors, group_processors := functions_for(opts)
pattern, post_processors, group_processors, err := functions_for(opts)
if err != nil {
return err
}
r, err := regexp2.Compile(pattern, regexp2.RE2)
if err != nil {
return fmt.Errorf("Failed to compile the regex pattern: %#v with error: %w", pattern, err)
Expand Down
1 change: 1 addition & 0 deletions kittens/hints/marks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ func TestHintMarking(t *testing.T) {
reset()
u := `http://test.me/`
r(u, u)
r(u+"#fragme", u+"#fragme")
r(`"`+u+`"`, u)
r("("+u+")", u)
cols = len(u)
Expand Down
1 change: 0 additions & 1 deletion kitty/rowcolumn-diacritics.c

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions kitty/unicode-data.c

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 35 additions & 1 deletion kitty/unicode-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,43 @@ is_excluded_from_url(uint32_t ch) {
return false;
}

// Report whether ch may appear in a URL.
// Follows https://url.spec.whatwg.org/#url-code-points; '%' and '#' are
// accepted in addition to the ASCII characters listed below.
static inline bool
is_url_legal_char(uint32_t ch) {
START_ALLOW_CASE_RANGE
    if (ch >= 0xa0) {
        // beyond the last usable code point, or a leading/trailing surrogate
        if (ch > 0x10fffd || (0xd800 <= ch && ch <= 0xdfff)) return false;
        // the contiguous block of non-characters
        if (0xfdd0 <= ch && ch <= 0xfdef) return false;
        // the last two code points of every plane (xFFFE and xFFFF) are
        // non-characters; everything else in this range is legal
        return (ch & 0xfffe) != 0xfffe;
    }
    // Below 0xa0 only an explicit allow-list of ASCII characters is legal.
    switch (ch) {
        case '0' ... '9':
        case 'A' ... 'Z':
        case 'a' ... 'z':
        case '!': case '#': case '$': case '%': case '&': case '\'':
        case '(': case ')': case '*': case '+': case ',': case '-':
        case '.': case '/': case ':': case ';': case '=': case '?':
        case '@': case '_': case '~':
            return true;
    }
    return false;
END_ALLOW_CASE_RANGE
}

static inline bool
is_url_char(uint32_t ch) {
return ch && !is_CZ_category(ch) && !is_excluded_from_url(ch);
return is_url_legal_char(ch) && !is_excluded_from_url(ch);
}

static inline bool
Expand Down
11 changes: 6 additions & 5 deletions kitty_tests/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,16 +266,17 @@ def create(t):
l0 = create('file:///etc/test')
self.ae(l0.url_start_at(0), 0)

for trail in '.,\\':
for trail in '.,\\}]>':
lx = create("http://xyz.com" + trail)
self.ae(lx.url_end_at(0), len(lx) - 2)
for trail in ')}]>':
lx = create("http://xyz.com" + trail)
self.ae(lx.url_end_at(0), len(lx) - 1)
for trail in ')':
turl = "http://xyz.com" + trail
lx = create(turl)
self.ae(len(lx) - 1, lx.url_end_at(0), repr(turl))
l0 = create("ftp://abc/")
self.ae(l0.url_end_at(0), len(l0) - 1)
l2 = create("http://-abcd] ")
self.ae(l2.url_end_at(0), len(l2) - 2)
self.ae(l2.url_end_at(0), len(l2) - 3)
l3 = create("http://ab.de ")
self.ae(l3.url_start_at(4), 0)
self.ae(l3.url_start_at(5), 0)
Expand Down
5 changes: 3 additions & 2 deletions kitty_tests/screen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1017,11 +1017,12 @@ def t(url, x=0, y=0, before='', after=''):

t('http://moo.com')
t('http://moo.com/something?else=+&what-')
t('http://moo.com#fragme')
for (st, e) in '() {} [] <>'.split():
t('http://moo.com', before=st, after=e)
for trailer in ')-=]}':
for trailer in ')-=':
t('http://moo.com' + trailer)
for trailer in '{([':
for trailer in '{([]}<>':
t('http://moo.com', after=trailer)
t('http://moo.com', x=s.columns - 9)
t('https://wraps-by-one-char.com', before='[', after=']')
Expand Down

0 comments on commit 8cc2cad

Please sign in to comment.