-
Notifications
You must be signed in to change notification settings - Fork 21
/
normalize.cljc
163 lines (146 loc) · 6.07 KB
/
normalize.cljc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
(ns lambdaisland.uri.normalize
(:require [clojure.string :as str]
[lambdaisland.uri.platform :refer [byte-seq->string
string->byte-seq
byte->hex hex->byte
char-code-at
str-len]]))
;; TODO we might be better off having these just be sets
(def
  ^{:doc
    "Regexes used to decide what gets percent-encoded, keyed by URI section.

  Every value is a negated character class: it matches any single character
  that is NOT in the set of characters allowed for that key, i.e. a character
  that needs encoding when it occurs in that section."}
  character-classes
  (let [alpha      "a-zA-Z"
        digit      "0-9"
        gen-delims ":\\/\\?#\\[\\]@"
        sub-delims "!\\$&'\\(\\)\\*\\+,;="
        unreserved (str alpha digit "\\-\\._~")
        pchar      (str unreserved sub-delims ":@")
        ;; For each key, the characters that may appear unencoded, written as
        ;; the inside of a regex character class.
        allowed    {:alpha      alpha
                    :digit      digit
                    :gen-delims gen-delims
                    :sub-delims sub-delims
                    :reserved   (str gen-delims sub-delims)
                    :unreserved unreserved
                    :pchar      pchar
                    :scheme     (str alpha digit "\\-\\+\\.")
                    :host       (str unreserved sub-delims "\\[:\\]")
                    :authority  pchar
                    :path       (str pchar "\\/")
                    :query      (str unreserved ":@\\/\\?")
                    :fragment   (str pchar "\\/\\?")}
        ;; Turn "abc" into the pattern [^abc].
        negate     (fn [chars] (re-pattern (str "[^" chars "]")))]
    (reduce-kv (fn [acc section chars]
                 (assoc acc section (negate chars)))
               {}
               allowed)))
(defn high-surrogate?
  "True when `char-code` lies in the range 0xD800-0xDBFF (inclusive), the
  UTF-16 high (leading) surrogate range."
  [char-code]
  (and (>= char-code 0xD800)
       (<= char-code 0xDBFF)))
(defn char-seq
  "Return a vector of the characters in `str`, each as a string, starting at
  code-unit index `offset` (default 0).

  A UTF-16 surrogate pair (a high surrogate immediately followed by a low
  surrogate) is kept together as a single two-code-unit string. A high
  surrogate that is not followed by a low surrogate - including one that is
  the very last code unit - is returned on its own, so the function never
  reads past the end of the string and never glues an unrelated character
  onto a stray surrogate."
  ([str]
   (char-seq str 0))
  ([str offset]
   (let [len   (str-len str)
         ;; Only look at index i+1 after checking that it exists.
         pair? (fn [i]
                 (and (high-surrogate? (char-code-at str i))
                      (< (inc i) len)
                      (<= 0xDC00 (char-code-at str (inc i)) 0xDFFF)))]
     (loop [offset offset
            res    []]
       (if (>= offset len)
         res
         (let [next-offset (+ offset (if (pair? offset) 2 1))]
           (recur next-offset
                  (conj res (subs str offset next-offset)))))))))
(defn percent-encode
  "Convert characters to their percent-encoded form.

  - `(percent-encode component)` encodes everything: each byte returned by
    `string->byte-seq` becomes `%` followed by `byte->hex` of that byte,
    e.g. `(percent-encode \"a\") #_=> \"%61\"`.
  - `(percent-encode component type)` looks up the regex for `type` in
    `character-classes` and encodes only the characters that regex matches;
    all other characters are copied through unchanged. `component` is walked
    with `char-seq`, so surrogate pairs are encoded as one unit.

  NOTE(review): `type` must be a key of `character-classes`; an unknown key
  yields a nil pattern, which `re-find` does not accept.
  NOTE(review): earlier docs say characters are encoded as UTF-8 and mention
  re-binding `*character-encoding*`; the byte conversion is delegated to
  `string->byte-seq` and no such var is referenced here - confirm against
  the platform namespace."
  ([component]
   (str/join (for [b (string->byte-seq component)]
               (str "%" (byte->hex b)))))
  ([component type]
   (let [needs-encoding (get character-classes type)]
     (str/join (for [ch (char-seq component)]
                 (if (re-find needs-encoding ch)
                   (percent-encode ch)
                   ch))))))
(defn percent-decode
  "The inverse of `percent-encode`: replace every run of `%XX` escapes in `s`
  with the string obtained by converting each `XX` with `hex->byte` and
  passing the resulting bytes to `byte-seq->string`. Consecutive escapes are
  decoded together so that multi-byte characters are reassembled.

  Returns nil when `s` is nil.

  NOTE(review): earlier docs say the bytes are interpreted as UTF-8 and
  mention re-binding `*character-encoding*`; decoding is delegated to
  `byte-seq->string` and no such var is referenced here - confirm against
  the platform namespace."
  [s]
  (when s
    (let [decode-run (fn [[run _]]
                       ;; "%41%42" splits into ["" "41" "42"]; the leading
                       ;; empty string is discarded.
                       (->> (str/split run #"%")
                            rest
                            (map hex->byte)
                            byte-seq->string))]
      (str/replace s #"(%[0-9A-Fa-f]{2})+" decode-run))))
(defn normalize-path
  "Normalize the path section of a URI: percent-decode `path` completely, then
  percent-encode the result using the `:path` character class. Returns nil
  for a nil `path`.

  Because every escape is decoded first, an escaped character that the `:path`
  class allows (for example `%2F`, a slash) comes back unescaped."
  [path]
  (when (some? path)
    (-> path
        percent-decode
        (percent-encode :path))))
(defn normalize-fragment
  "Normalize the fragment section of a URI: percent-decode `fragment`
  completely, then percent-encode the result using the `:fragment` character
  class. Returns nil for a nil `fragment`."
  [fragment]
  (when (some? fragment)
    (-> fragment
        percent-decode
        (percent-encode :fragment))))
(defn hex-code-point?
  "True when the code point `cp` is an ASCII hexadecimal digit: `0`-`9`
  (48-57), `A`-`F` (65-70) or `a`-`f` (97-102)."
  [cp]
  (let [ranges [[48 57]     ; \0 .. \9
                [65 70]     ; \A .. \F
                [97 102]]]  ; \a .. \f
    (boolean (some (fn [[lo hi]] (<= lo cp hi)) ranges))))
(def sub-delims
  "The sub-delimiter characters, as a set of one-character strings.

  RFC3986 section 2.2
  The purpose of reserved characters is to provide a set of delimiting
  characters that are distinguishable from other data within a URI.
  URIs that differ in the replacement of a reserved character with its
  corresponding percent-encoded octet are not equivalent. Percent-
  encoding a reserved character, or decoding a percent-encoded octet
  that corresponds to a reserved character, will change how the URI is
  interpreted by most applications. Thus, characters in the reserved
  set are protected from normalization and are therefore safe to be
  used by scheme-specific and producer-specific algorithms for
  delimiting data subcomponents within a URI."
  (into #{} (map str) "!$&'()*+,;="))
(defn normalize-query
  "Normalize the query section of a URI.

  - sub-delimiters that appear literally (not percent-encoded) are left as-is
  - percent-encoded characters that the `:query` character class does not
    allow (sub-delimiters and other reserved characters) stay percent-encoded
  - percent-encoded characters that the `:query` class allows are decoded
  - every other literal character is percent-encoded unless the `:query`
    class allows it

  A run of consecutive `%XX` escapes is decoded as one unit, so an escaped
  multi-byte character such as `%C3%A9` is reassembled before being
  re-encoded instead of being decoded one byte at a time.

  Returns nil when `s` is nil."
  [s]
  (when s
    (let [len        (str-len s)
          ;; Is there a complete %XX escape starting at index i?
          escape-at? (fn [i]
                       (and (< i (- len 2))
                            (= 37 (char-code-at s i)) ; 37 is \%
                            (hex-code-point? (char-code-at s (inc i)))
                            (hex-code-point? (char-code-at s (+ i 2)))))]
      (loop [i   0
             res []]
        (cond
          (= i len)
          (apply str res)

          (escape-at? i)
          ;; Extend over every directly following escape: the bytes of one
          ;; character may be spread over several %XX groups and can only be
          ;; decoded correctly together.
          (let [end (loop [j (+ i 3)]
                      (if (escape-at? j)
                        (recur (+ j 3))
                        j))]
            (recur end
                   (conj res (percent-encode (percent-decode (subs s i end))
                                             :query))))

          (contains? sub-delims (subs s i (inc i)))
          (recur (inc i)
                 (conj res (subs s i (inc i))))

          :else
          (let [width (if (high-surrogate? (char-code-at s i)) 2 1)
                ;; Clamp so that a high surrogate in the last position cannot
                ;; push the index past `len`; the `(= i len)` exit test would
                ;; otherwise never succeed.
                end   (min len (+ i width))]
            (recur end
                   (conj res (percent-encode (subs s i end) :query)))))))))
(defn normalize
  "Normalize a URI value (e.g. a `lambdaisland.uri.URI`). Currently this
  normalizes (percent-encodes) three sections, each with its own function:
  `:path` via `normalize-path`, `:query` via `normalize-query` and
  `:fragment` via `normalize-fragment`. All other keys are left untouched."
  [uri]
  (reduce (fn [acc [section normalize-section]]
            (update acc section normalize-section))
          uri
          [[:path     normalize-path]
           [:query    normalize-query]
           [:fragment normalize-fragment]]))