Browse files

Add escape-supp

  • Loading branch information...
1 parent 03c2f0f commit c8fb2ec25a48f7858700b2651b0bb7b297811a35 @jafingerhut committed Jan 18, 2012
Showing with 91 additions and 1 deletion.
  1. +42 −1 src/com/fingerhutpress/text/unicode.clj
  2. +49 −0 test/com/fingerhutpress/text/unicode/test.clj
View
43 src/com/fingerhutpress/text/unicode.clj
@@ -110,6 +110,40 @@
(nil? (first-utf16-error s)))
(defn ^String escape-supp
  "Return a new string in which every Unicode supplementary character
  (a code point that requires 2 Java chars, i.e. a surrogate pair, to
  represent in UTF-16) is replaced with a string of the form
  <U+XXXXXX>, where XXXXXX is the code point value written as 6
  upper-case hexadecimal digits.

  Unpaired surrogate chars, which should never appear in valid
  Unicode strings encoded as UTF-16, are replaced the same way except
  that only 4 hex digits are used.  Every other char is copied to the
  result unchanged.

  s may be any CharSequence (String, StringBuilder, CharBuffer, ...);
  only its length and charAt are used."
  [^CharSequence s]
  (let [len (count s)
        buffer (StringBuilder. len)]
    (loop [i 0]
      (if (< i len)
        (let [c (.charAt s i)]
          (cond (Character/isHighSurrogate c)
                (let [i+1 (inc i)]
                  ;; A high surrogate only forms a supplementary code
                  ;; point when a low surrogate immediately follows it.
                  (if (and (< i+1 len)
                           (Character/isLowSurrogate (.charAt s i+1)))
                    (do
                      ;; Combine the pair with the static
                      ;; Character/toCodePoint rather than calling
                      ;; .codePointAt on s: CharSequence declares no
                      ;; such method, so that call would be reflective
                      ;; and would fail for CharSequence
                      ;; implementations that lack it.
                      (.append buffer
                               (format "<U+%06X>"
                                       (Character/toCodePoint
                                        c (.charAt s i+1))))
                      ;; Both chars of the pair have been consumed.
                      (recur (+ i 2)))
                    (do
                      ;; Unpaired leading surrogate.
                      (.append buffer (format "<U+%04X>" (int c)))
                      (recur (inc i)))))
                (Character/isLowSurrogate c)
                (do
                  ;; Trailing surrogate with no leading surrogate
                  ;; before it.
                  (.append buffer (format "<U+%04X>" (int c)))
                  (recur (inc i)))
                :else
                (do
                  (.append buffer c)
                  (recur (inc i)))))
        (.toString buffer)))))
+
+
(defn contains-supp?
"Returns logical true (see below) if the string or CharSequence s
contains supplementary characters, outside the Basic Multilingual
@@ -206,7 +240,14 @@
or strings.
The behavior is undefined if s is not a valid UTF-16 string, as
- determined by function utf16?"
+ determined by function utf16?
+
+ Note that while clojure.string/escape is similar, it escapes UTF-16
+ code units, or Java chars. If you wish to escape a Unicode
+ supplementary character, which requires 2 Java chars to represent,
+ clojure.string/escape can escape those two Java chars
+ independently, but not as a unit. cp-escape can escape them as a
+ unit."
[^CharSequence s cmap]
(let [buffer (StringBuilder. (count s))]
(docodepoints [c s]
View
49 test/com/fingerhutpress/text/unicode/test.clj
@@ -214,6 +214,55 @@
)
(deftest test-escape-supp
  (let [;; The supplementary characters this test knows how to escape
        ;; by the independent method below, in the order they are
        ;; replaced.
        known-supp-strs [MUSICAL_SYMBOL_G_CLEF_STR
                         SMILING_FACE_WITH_OPEN_MOUTH_STR
                         BABY_ANGEL_STR
                         MIN_SUPPLEMENTARY_CODE_POINT_STR
                         MAX_CODE_POINT_STR]
        ;; Replace every occurrence of one supplementary character in
        ;; text with its 6-hex-digit escape, using a regex replace
        ;; rather than escape-supp itself.
        escape-one (fn [text supp-str]
                     (str/replace text
                                  (re-pattern supp-str)
                                  (format "<U+%06X>" (ord supp-str))))
        ;; 4-hex-digit escape of the first char of a one-char string
        ;; holding an unpaired surrogate.
        surrogate-escape (fn [surrogate-str]
                           (format "<U+%04X>"
                                   (int (.charAt surrogate-str 0))))]

    ;; Valid strings: compare escape-supp against the different,
    ;; slower method.  When a string has no supplementary characters,
    ;; escape-supp should return the original string.
    (doseq [s valid-utf16-strings]
      (let [expected (if (contains-supp? s)
                       (reduce escape-one s known-supp-strs)
                       s)]
        (is (= expected (escape-supp s)))))

    ;; A BMP combining character must pass through untouched.
    (is (= "\u0300 combining grave accent (not a surrogate)"
           (escape-supp "\u0300 combining grave accent (not a surrogate)")))

    ;; Unpaired surrogates at the start and at the end of a string.
    (is (= "<U+D83D> only leading surrogate"
           (escape-supp "\uD83D only leading surrogate")))
    (is (= "<U+DE03> only trailing surrogate"
           (escape-supp "\uDE03 only trailing surrogate")))
    (is (= "only leading surrogate <U+D83D>"
           (escape-supp "only leading surrogate \uD83D")))

    ;; Two surrogates of the same kind in a row never form a pair, so
    ;; each one is escaped on its own.
    (is (= (str "two consecutive "
                (surrogate-escape MIN_LEADING_SURROGATE_STR)
                (surrogate-escape MAX_LEADING_SURROGATE_STR)
                " leading surrogates")
           (escape-supp (str "two consecutive "
                             MIN_LEADING_SURROGATE_STR
                             MAX_LEADING_SURROGATE_STR
                             " leading surrogates"))))
    (is (= (str "two consecutive "
                (surrogate-escape MIN_TRAILING_SURROGATE_STR)
                (surrogate-escape MAX_TRAILING_SURROGATE_STR)
                " trailing surrogates")
           (escape-supp (str "two consecutive "
                             MIN_TRAILING_SURROGATE_STR
                             MAX_TRAILING_SURROGATE_STR
                             " trailing surrogates"))))))
+
+
(deftest test-codepoints
(is (= "61 300 1234 4567 1b1b 1d11e"
(hex-codeunit-str

0 comments on commit c8fb2ec

Please sign in to comment.