Permalink
Browse files

Add bad-surrogate-at-either-end? and unit tests

  • Loading branch information...
1 parent 5adafe4 commit f23773afc7f25afc1be5190917eae2dd6308ab23 @jafingerhut committed Jan 26, 2012
Showing with 40 additions and 0 deletions.
  1. +17 −0 src/com/fingerhutpress/text/unicode.clj
  2. +23 −0 test/com/fingerhutpress/text/unicode/test.clj
View
17 src/com/fingerhutpress/text/unicode.clj
@@ -120,6 +120,23 @@
(nil? (first-utf16-error s)))
+(defn bad-surrogate-at-either-end?
+  "Return true if s is not empty, and either begins with a trailing
+  surrogate (aka low surrogate), or ends with a leading surrogate (aka
+  high surrogate).  Return false otherwise, including when s is empty.
+
+  Intended for use as a quick error-catching mechanism when a
+  substring of a UTF-16 encoded string is taken with a bad start or
+  end location that splits up a surrogate pair.  To be quick, it
+  intentionally does not scan the entire string the way
+  first-utf16-error or utf16? do, but only checks the ends.
+
+  s may be any CharSequence (String, StringBuilder, ...); emptiness is
+  decided by its length rather than by equality with \"\"."
+  [^CharSequence s]
+  (let [n (.length s)]
+    (cond
+      ;; Test the length, not (= s ""): an empty non-String CharSequence
+      ;; is not = to "", and .charAt on it would throw.
+      (zero? n) false
+      ;; A low surrogate first means the leading half was cut off.
+      (Character/isLowSurrogate (.charAt s 0)) true
+      ;; A high surrogate last means the trailing half was cut off.
+      (Character/isHighSurrogate (.charAt s (dec n))) true
+      :else false)))
+
+
(defn ^String escape-supp
"Return a new string, replacing Unicode supplementary characters (or
code points), which require 2 Java chars to represent, with a
View
23 test/com/fingerhutpress/text/unicode/test.clj
@@ -239,6 +239,29 @@
)
+(deftest test-bad-surrogate-at-either-end?
+  ;; f is true when bad-surrogate-at-either-end? returns exactly
+  ;; expected-answer for s.
+  (let [f (fn [expected-answer s]
+            (= expected-answer (bad-surrogate-at-either-end? s)))]
+    ;; Empty input must be reported as fine, not throw.
+    (is (f false ""))
+    (doseq [s valid-utf16-strings]
+      (is (f false s)))
+    (is (f false "\u0300 combining grave accent (not a surrogate)"))
+    ;; A leading surrogate at the start is not flagged; only the ends
+    ;; are inspected, and a leading surrogate is legal at the start.
+    (is (f false "\uD83D only leading surrogate"))
+    ;; A lone leading surrogate is also the last char, so it is flagged.
+    (is (f true "\uD83D"))
+    (is (f true "\uDE03 only trailing surrogate"))
+    (is (f true "\uDE03"))
+    (is (f false "\uD83D\uDE03"))
+    (is (f true "only leading surrogate \uD83D"))
+    ;; Leading surrogate first and trailing surrogate last: each end
+    ;; holds the harmless kind, so nothing is flagged.
+    (is (f false "\uD83D unpaired at both ends \uDE03"))
+    ;; Malformed interiors are deliberately not detected.
+    (is (f false (str "two consecutive "
+                      MIN_LEADING_SURROGATE_STR
+                      MAX_LEADING_SURROGATE_STR
+                      " leading surrogates")))
+    (is (f false (str "two consecutive "
+                      MIN_TRAILING_SURROGATE_STR
+                      MAX_TRAILING_SURROGATE_STR
+                      " trailing surrogates")))
+    ))
+
+
(deftest test-escape-supp
(doseq [s valid-utf16-strings]
(if (contains-supp? s)

0 comments on commit f23773a

Please sign in to comment.