Permalink
Browse files

Add ccs-subs function and some unit tests for it.

  • Loading branch information...
1 parent 2a7aacc commit 5adafe4ded0d72357f60717756d170e7f98b8906 @jafingerhut committed Jan 23, 2012
Showing with 217 additions and 3 deletions.
  1. +120 −2 src/com/fingerhutpress/text/unicode.clj
  2. +97 −1 test/com/fingerhutpress/text/unicode/test.clj
@@ -5,8 +5,9 @@
(set! *warn-on-reflection* true)
-(defn bmp-codepoint? [c]
- (and (<= 0 c) (< c Character/MIN_SUPPLEMENTARY_CODE_POINT)))
+(defmacro bmp-codepoint? [c]
+ `(let [cp# ~c]
+ (and (<= 0 cp#) (< cp# Character/MIN_SUPPLEMENTARY_CODE_POINT))))
;; codepoints is a slight adaptation from a function of the same name
@@ -239,6 +240,123 @@
(throw (StringIndexOutOfBoundsException.))))))
+(defmacro combining-cp? [cp]
+ `(let [t# (Character/getType (int ~cp))]
+ (or (== t# (int Character/NON_SPACING_MARK))
+ (== t# (int Character/COMBINING_SPACING_MARK))
+ (== t# (int Character/ENCLOSING_MARK)))))
+
+
+(defmacro cp-at
+ "Return integer codepoint of character beginning at index idx of s,
+ if it lies completely within the string s. Return :eos if idx is
+ exactly at end of string, :past-eos if idx is larger than len,
+ or :incomplete-supplementary-char if there is a high surrogate at
+ the end of the string, with nothing after it.
+
+ len is simply (.length s). It is included as an argument since
+ where it is used the length has already been extracted, and it is
+ expected that it is faster to use the already-extracted length
+ rather than invoking the method repeatedly."
+ [^String s len idx]
+ `(cond (== ~idx ~len) :eos
+ (> ~idx ~len) :past-eos
+ (Character/isHighSurrogate (.charAt ~s ~idx))
+ (if (< (inc ~idx) ~len)
+ (.codePointAt ~s ~idx)
+ :incomplete-supplementary-char)
+ :else (.codePointAt ~s ~idx)))
+
+
+(defmacro cp-num-chars
+ "Return the number of Java chars, which is the number of UTF-16 code
+ units, required to encode the code point cp."
+ [cp]
+ `(if (bmp-codepoint? ~cp) 1 2))
+
+
+(defn- find-ccs-idx
+ [^String s len start-idx start-ccs-count target-ccs-count]
+ (if (== start-ccs-count target-ccs-count)
+ ;; Already done. Return immediately.
+ start-idx
+
+ ;; Special case: If first character is combining character,
+ ;; pretend that we already saw a combining character earlier by
+ ;; incrementing ccs-count. We have to check for the case that
+ ;; there is no first character while we are doing this. Implement
+ ;; this by skipping over the first character and using (inc
+ ;; start-ccs-count), whether the first character is combining or
+ ;; not.
+ (let [first-cp (cp-at s len start-idx)]
+ (if (not (number? first-cp))
+ (throw (StringIndexOutOfBoundsException.))
+ (loop [i (+ start-idx (cp-num-chars first-cp))
+ ccs-count (inc start-ccs-count)]
+ (let [cp (cp-at s len i)]
+ (cond
+ (= cp :eos)
+ (if (== ccs-count target-ccs-count)
+ i
+ (throw (StringIndexOutOfBoundsException.)))
+
+ (number? cp)
+ (cond
+ (combining-cp? cp) (recur (+ i (cp-num-chars cp)) ccs-count)
+ (== ccs-count target-ccs-count) i
+ :else (recur (+ i (cp-num-chars cp)) (inc ccs-count)))
+
+ :else (throw (StringIndexOutOfBoundsException.)))))))))
+
+
+;; TBD: Add a ccs-count function that calculates the number of CCSs in
+;; a string. If we use the same simple rule for defining a CCS as
+;; used in ccs-subs, this is as simple as counting all of the
+;; characters that match \PM in the string, plus 1 if there is a
+;; leading substring that matches \pM+.
+
+;; Creating a version of ccs-count for a more Unicode-standard version
+;; of a CCS is likely trickier.
+
+(defn ^String ccs-subs
+ "Returns the substring of s beginning at start inclusive, and ending
+ at end (defaults to the end of the string), exclusive.
+
+ Unlike subs, the start and end indices are in units of Unicode
+ combining character sequences (CCSs), not UTF-16 code units (i.e.,
+ Java chars).
+
+ For the purposes of this function, a CCS is any character except a
+ combining mark character, followed by 0 or more combining mark
+ characters. That is, it matches the regular expression:
+
+ \\PM\\pM*
+
+ This is not precisely the definition of a CCS in the Unicode 6.0.0
+ standard, but may be close enough for some purposes.
+
+ If s begins with one or more combining mark characters, all of them
+ up to, but not including, the first non-combining mark character,
+ will count as the first CCS, with index 0.
+
+ If s is a valid UTF-16 string, and start and end are in the proper
+ range, the return value is guaranteed to be a valid UTF-16 string."
+ ([^CharSequence s start]
+ (if (neg? start)
+ (throw (StringIndexOutOfBoundsException.))
+ (let [len (.length s)
+ start-idx (find-ccs-idx s len 0 0 start)]
+ (subs s start-idx))))
+ ([^CharSequence s start end]
+ (if (<= 0 start end)
+ (let [len (.length s)
+ start-idx (find-ccs-idx s len 0 0 start)
+ ;; continue looking for end-idx from where we left off
+ end-idx (find-ccs-idx s len start-idx start end)]
+ (subs s start-idx end-idx))
+ (throw (StringIndexOutOfBoundsException.)))))
+
+
(defn ^String cp-escape
"Return a new string, using cmap to escape each Unicode code point
ch from s as follows:
@@ -475,6 +475,102 @@
(is (thrown? StringIndexOutOfBoundsException (cp-subs s 0 (inc n)))))))
+(defn ^String ccs-subs-slow?
+ ([^CharSequence s start]
+ (cond (neg? start)
+ (throw (StringIndexOutOfBoundsException.))
+ (zero? start)
+ s
+ :else
+ (let [pat (re-pattern
+ (str "^(?:.\\pM*)"
+ (if (== start 1)
+ ""
+ (str "(?:\\PM\\pM*){" (dec start) "}"))))]
+ ;;(printf "ccs-subs-slow pat='%s'\n" (str pat)) (flush)
+ (if-let [remove (re-find pat s)]
+ (subs s (count remove))
+ (throw (StringIndexOutOfBoundsException.))))))
+ ([^CharSequence s start end]
+ (cond (or (neg? start) (< end start))
+ (throw (StringIndexOutOfBoundsException.))
+ (= start end)
+ ""
+ (zero? start)
+ (let [pat (re-pattern
+ (str "^(?:.\\pM*)"
+ (if (== end 1)
+ ""
+ (str "(?:\\PM\\pM*){" (dec end) "}"))))]
+ ;;(printf "ccs-subs-slow pat='%s'\n" (str pat)) (flush)
+ (if-let [s1 (re-find pat s)]
+ s1
+ (throw (StringIndexOutOfBoundsException.))))
+ :else ;; 1 <= start < end
+ (let [pat (re-pattern
+ (str "^(?:.\\pM*)"
+ (if (== start 1)
+ ""
+ (str "(?:\\PM\\pM*){" (dec start) "}"))
+ "((?:\\PM\\pM*){" (- end start) "})"))]
+ ;;(printf "ccs-subs-slow pat='%s'\n" (str pat)) (flush)
+ (if-let [[_ s1] (re-find pat s)]
+ s1
+ (throw (StringIndexOutOfBoundsException.)))))))
+
+
+(deftest test-ccs-subs
+ (let [ccs-cp-lists
+ [
+ [ 0xe007f ] ; CANCEL TAG + 0 combining chars
+ [ 0x0020 0x6df ] ; SPACE + 1 combining char
+ [ 0xa07b 0x1d1ad ] ; YI SYLLABLE NBIE + 1 combining char
+ [ 0x0041 0x0300 0x0309 ] ; A + 2 combining chars
+ [ 0x10000 0x651 0xfb1e 0xe01ef ] ; LINEAR B SYLLABLE + 3 combining
+ ]
+ ccss (vec (map (fn [cp-list] (apply str (map chr cp-list)))
+ ccs-cp-lists))
+
+ ccs-vec1 [ (ccss 0) (ccss 1) (ccss 2) (ccss 3) (ccss 4) ]
+ ccs-vec2 [ (ccss 1) (ccss 2) (ccss 3) (ccss 4) ]
+ ccs-vec3 [ (cp-subs (ccss 1) 1) ; first 1 char is combining char
+ (ccss 2) (ccss 3) (ccss 4) ]
+ ccs-vec3 [ (cp-subs (ccss 4) 1) ; first 3 chars are combining chars
+ (ccss 2) (ccss 3) (ccss 0) ]
+ ]
+ (doseq [ccs-vec [ ccs-vec1 ccs-vec2 ccs-vec3 ]]
+ (let [num-ccs (count ccs-vec)
+ s (apply str ccs-vec)]
+ ;;(printf "\n")
+ (doseq [start (range 0 (inc num-ccs))
+ end (range start (inc num-ccs))]
+;; (printf "start=%d end=%d (cp-count (ccs-subs s start end))=%d\n"
+;; start end (cp-count (ccs-subs s start end))) (flush)
+ (is (= (ccs-subs s start end)
+ (ccs-subs-slow? s start end)
+ (apply str (subvec ccs-vec start end)))))
+ (doseq [start (range 0 (inc num-ccs))]
+ ;;(printf "start=%d end=none\n" start) (flush)
+ (is (= (ccs-subs s start)
+ (ccs-subs-slow? s start)
+ (apply str (subvec ccs-vec start))))
+ ;;(printf "start=%d end=%d should be exception\n" start (inc num-ccs)) (flush)
+ (is (thrown? StringIndexOutOfBoundsException
+ (ccs-subs s start (inc num-ccs))))
+ ;;(printf "start=%d end=%d should be exception\n" start (inc num-ccs)) (flush)
+ (is (thrown? StringIndexOutOfBoundsException
+ (ccs-subs-slow? s start (inc num-ccs))))
+ )
+ (is (thrown? StringIndexOutOfBoundsException
+ (ccs-subs s (inc num-ccs))))
+ (is (thrown? StringIndexOutOfBoundsException
+ (ccs-subs-slow? s (inc num-ccs))))
+ (is (thrown? StringIndexOutOfBoundsException
+ (ccs-subs s (inc num-ccs) (+ num-ccs 2))))
+ (is (thrown? StringIndexOutOfBoundsException
+ (ccs-subs-slow? s (inc num-ccs) (+ num-ccs 2))))))))
+
+
(defn cp-escape-slow [s cmap]
(let [strmap (reduce (fn [m [cp x]]
(assoc m (chr cp) x))
@@ -747,7 +843,7 @@
(deftest ^:write-char-types-to-file write-char-types-to-file
- (let [fname "char-type-data.txt"]
+ (let [fname "char-categories.txt"]
(with-open [f (io/writer fname :encoding "UTF-8")]
(binding [*out* f]
(print-interesting-jvm-version-properties)

0 comments on commit 5adafe4

Please sign in to comment.