Add Clojure functions for 4 Unicode normalized forms

Also add compatibility forms to one of the tests that prints out this information to a file.
jafingerhut · Jan 22, 2012 · 2a7aacc · 2a7aacc
1 parent 97bce83
commit 2a7aacc
Show file tree

Hide file tree

Showing 3 changed files with 140 additions and 33 deletions.
diff --git a/project.clj b/project.clj
@@ -3,11 +3,11 @@
   :dependencies [[org.clojure/clojure "1.3.0"]]
   :test-selectors {:default (fn [m] (and (not (:slow m))
                                          (not (:write-char-types-to-file m))
-                                         (not (:write-nfc-nfd-to-file m))
+                                         (not (:write-normalized-forms-to-file m))
                                          (not (:test-unicode-property-names m))
                                          ))
                    :slow :slow
                    :write-char-types-to-file :write-char-types-to-file
-                   :write-nfc-nfd-to-file :write-nfc-nfd-to-file
+                   :write-normalized-forms-to-file :write-normalized-forms-to-file
                    :test-unicode-property-names :test-unicode-property-names
                    :all (fn [m] true)})
diff --git a/src/com/fingerhutpress/text/unicode.clj b/src/com/fingerhutpress/text/unicode.clj
@@ -1,4 +1,5 @@
 (ns com.fingerhutpress.text.unicode
+  (:import (java.text Normalizer))
   (:require [clojure.string :as str]))
 
 (set! *warn-on-reflection* true)
@@ -39,7 +40,10 @@
 
    Repeatedly executes body, with name bound to the integer code point
    of each Unicode character in the string.  Handles Unicode
-   supplementary characters (U+10000 and above) correctly."
+   supplementary characters (U+10000 and above) correctly.
+
+   The behavior is undefined if the string is not valid UTF-16, as
+   determined by the function utf16?"
   [bindings & body]
   (assert (vector bindings))
   (assert (= 2 (count bindings)))
@@ -261,3 +265,79 @@
         (.append buffer replacement)
         (.appendCodePoint buffer c)))
     (.toString buffer)))
+
+
+(defn NFC
+  "Normalize s to Unicode Normalization Form C (NFC) and return the
+   resulting string.  The input undergoes canonical decomposition
+   first and canonical composition second.
+
+   Some characters have two spellings: a single precomposed code
+   point (e.g. a Latin A with an acute accent), or a base character
+   followed by one or more combining characters (e.g. a Latin A, then
+   a combining acute accent).  NFC first rewrites each such character
+   in its decomposed spelling, with the combining characters sorted
+   into the order fixed by the standard, and then recombines the
+   result into the composed spelling.
+
+   For any pair of Unicode strings s1 and s2 that are canonically
+   equivalent, (= (NFC s1) (NFC s2)) is true, even in cases where
+   (= s1 s2) is false.
+
+   See also: http://unicode.org/reports/tr15/
+             http://en.wikipedia.org/wiki/Unicode_equivalence"
+  [^CharSequence s]
+  (let [form java.text.Normalizer$Form/NFC]
+    (Normalizer/normalize s form)))
+
+
+(defn NFD
+  "Normalize s to Unicode Normalization Form D (NFD) and return the
+   resulting string.  The input undergoes canonical decomposition only.
+
+   Some characters have two spellings: a single precomposed code
+   point (e.g. a Latin A with an acute accent), or a base character
+   followed by one or more combining characters (e.g. a Latin A, then
+   a combining acute accent).  NFD rewrites each such character in
+   its decomposed spelling, with the combining characters sorted into
+   the order fixed by the standard.
+
+   For any pair of Unicode strings s1 and s2 that are canonically
+   equivalent, (= (NFD s1) (NFD s2)) is true, even in cases where
+   (= s1 s2) is false.
+
+   See also: http://unicode.org/reports/tr15/
+             http://en.wikipedia.org/wiki/Unicode_equivalence"
+  [^CharSequence s]
+  (let [form java.text.Normalizer$Form/NFD]
+    (Normalizer/normalize s form)))
+
+
+(defn NFKC
+  "Return a string that is in Unicode Normalization Form KC (NFKC),
+   by doing compatibility decomposition, followed by canonical
+   composition, on the input string.
+
+   Compatibility decomposition also replaces characters that are
+   formatting variants of other characters, e.g. the single code
+   point ligature fi (U+FB01) becomes the two letters f and i.  The
+   composition step afterwards is the same canonical one used by NFC.
+
+   For any two Unicode strings s1 and s2 that are compatibility
+   equivalent, (= (NFKC s1) (NFKC s2)) will be true, even if (= s1 s2)
+   is false.
+
+   See also: http://unicode.org/reports/tr15/
+             http://en.wikipedia.org/wiki/Unicode_equivalence"
+  [^CharSequence s]
+  (Normalizer/normalize s java.text.Normalizer$Form/NFKC))
+
+
+(defn NFKD
+  "Return a string that is in Unicode Normalization Form KD (NFKD),
+   by doing compatibility decomposition on the input string.  Unlike
+   NFKC, the decomposed result is not recombined afterwards.
+
+   Compatibility decomposition also replaces characters that are
+   formatting variants of other characters, e.g. the single code
+   point ligature fi (U+FB01) becomes the two letters f and i.
+
+   For any two Unicode strings s1 and s2 that are compatibility
+   equivalent, (= (NFKD s1) (NFKD s2)) will be true, even if (= s1 s2)
+   is false.
+
+   See also: http://unicode.org/reports/tr15/
+             http://en.wikipedia.org/wiki/Unicode_equivalence"
+  [^CharSequence s]
+  (Normalizer/normalize s java.text.Normalizer$Form/NFKD))
diff --git a/test/com/fingerhutpress/text/unicode/test.clj b/test/com/fingerhutpress/text/unicode/test.clj
@@ -1,8 +1,7 @@
 (ns com.fingerhutpress.text.unicode.test
   (:use [com.fingerhutpress.text.unicode])
   (:use [clojure.test])
-  (:import (java.util.regex PatternSyntaxException)
-           (java.text Normalizer))
+  (:import (java.util.regex PatternSyntaxException))
   (:require [clojure.string :as str]
             [clojure.set :as set]
             [clojure.java.io :as io]
@@ -823,13 +822,16 @@
         ;; Mac OS X 10.6.8, the \p{InFoo} syntax in a regular
         ;; expression appears to correspond almost exactly with the
         ;; Block specifications from Blocks.txt in Unicode 4.1.0.  The
-        ;; only difference is that there are a few Block names in this
-        ;; Blocks.txt file that are not supported by this JVM.
+        ;; only difference is that there are a few Block names in that
+        ;; version of the Blocks.txt file that are not supported by
+        ;; this JVM.
 
-        ;; The Scripts.txt file above specifies script names, many of
-        ;; which correspond with Block names, but specify very
+        ;; The Scripts.txt files above specify script names, many of
+        ;; which have the same name as Block names, but specify very
         ;; different sets of characters.
-        "/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Blocks.txt"
+        "http://unicode.org/Public/4.1.0/ucd/Blocks.txt"
+        ;; local copy on my machine:
+        ;; "/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Blocks.txt"
 
         out-fname "unicode-property-names-test-out.txt"
         num-all-cps (count (all-codepoints))]
@@ -925,34 +927,59 @@
                     (printf "Regex %s is NOT legal\n" re-string)))))))))))
 
 
-(deftest ^:write-nfc-nfd-to-file
-  write-nfc-nfd-to-file
-  (let [fname "nfc-nfd-data.txt"]
+(deftest ^:write-normalized-forms-to-file ; opt-in: the :default selector in project.clj excludes this metadata key
+  write-normalized-forms-to-file
+  (let [fname "normalized-form-data.txt"] ; report file, opened below as a UTF-8 writer
     (with-open [f (io/writer fname :encoding "UTF-8")]
       (binding [*out* f]
         (print-interesting-jvm-version-properties)
         (printf "\n")
         (let [normalized-forms
               (->> (all-codepoints)
                    (map (fn [i] {:cp i :str (chr i)}))
-                   (map (fn [m]
-                          (assoc m
-                            :nfc (Normalizer/normalize (:str m)
-                                                       java.text.Normalizer$Form/NFC)
-                            :nfd (Normalizer/normalize (:str m)
-                                                       java.text.Normalizer$Form/NFD)))))]
-          (printf "hex-codepoint;string S, containing that code point and nothing else;max # of codepoints in either NFC or NFD of S;hex-codepoints of NFC(S), if different from S, otherwise empty;NFC(S);hex-codepoints of NFD(S), if different from S, otherwise empty;NFD(S)\n")
+                   (map (fn [m] (assoc m ; attach all four normalized forms of the string in :str
+                                  :nfc (NFC (:str m))
+                                  :nfd (NFD (:str m))
+                                  :nfkc (NFKC (:str m))
+                                  :nfkd (NFKD (:str m))))))]
+          (printf "hex-codepoint
+;string S, containing that code point and nothing else
+;max # of codepoints in either NFC or NFD of S
+;hex-codepoints of NFC(S), if different from S, otherwise empty
+;NFC(S)
+;hex-codepoints of NFD(S), if different from S, otherwise empty
+;NFD(S)
+;max # of codepoints in either NFKC or NFKD of S
+;hex-codepoints of NFKC(S), if different from S, otherwise empty
+;NFKC(S)
+;hex-codepoints of NFKD(S), if different from S, otherwise empty
+;NFKD(S)
+
+")
           (doseq [m normalized-forms]
-            (when (not (= (:str m) (:nfc m) (:nfd m)))
-              (printf "%06X;%s;%d;%s;%s;%s;%s\n" (:cp m)
-                      (:str m)
-                      (max (cp-count (:nfc m)) (cp-count (:nfd m)))
-                      (if (= (:str m) (:nfc m))
-                        "" (hex-codepoint-str (:nfc m)))
-                      (if (= (:str m) (:nfc m))
-                        "" (:nfc m))
-                      (if (= (:str m) (:nfd m))
-                        "" (hex-codepoint-str (:nfd m)))
-                      (if (= (:str m) (:nfd m))
-                        "" (:nfd m))
-                      ))))))))
+            (when (not (= (:str m) (:nfc m) (:nfd m) (:nfkc m) (:nfkd m))) ; a row is printed only if at least one form differs from S
+              (printf "%s"
+               (str (format "%06X" (:cp m))
+                    (format ";%s" (:str m))
+                    (format ";%d" (max (cp-count (:nfc m)) (cp-count (:nfd m))))
+                    (format ";%s" (if (= (:str m) (:nfc m))
+                                    "" (hex-codepoint-str (:nfc m))))
+                    (format ";%s" (if (= (:str m) (:nfc m))
+                                    "" (:nfc m)))
+                    (format ";%s" (if (= (:str m) (:nfd m))
+                                    "" (hex-codepoint-str (:nfd m))))
+                    (format ";%s" (if (= (:str m) (:nfd m))
+                                    "" (:nfd m)))
+
+                    (format ";%d" (max (cp-count (:nfkc m)) (cp-count (:nfkd m)))) ; NFKC/NFKD fields follow, in the same layout as the NFC/NFD fields
+                    (format ";%s" (if (= (:str m) (:nfkc m))
+                                    "" (hex-codepoint-str (:nfkc m))))
+                    (format ";%s" (if (= (:str m) (:nfkc m))
+                                    "" (:nfkc m)))
+                    (format ";%s" (if (= (:str m) (:nfkd m))
+                                    "" (hex-codepoint-str (:nfkd m))))
+                    (format ";%s" (if (= (:str m) (:nfkd m))
+                                    "" (:nfkd m)))
+                    "\n")
+
+               ))))))))