Skip to content

Commit

Permalink
Add Clojure functions for 4 Unicode normalized forms
Browse files Browse the repository at this point in the history
Also add compatibility forms to one of the tests that prints out this
information to a file.
  • Loading branch information
jafingerhut committed Jan 22, 2012
1 parent 97bce83 commit 2a7aacc
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 33 deletions.
4 changes: 2 additions & 2 deletions project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
:dependencies [[org.clojure/clojure "1.3.0"]]
:test-selectors {:default (fn [m] (and (not (:slow m))
(not (:write-char-types-to-file m))
(not (:write-nfc-nfd-to-file m))
(not (:write-normalized-forms-to-file m))
(not (:test-unicode-property-names m))
))
:slow :slow
:write-char-types-to-file :write-char-types-to-file
:write-nfc-nfd-to-file :write-nfc-nfd-to-file
:write-normalized-forms-to-file :write-normalized-forms-to-file
:test-unicode-property-names :test-unicode-property-names
:all (fn [m] true)})
82 changes: 81 additions & 1 deletion src/com/fingerhutpress/text/unicode.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
(ns com.fingerhutpress.text.unicode
(:import (java.text Normalizer))
(:require [clojure.string :as str]))

(set! *warn-on-reflection* true)
Expand Down Expand Up @@ -39,7 +40,10 @@
Repeatedly executes body, with name bound to the integer code point
of each Unicode character in the string. Handles Unicode
supplementary characters (U+10000 and above) correctly."
supplementary characters (U+10000 and above) correctly.
The behavior is undefined if the string is not valid UTF-16, as
determined by the function utf16?"
[bindings & body]
(assert (vector bindings))
(assert (= 2 (count bindings)))
Expand Down Expand Up @@ -261,3 +265,79 @@
(.append buffer replacement)
(.appendCodePoint buffer c)))
(.toString buffer)))


(defn NFC
  "Normalize s to Unicode Normalization Form C (NFC) and return the
  result as a string.  NFC is canonical decomposition followed by
  canonical composition.

  Some characters have two canonically equivalent spellings: one
  precomposed code point (e.g. Latin A with acute accent), or a base
  character followed by one or more combining characters (e.g. Latin
  A followed by a combining acute accent).  NFC first rewrites every
  such character in its decomposed spelling, puts the combining
  characters into the order fixed by the Unicode standard, and then
  recomposes the result into the precomposed spelling wherever one
  exists.

  Consequently, if s1 and s2 are canonically equivalent strings then
  (= (NFC s1) (NFC s2)) is true, even when (= s1 s2) is false.

  See also: http://unicode.org/reports/tr15/
            http://en.wikipedia.org/wiki/Unicode_equivalence"
  [^CharSequence s]
  (let [form java.text.Normalizer$Form/NFC]
    (java.text.Normalizer/normalize s form)))


(defn NFD
  "Normalize s to Unicode Normalization Form D (NFD) and return the
  result as a string.  NFD is canonical decomposition only.

  Some characters have two canonically equivalent spellings: one
  precomposed code point (e.g. Latin A with acute accent), or a base
  character followed by one or more combining characters (e.g. Latin
  A followed by a combining acute accent).  NFD rewrites every such
  character in its decomposed spelling (the second one) and puts the
  combining characters into the order fixed by the Unicode standard.

  Consequently, if s1 and s2 are canonically equivalent strings then
  (= (NFD s1) (NFD s2)) is true, even when (= s1 s2) is false.

  See also: http://unicode.org/reports/tr15/
            http://en.wikipedia.org/wiki/Unicode_equivalence"
  [^CharSequence s]
  (let [form java.text.Normalizer$Form/NFD]
    (java.text.Normalizer/normalize s form)))


(defn NFKC
  "Return a string that is in Unicode Normalization Form KC (NFKC),
  by doing compatibility decomposition, followed by canonical
  composition, on the input string.

  Note that the composition step is the same canonical composition
  used by NFC; the Unicode standard defines no separate
  'compatibility composition'.  Compatibility decomposition
  additionally replaces compatibility characters with their
  compatibility mappings (e.g. the single code point for the 'fi'
  ligature, U+FB01, becomes the two letters f and i), so the result
  can lose formatting distinctions present in the input.

  For any two Unicode strings s1 and s2 that are compatibility
  equivalent, (= (NFKC s1) (NFKC s2)) will be true, even if (= s1 s2)
  is false.

  See also: http://unicode.org/reports/tr15/
            http://en.wikipedia.org/wiki/Unicode_equivalence"
  [^CharSequence s]
  (Normalizer/normalize s java.text.Normalizer$Form/NFKC))


(defn NFKD
  "Return a string that is in Unicode Normalization Form KD (NFKD),
  by doing compatibility decomposition on the input string.  No
  composition step is performed.

  Compatibility decomposition applies both the canonical
  decompositions (e.g. Latin A with acute accent becomes Latin A
  followed by a combining acute accent) and the compatibility
  mappings (e.g. the single code point for the 'fi' ligature, U+FB01,
  becomes the two letters f and i), so the result can lose formatting
  distinctions present in the input.

  For any two Unicode strings s1 and s2 that are compatibility
  equivalent, (= (NFKD s1) (NFKD s2)) will be true, even if (= s1 s2)
  is false.

  See also: http://unicode.org/reports/tr15/
            http://en.wikipedia.org/wiki/Unicode_equivalence"
  [^CharSequence s]
  (Normalizer/normalize s java.text.Normalizer$Form/NFKD))
87 changes: 57 additions & 30 deletions test/com/fingerhutpress/text/unicode/test.clj
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
(ns com.fingerhutpress.text.unicode.test
(:use [com.fingerhutpress.text.unicode])
(:use [clojure.test])
(:import (java.util.regex PatternSyntaxException)
(java.text Normalizer))
(:import (java.util.regex PatternSyntaxException))
(:require [clojure.string :as str]
[clojure.set :as set]
[clojure.java.io :as io]
Expand Down Expand Up @@ -823,13 +822,16 @@
;; Mac OS X 10.6.8, the \p{InFoo} syntax in a regular
;; expression appears to correspond almost exactly with the
;; Block specifications from Blocks.txt in Unicode 4.1.0. The
;; only difference is that there are a few Block names in this
;; Blocks.txt file that are not supported by this JVM.
;; only difference is that there are a few Block names in that
;; version of the Blocks.txt file that are not supported by
;; this JVM.

;; The Scripts.txt file above specifies script names, many of
;; which correspond with Block names, but specify very
;; The Scripts.txt files above specify script names, many of
;; which have the same name as Block names, but specify very
;; different sets of characters.
"/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Blocks.txt"
"http://unicode.org/Public/4.1.0/ucd/Blocks.txt"
;; local copy on my machine:
;; "/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Blocks.txt"

out-fname "unicode-property-names-test-out.txt"
num-all-cps (count (all-codepoints))]
Expand Down Expand Up @@ -925,34 +927,59 @@
(printf "Regex %s is NOT legal\n" re-string)))))))))))


(deftest ^:write-nfc-nfd-to-file
write-nfc-nfd-to-file
(let [fname "nfc-nfd-data.txt"]
(deftest ^:write-normalized-forms-to-file
write-normalized-forms-to-file
(let [fname "normalized-form-data.txt"]
(with-open [f (io/writer fname :encoding "UTF-8")]
(binding [*out* f]
(print-interesting-jvm-version-properties)
(printf "\n")
(let [normalized-forms
(->> (all-codepoints)
(map (fn [i] {:cp i :str (chr i)}))
(map (fn [m]
(assoc m
:nfc (Normalizer/normalize (:str m)
java.text.Normalizer$Form/NFC)
:nfd (Normalizer/normalize (:str m)
java.text.Normalizer$Form/NFD)))))]
(printf "hex-codepoint;string S, containing that code point and nothing else;max # of codepoints in either NFC or NFD of S;hex-codepoints of NFC(S), if different from S, otherwise empty;NFC(S);hex-codepoints of NFD(S), if different from S, otherwise empty;NFD(S)\n")
(map (fn [m] (assoc m
:nfc (NFC (:str m))
:nfd (NFD (:str m))
:nfkc (NFKC (:str m))
:nfkd (NFKD (:str m))))))]
(printf "hex-codepoint
;string S, containing that code point and nothing else
;max # of codepoints in either NFC or NFD of S
;hex-codepoints of NFC(S), if different from S, otherwise empty
;NFC(S)
;hex-codepoints of NFD(S), if different from S, otherwise empty
;NFD(S)
;max # of codepoints in either NFKC or NFKD of S
;hex-codepoints of NFKC(S), if different from S, otherwise empty
;NFKC(S)
;hex-codepoints of NFKD(S), if different from S, otherwise empty
;NFKD(S)
")
(doseq [m normalized-forms]
(when (not (= (:str m) (:nfc m) (:nfd m)))
(printf "%06X;%s;%d;%s;%s;%s;%s\n" (:cp m)
(:str m)
(max (cp-count (:nfc m)) (cp-count (:nfd m)))
(if (= (:str m) (:nfc m))
"" (hex-codepoint-str (:nfc m)))
(if (= (:str m) (:nfc m))
"" (:nfc m))
(if (= (:str m) (:nfd m))
"" (hex-codepoint-str (:nfd m)))
(if (= (:str m) (:nfd m))
"" (:nfd m))
))))))))
(when (not (= (:str m) (:nfc m) (:nfd m) (:nfkc m) (:nfkd m)))
(printf "%s"
(str (format "%06X" (:cp m))
(format ";%s" (:str m))
(format ";%d" (max (cp-count (:nfc m)) (cp-count (:nfd m))))
(format ";%s" (if (= (:str m) (:nfc m))
"" (hex-codepoint-str (:nfc m))))
(format ";%s" (if (= (:str m) (:nfc m))
"" (:nfc m)))
(format ";%s" (if (= (:str m) (:nfd m))
"" (hex-codepoint-str (:nfd m))))
(format ";%s" (if (= (:str m) (:nfd m))
"" (:nfd m)))

(format ";%d" (max (cp-count (:nfkc m)) (cp-count (:nfkd m))))
(format ";%s" (if (= (:str m) (:nfkc m))
"" (hex-codepoint-str (:nfkc m))))
(format ";%s" (if (= (:str m) (:nfkc m))
"" (:nfkc m)))
(format ";%s" (if (= (:str m) (:nfkd m))
"" (hex-codepoint-str (:nfkd m))))
(format ";%s" (if (= (:str m) (:nfkd m))
"" (:nfkd m)))
"\n")

))))))))

0 comments on commit 2a7aacc

Please sign in to comment.