Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Add Clojure functions for 4 Unicode normalized forms

Also add compatibility forms to one of the tests that prints out this
information to a file.
  • Loading branch information...
commit 2a7aacc6f8ccff33fddeeeb7a9050755b25c26c9 1 parent 97bce83
@jafingerhut authored
View
4 project.clj
@@ -3,11 +3,11 @@
:dependencies [[org.clojure/clojure "1.3.0"]]
:test-selectors {:default (fn [m] (and (not (:slow m))
(not (:write-char-types-to-file m))
- (not (:write-nfc-nfd-to-file m))
+ (not (:write-normalized-forms-to-file m))
(not (:test-unicode-property-names m))
))
:slow :slow
:write-char-types-to-file :write-char-types-to-file
- :write-nfc-nfd-to-file :write-nfc-nfd-to-file
+ :write-normalized-forms-to-file :write-normalized-forms-to-file
:test-unicode-property-names :test-unicode-property-names
:all (fn [m] true)})
View
82 src/com/fingerhutpress/text/unicode.clj
@@ -1,4 +1,5 @@
(ns com.fingerhutpress.text.unicode
+ (:import (java.text Normalizer))
(:require [clojure.string :as str]))
(set! *warn-on-reflection* true)
@@ -39,7 +40,10 @@
Repeatedly executes body, with name bound to the integer code point
of each Unicode character in the string. Handles Unicode
- supplementary characters (U+10000 and above) correctly."
+ supplementary characters (U+10000 and above) correctly.
+
+ The behavior is undefined if the string is not valid UTF-16, as
+ determined by the function utf16?"
[bindings & body]
(assert (vector bindings))
(assert (= 2 (count bindings)))
@@ -261,3 +265,79 @@
(.append buffer replacement)
(.appendCodePoint buffer c)))
(.toString buffer)))
+
+
+(defn NFC
+ "Return a string that is in Unicode Normalization Form C (NFC),
+ by doing canonical decomposition, followed by canonical
+ composition, on the input string.
+
+ Every character that can be represented as either a single combined
+ Unicode code point (e.g. a Latin A with an acute accent), or as a
+ base character followed by one or more combining characters (e.g. a
+ Latin A character, followed by a combining character for an acute
+ accent), is turned into its decomposed form (e.g. the second form),
+ where the combining characters are sorted into a standard-specified
+ order, and then transformed into its composed form (e.g. like the
+ first example form).
+
+ For any two Unicode strings s1 and s2 that are canonically
+ equivalent, (= (NFC s1) (NFC s2)) will be true, even if (= s1 s2)
+ is false.
+
+ See also: http://unicode.org/reports/tr15/
+ http://en.wikipedia.org/wiki/Unicode_equivalence"
+ [^CharSequence s]
+ (Normalizer/normalize s java.text.Normalizer$Form/NFC))
+
+
+(defn NFD
+ "Return a string that is in Unicode Normalization Form D (NFD),
+ by doing canonical decomposition on the input string.
+
+ Every character that can be represented as either a single combined
+ Unicode code point (e.g. a Latin A with an acute accent), or as a
+ base character followed by one or more combining characters (e.g. a
+ Latin A character, followed by a combining character for an acute
+ accent), is turned into its decomposed form (e.g. the second form),
+ where the combining characters are sorted into a standard-specified
+ order.
+
+ For any two Unicode strings s1 and s2 that are canonically
+ equivalent, (= (NFD s1) (NFD s2)) will be true, even if (= s1 s2)
+ is false.
+
+ See also: http://unicode.org/reports/tr15/
+ http://en.wikipedia.org/wiki/Unicode_equivalence"
+ [^CharSequence s]
+ (Normalizer/normalize s java.text.Normalizer$Form/NFD))
+
+
+(defn NFKC
+ "Return a string that is in Unicode Normalization Form KC (NFKC),
+ by doing compatibility decomposition, followed by canonical
+ composition, on the input string.
+
+ For any two Unicode strings s1 and s2 that are compatibility
+ equivalent, (= (NFKC s1) (NFKC s2)) will be true, even if (= s1 s2)
+ is false.
+
+ See also: http://unicode.org/reports/tr15/
+ http://en.wikipedia.org/wiki/Unicode_equivalence"
+ [^CharSequence s]
+ (Normalizer/normalize s java.text.Normalizer$Form/NFKC))
+
+
+(defn NFKD
+ "Return a string that is in Unicode Normalization Form KD (NFKD),
+ by doing compatibility decomposition on the input string.
+
+ For any two Unicode strings s1 and s2 that are compatibility
+ equivalent, (= (NFKD s1) (NFKD s2)) will be true, even if (= s1 s2)
+ is false.
+
+ See also: http://unicode.org/reports/tr15/
+ http://en.wikipedia.org/wiki/Unicode_equivalence"
+ [^CharSequence s]
+ (Normalizer/normalize s java.text.Normalizer$Form/NFKD))
View
87 test/com/fingerhutpress/text/unicode/test.clj
@@ -1,8 +1,7 @@
(ns com.fingerhutpress.text.unicode.test
(:use [com.fingerhutpress.text.unicode])
(:use [clojure.test])
- (:import (java.util.regex PatternSyntaxException)
- (java.text Normalizer))
+ (:import (java.util.regex PatternSyntaxException))
(:require [clojure.string :as str]
[clojure.set :as set]
[clojure.java.io :as io]
@@ -823,13 +822,16 @@
;; Mac OS X 10.6.8, the \p{InFoo} syntax in a regular
;; expression appears to correspond almost exactly with the
;; Block specifications from Blocks.txt in Unicode 4.1.0. The
- ;; only difference is that there are a few Block names in this
- ;; Blocks.txt file that are not supported by this JVM.
+ ;; only difference is that there are a few Block names in that
+ ;; version of the Blocks.txt file that are not supported by
+ ;; this JVM.
- ;; The Scripts.txt file above specifies script names, many of
- ;; which correspond with Block names, but specify very
+ ;; The Scripts.txt files above specify script names, many of
+ ;; which have the same name as Block names, but specify very
;; different sets of characters.
- "/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Blocks.txt"
+ "http://unicode.org/Public/4.1.0/ucd/Blocks.txt"
+ ;; local copy on my machine:
+ ;; "/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Blocks.txt"
out-fname "unicode-property-names-test-out.txt"
num-all-cps (count (all-codepoints))]
@@ -925,9 +927,9 @@
(printf "Regex %s is NOT legal\n" re-string)))))))))))
-(deftest ^:write-nfc-nfd-to-file
- write-nfc-nfd-to-file
- (let [fname "nfc-nfd-data.txt"]
+(deftest ^:write-normalized-forms-to-file
+ write-normalized-forms-to-file
+ (let [fname "normalized-form-data.txt"]
(with-open [f (io/writer fname :encoding "UTF-8")]
(binding [*out* f]
(print-interesting-jvm-version-properties)
@@ -935,24 +937,49 @@
(let [normalized-forms
(->> (all-codepoints)
(map (fn [i] {:cp i :str (chr i)}))
- (map (fn [m]
- (assoc m
- :nfc (Normalizer/normalize (:str m)
- java.text.Normalizer$Form/NFC)
- :nfd (Normalizer/normalize (:str m)
- java.text.Normalizer$Form/NFD)))))]
- (printf "hex-codepoint;string S, containing that code point and nothing else;max # of codepoints in either NFC or NFD of S;hex-codepoints of NFC(S), if different from S, otherwise empty;NFC(S);hex-codepoints of NFD(S), if different from S, otherwise empty;NFD(S)\n")
+ (map (fn [m] (assoc m
+ :nfc (NFC (:str m))
+ :nfd (NFD (:str m))
+ :nfkc (NFKC (:str m))
+ :nfkd (NFKD (:str m))))))]
+ (printf "hex-codepoint
+;string S, containing that code point and nothing else
+;max # of codepoints in either NFC or NFD of S
+;hex-codepoints of NFC(S), if different from S, otherwise empty
+;NFC(S)
+;hex-codepoints of NFD(S), if different from S, otherwise empty
+;NFD(S)
+;max # of codepoints in either NFKC or NFKD of S
+;hex-codepoints of NFKC(S), if different from S, otherwise empty
+;NFKC(S)
+;hex-codepoints of NFKD(S), if different from S, otherwise empty
+;NFKD(S)
+
+")
(doseq [m normalized-forms]
- (when (not (= (:str m) (:nfc m) (:nfd m)))
- (printf "%06X;%s;%d;%s;%s;%s;%s\n" (:cp m)
- (:str m)
- (max (cp-count (:nfc m)) (cp-count (:nfd m)))
- (if (= (:str m) (:nfc m))
- "" (hex-codepoint-str (:nfc m)))
- (if (= (:str m) (:nfc m))
- "" (:nfc m))
- (if (= (:str m) (:nfd m))
- "" (hex-codepoint-str (:nfd m)))
- (if (= (:str m) (:nfd m))
- "" (:nfd m))
- ))))))))
+ (when (not (= (:str m) (:nfc m) (:nfd m) (:nfkc m) (:nfkd m)))
+ (printf "%s"
+ (str (format "%06X" (:cp m))
+ (format ";%s" (:str m))
+ (format ";%d" (max (cp-count (:nfc m)) (cp-count (:nfd m))))
+ (format ";%s" (if (= (:str m) (:nfc m))
+ "" (hex-codepoint-str (:nfc m))))
+ (format ";%s" (if (= (:str m) (:nfc m))
+ "" (:nfc m)))
+ (format ";%s" (if (= (:str m) (:nfd m))
+ "" (hex-codepoint-str (:nfd m))))
+ (format ";%s" (if (= (:str m) (:nfd m))
+ "" (:nfd m)))
+
+ (format ";%d" (max (cp-count (:nfkc m)) (cp-count (:nfkd m))))
+ (format ";%s" (if (= (:str m) (:nfkc m))
+ "" (hex-codepoint-str (:nfkc m))))
+ (format ";%s" (if (= (:str m) (:nfkc m))
+ "" (:nfkc m)))
+ (format ";%s" (if (= (:str m) (:nfkd m))
+ "" (hex-codepoint-str (:nfkd m))))
+ (format ";%s" (if (= (:str m) (:nfkd m))
+ "" (:nfkd m)))
+ "\n")
+
+ ))))))))
Please sign in to comment.
Something went wrong with that request. Please try again.