Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Lots of coding to verify what I read in the manual. Sigh. At least it…

… is tested!
  • Loading branch information...
commit 97bce832af1aee09c546235349e2b74686756bf5 1 parent 8d826a8
@jafingerhut authored
Showing with 66 additions and 16 deletions.
  1. +66 −16 test/com/fingerhutpress/text/unicode/test.clj
View
82 test/com/fingerhutpress/text/unicode/test.clj
@@ -4,6 +4,7 @@
(:import (java.util.regex PatternSyntaxException)
(java.text Normalizer))
(:require [clojure.string :as str]
+ [clojure.set :as set]
[clojure.java.io :as io]
[clojure.pprint :as p]))
@@ -161,6 +162,13 @@
(str/join " " (map #(format "%X" (int %)) s))))
+(defn hex-cp
+ [cp]
+ (if (bmp-codepoint? cp)
+ (format "%04X" cp)
+ (format "%06X" cp)))
+
+
(defn hex-codepoint-str
"Take string s and return a string consisting of s's Unicode code points,
shown as hexadecimal numbers, separated by spaces. Useful for
@@ -800,18 +808,36 @@
(deftest ^:test-unicode-property-names test-unicode-property-names
(let [in-fname
- "http://unicode.org/Public/4.0-Update/Scripts-4.0.0.txt"
+; "http://unicode.org/Public/4.0-Update/Scripts-4.0.0.txt"
; "http://unicode.org/Public/4.1.0/ucd/Scripts.txt"
; "http://unicode.org/Public/5.0.0/ucd/Scripts.txt"
; "http://unicode.org/Public/5.1.0/ucd/Scripts.txt"
; "http://unicode.org/Public/5.2.0/ucd/Scripts.txt"
; "http://unicode.org/Public/6.0.0/ucd/Scripts.txt"
; "http://unicode.org/Public/6.1.0/ucd/Scripts-6.1.0d13.txt"
-; "/Users/andy/clj/UNIDATA/UCD/Scripts.txt"
- out-fname "unicode-property-names-test-out.txt"]
+
+; "/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Scripts.txt"
+; "/Users/andy/clj/www.unicode.org/Public/zipped/6.0.0/UCD/Scripts.txt"
+
+ ;; At least with Hotspot Java 1.6.0_29 provided by Apple with
+ ;; Mac OS X 10.6.8, the \p{InFoo} syntax in a regular
+ ;; expression appears to correspond almost exactly with the
+ ;; Block specifications from Blocks.txt in Unicode 4.1.0. The
+ ;; only difference is that there are a few Block names in this
+ ;; Blocks.txt file that are not supported by this JVM.
+
+ ;; The Scripts.txt file above specifies script names, many of
+ ;; which correspond with Block names, but specify very
+ ;; different sets of characters.
+ "/Users/andy/clj/www.unicode.org/Public/zipped/4.1.0/UCD/Blocks.txt"
+
+ out-fname "unicode-property-names-test-out.txt"
+ num-all-cps (count (all-codepoints))]
(with-open [rdr (io/reader in-fname)
wr (io/writer out-fname :encoding "UTF-8")]
(binding [*out* wr]
+ (print-interesting-jvm-version-properties)
+ (printf "\n")
(let [script-map
(->> (line-seq rdr)
;; Assign line numbers
@@ -826,11 +852,13 @@
(re-find #"(?x)
^ \s*
([0-9a-fA-F]+) # first code point
- (?:\.+([0-9a-fA-F]+))? # last code point
- # of a range
+ (?:[.;\ ]+([0-9a-fA-F]+))? # last code
+ # point of a range
\s* ; \s* # field separator
- (\S+) # script name
- "
+ ([^\#]+) # script name
+ \s*
+ (?:\#.*)? # optional comment
+ $"
line)]
;; add more fields to this line's map
(merge m {:first-cp (Integer/parseInt first-cp 16),
@@ -842,23 +870,45 @@
(printf "Unrecognized format on line %d: %s\n" (:line-num m) line)
m))))
(group-by :script-name))]
- (doseq [script-name-prefix-in-re-pattern ["" "Is" "Script=" "In"]]
+ (doseq [script-name-prefix-in-re-pattern ["" "Is" "Block=" "In"]]
(printf "\n\nTry prefix \"%s\" before script name in regex pattern:\n"
script-name-prefix-in-re-pattern)
(doseq [script-name (sort (keys script-map))]
(when script-name
(let [re-string (str "^\\p{" script-name-prefix-in-re-pattern
- script-name "}$")]
+ (str/replace script-name #" " "") "}$")]
(if-let [pat (legal-pattern re-string)]
(do
(printf "Regex %s is legal\n" re-string)
- (let [cps (mapcat (fn [{:keys [first-cp last-cp]}]
- (range first-cp (inc last-cp)))
- (script-map script-name))
- non-matching-cps (remove #(re-find pat (chr %)) cps)]
- (printf " %d / %d code points match, of those that should\n"
- (- (count cps) (count non-matching-cps))
- (count cps))
+ (let [should-match-cps
+ (set (mapcat (fn [{:keys [first-cp last-cp]}]
+ (range first-cp (inc last-cp)))
+ (script-map script-name)))
+ do-match-cps (set (filter #(re-find pat (chr %))
+ (all-codepoints)))
+ good-match-cps (set/intersection should-match-cps
+ do-match-cps)
+ should-match-but-dont-cps
+ (set/difference should-match-cps good-match-cps)
+
+ shouldnt-match-but-do-cps
+ (set/difference do-match-cps good-match-cps)
+ ]
+ (printf " %d should match. %d do, %d do not. %d that should not match, do\n"
+ (count should-match-cps)
+ (count good-match-cps)
+ (count should-match-but-dont-cps)
+ (count shouldnt-match-but-do-cps))
+ (when (not= 0 (count should-match-but-dont-cps))
+ (printf " Should match, but do not: %s\n"
+ (str/join " "
+ (map hex-cp
+ (sort should-match-but-dont-cps)))))
+ (when (not= 0 (count shouldnt-match-but-do-cps))
+ (printf " Should not match, but do: %s\n"
+ (str/join " "
+ (map hex-cp
+ (sort shouldnt-match-but-do-cps)))))
; (printf " Code points that should match, but don't:\n")
; (printf " %s\n"
; (str/join "\n " (map #(format "%06X" %)
Please sign in to comment.
Something went wrong with that request. Please try again.