forked from infochimps-data/infochimps-data
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
torture test now includes exemplar non-UTF8 and non-unicode sequences
- Loading branch information
Philip (flip) Kromer
committed
Mar 5, 2013
1 parent
b90c532
commit 96cc6fc
Showing
2 changed files
with
100 additions
and
69 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,110 +1,141 @@ | ||
#!/usr/bin/env ruby | ||
# -*- coding: utf-8 -*- | ||
|
||
# | ||
# Generates a file to test string processing and unicode edge cases. The file | ||
# contains the raw text and encodings guaranteed to have only keyboard | ||
# characters (that is, ones with ASCII representation `0x20` to `0x7e`). | ||
# | ||
# Output is valid UTF-8 text with the following columns: | ||
# | ||
# 1. description of the string (ex: `'Internationalization', psedolocalized`) | ||
# 2. string length in characters (ex: `20`) | ||
# 3. string size as UTF-8 in bytes (ex: `27`) | ||
# 4. the raw string, high-byte characters and all (`Iñtërnâtiônàlizætiøn`) | ||
# 5. a `0` if the raw string is omitted (as it must be if it contains newlines, nulls or tabs) | ||
# 6. The json-encoded string, with all non-keyboard characters in JSON-escaped form (ex: `"I\u00f1t\u00ebrn\u00e2ti\u00f4n\u00e0liz\u00e6ti\u00f8n"` ) | ||
# 7. a "\u"-escaped string -- all non-keyboard characters are replaced with | ||
# `\uabcd`, where `abcd` is the 4-digit unicode code point. | ||
# 8. a "\x"-escaped string -- all non-keyboard characters (outside `0x20-0x7e`) are replaced with | ||
# `\xab`, where `ab` are the 2-digit hex bytes of its UTF-8 encoding | ||
# 9. a comma-separated list of decimals for every byte (ex: `73,241,116,235,114,110,...`) | ||
#10. a comma-separated list of unicode code points for every character (ex: `0049,00f1,0074,00eb,0072,006e,00e2,0074,0069,...`) | ||
# | ||
|
||
Encoding.default_external = Encoding.default_internal = Encoding::UTF_8 | ||
require 'digest/md5' | ||
require 'gorillib/base' | ||
require 'gorillib/pathname' | ||
require 'gorillib/logger/log' | ||
|
||
require_relative '../../../examples/munging/wikipedia/utils/munging_utils' | ||
include MungingUtils | ||
require_relative '../../../lib/wu/munging' | ||
include Wu::Munging::Utils | ||
|
||
# Human-readable description of the generated file and its tab-separated
# columns, listed in the same order in which `variants` builds each row.
# The heredoc is single-quoted, so backslashes and backticks are literal.
PREAMBLE = <<-'EOS'

This file holds string processing and unicode edge cases. It displays the raw
text along with encodings that use only keyboard characters (that is, ones
with ASCII representation `0x20` to `0x7e`).

Many strings are marked as not "safe" -- they are not valid UTF-8, or not valid Unicode, or contain nulls or tabs. Only the byte-level encodings are given.

It is valid UTF-8 text with the following columns:

1. description of the string (ex: `'Internationalization', psedolocalized`)
2. string length in characters (ex: `20`)
3. string size as UTF-8 in bytes (ex: `27`)
4. If UTF-8 and unicode safe, a `1`; `0` otherwise
5. the raw string, high-byte characters and all (`Iñtërnâtiônàlizætiøn`)
6. readable bytes, "\x"-escaped: all bytes outside `0x20-0x7e` are replaced with `\xab`, where `ab` is the byte's 2-digit hexadecimal representation
7. all bytes, as comma-separated list of decimals (ex: `73,195,177,116,195,171,114,110,...`)
8. The json-encoded string, with all non-keyboard characters in JSON-escaped form (ex: `"I\u00f1t\u00ebrn\u00e2ti\u00f4n\u00e0liz\u00e6ti\u00f8n"` )
9. readable characters, "\u"-escaped: all non-keyboard characters are replaced with `\uabcd`, where `abcd` is the 4-digit unicode code point.
10. all characters, as hexadecimal unicode code points, comma-separated (ex: `0049,00f1,0074,00eb,0072,006e,00e2,0074,0069,...`)
11. all characters, as decimals, comma-separated (ex: `73,241,116,235,114,110,...`)
Items 5 and 8-11 are left blank if the string is not safe; 5 (the raw string) is also omitted if it contains control characters (newline, tab)
EOS
# Builds one output row (an 11-element array) for a test string.
#
# desc - description of the string; left-justified to 40 columns in the row.
# str  - the string under test.
# safe - truthy when character-level columns may be produced for `str`.
#
# Columns 0-6 are always filled (byte-level views). Columns 7-10 are the
# character-level views and are nil when `safe` is falsy. The raw string
# (column 4) is blanked when the row is not safe or when the description
# starts with "whitespace".
def variants(desc, str, safe)
  hide_raw = !safe || (desc =~ /^whitespace/)

  byte_level = [
    "%-40s" % desc,                 # 0 description
    str.length,                     # 1 number of characters
    str.bytesize,                   # 2 number of bytes
    safe ? 1 : 0,                   # 3 safe flag
    hide_raw ? '' : str,            # 4 raw string, or blank
    x_escape(str),                  # 5 "\x"-escaped bytes
    str.bytes.to_a.join(','),       # 6 every byte, decimal
  ]
  return byte_level + [nil, nil, nil, nil] unless safe

  code_points = str.chars.map { |ch| ch.ord }
  byte_level + [
    '[' + safe_json_encode(str) + ']',                 # 7 JSON, wrapped in an array
    u_escape(str),                                     # 8 "\u"-escaped characters
    code_points.map { |cp| "%04x" % cp }.join(','),    # 9 code points, hex
    code_points.join(','),                             # 10 code points, decimal
  ]
end
|
||
# *************************************************************************** | ||
|
||
TEST_STRINGS = [ | ||
["blank string", ""], | ||
["Normal string", "hello, world"], | ||
["Unicode snowman", "hello, world. hello, snowman: \u2603",], | ||
["Normal string", "Hello, world"], | ||
["Unicode snowman", "Hello, snowman: \u2603",], | ||
["'Test', psedolocalized", "\u0164\u0117\u015f\u0167"], | ||
["'Internationalization', psedolocalized", "I\u00f1t\u00ebrn\u00e2ti\u00f4n\u00e0liz\u00e6ti\u00f8n"], | ||
["'Internationalization', psedolocalized", "I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xc3\xa0liz\xc3\xa6ti\xc3\xb8n"], | ||
["Greek 'kosme'", "\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5"], | ||
["Hello, chimp (right to left)", "\u202e\u05e9\u05dc\u05d5\u05dd, \u0627\u0644\u0628\u0639\u0627\u0645 \u0634\u064a\u0645\u0628\u0627\u0646\u0632\u064a.\u202c"], | ||
["Left-to-right override", "\u202d\u05e9\u05dc\u05d5\u05dd.\u202c"], | ||
["Right-to-left override", "\u202eSDRAWKCAB FORWARDS.\u202c"], | ||
["Right-to-left embed", "\u202bSDRAWKCAB FORWARDS.\u202c"], | ||
["displays TM on right, dot on left", "\u202e\u05e9\u05dc\u05d5\u05dd \u202bBrawndo\u2122 has elecrolytes\u202c.\u202c"], | ||
["dot at end shows on left", "\u202b\u05e9\u05dc\u05d5\u05dd means peace.\u202c",], | ||
["right to left: Shalom, chimpanzee", "\u202e\u05e9\u05dc\u05d5\u05dd\u060c \u0634\u0645\u0628\u0627\u0646\u0632\u064a!\u202c",], | ||
["'shalom' with Left-to-right override", "\u202d\u05e9\u05dc\u05d5\u05dd.\u202c"], | ||
["'SDRAWKCAB MA I.' w/ right-to-left override", "\u202eSDRAWKCAB MA I.\u202c"], | ||
["'SDRAWKCAB MA I.' w/ right-to-left embed", "\u202bSDRAWKCAB MA I.\u202c"], | ||
["r-t-l: should have TM on right, dot on left", "\u202e\u05e9\u05dc\u05d5\u05dd \u202bBrawndo\u2122 has elecrolytes\u202c.\u202c"], | ||
["r-t-l: 'shalom.'; dot should show on left", "\u202b\u05e9\u05dc\u05d5\u05dd means peace.\u202c",], | ||
["characters have correct order", "The vowels \u03b1, \u03b5, \u03b7, \u03b9, \u03bf derive from \u05d0\u200e, \u05d4\u200e, \u05d7\u200e, \u05d9\u200e, \u05e2, resp."], | ||
["characters and commas reversed on right", "The vowels \u03b1, \u03b5, \u03b7, \u03b9, \u03bf derive from \u05d0\u200e, \u05d4, \u05d7, \u05d9, \u05e2, resp."], | ||
["null character", "null\x00char"], | ||
["control chars 0x01 .. 0x0f and 0x7f", [(1..15).to_a, 127].flatten.map{|ii| ii.chr }.join, false], | ||
["null character between 'l' and 'c'", "null\x00char"], | ||
["control chars 0x01..0x0f and 0x7f", [(1..15).to_a, 127].flatten.map{|ii| ii.chr }.join, false], | ||
["diacritic marks: precomposed", "\u010f\u0131\u0310\u00e0\ufb4d\u1e5b\u1e2d\u0163\u0623\u045c \u0a33a\u0306\ufb3a\u1e35\u015f"], | ||
["diacritic marks: combining", "d\u030ci\u0310a\u0300\u05db\u05bfr\u0323i\u0330t\u0327\u0627\u0654\u043a\u0301 \u0a32\u0a3c\u0103\u05da\u05bck\u0331s\u0327"], | ||
["zero-width joiners", "\u0915\u094d \u0915\u094d\u200d \u0915\u094d\u0937 \u0915\u094d\u200d\u0937"], | ||
["zero-width non joiners", "Devanagari \u0915\u094d and \u0937 typically form \u0915\u094d\u0937, but with a ZWNJ they show \u0915\u094d\u200c\u0937."], | ||
|
||
["whitespace: ascii yes", "\u0009\u000a\u000b\u000c\u000d\u0020", false], | ||
["whitespace: posix yes", "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000", false], | ||
["quotation marks", "\" \' \u00ab \u00bb \u2018 \u2019 \u201a \u201b \u201c \u201d \u201e \u201f \u2039 \u203a\u300c\u300d\u300e\u300f\u301d\u301e\u301f\ufe41 \ufe42 \ufe43 \ufe44\uff02\uff07\uff62\uff63 ",], | ||
# | ||
["whitespace (\\s) in ascii", "\u0009\u000a\u000b\u000c\u000d\u0020"], | ||
["whitespace (\\s) in posix unicode regex", "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"], | ||
["quotation marks", "\"\'\u00ab\u00bb\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u2039\u203a\u300c\u300d\u300e\u300f\u301d\u301e\u301f\ufe41\ufe42\ufe43\ufe44\uff02\uff07\uff62\uff63",], | ||
["punctuation", "\u2019\'[](){}\u27e8\u27e9:,\u060c\u3001\u2014\u2015\u2026,...,...!.\u00ab\u00bb\u2010-?\u2018\u2019,\u201c\u201d,'',"";/\u2044\u00b7&*@\u2022^\u2020,\u2021\u00b0\u3003\u00a1\u00bf#\u2116\u00f7\u00ba,\u00aa%,\u2030\u2031\u00b6\u2032,\u2033,\u2034\u00a7~_\u00a6|\u00a9\u00ae\u2120\u2117\u2122\u00a4\u2042\u261e\u203d\u061f\u25ca\u203b\u2040",], | ||
# # From PHP Manual (CC-BY) | ||
['Valid 2 Octet Sequence', "\xc3\xb1", false, ], | ||
['Invalid 2 Octet Sequence', "\xc3\x28", false, ], | ||
['Invalid Sequence Identifier', "\xa0\xa1", false, ], | ||
['Valid 3 Octet Sequence', "\xe2\x82\xa1", false, ], | ||
['Invalid 3 Octet Sequence (in 2nd Octet)', "\xe2\x28\xa1", false, ], | ||
['Invalid 3 Octet Sequence (in 3rd Octet)', "\xe2\x82\x28", false, ], | ||
['Valid 4 Octet Sequence', "\xf0\x90\x8c\xbc", false, ], | ||
['Invalid 4 Octet Sequence (in 2nd Octet)', "\xf0\x28\x8c\xbc", false, ], | ||
['Invalid 4 Octet Sequence (in 3rd Octet)', "\xf0\x90\x28\xbc", false, ], | ||
['Invalid 4 Octet Sequence (in 4th Octet)', "\xf0\x28\x8c\x28", false, ], | ||
['Valid 5 Octet Sequence (but not Unicode!)', "\xf8\xa1\xa1\xa1\xa1", false, ], | ||
['Valid 6 Octet Sequence (but not Unicode!)', "\xfc\xa1\xa1\xa1\xa1\xa1", false, ], | ||
] | ||
|
||
|
||
# Returns a copy of `str` in which every backslash is doubled and every
# character matched by NON_PLAIN_ASCII_RE is replaced by "\u" followed by its
# code point in lowercase hex, zero-padded to at least four digits.
#
# NON_PLAIN_ASCII_RE is not defined in this file; presumably it comes from one
# of the included munging utility modules -- TODO confirm.
def u_escape(str)
  # The block form is deliberate. As a replacement *string*, '\\\\' (two
  # backslash characters) is read by gsub as one escaped backslash, so the
  # substitution would be a no-op and backslashes would never be doubled.
  doubled = str.gsub('\\') { '\\\\' }
  doubled.gsub(NON_PLAIN_ASCII_RE) { |ch| "\\u%04x" % ch.ord }
end
|
||
# Returns `str` rendered byte by byte: bytes in 0x20..0x7e are kept as their
# ASCII character, every other byte becomes "\x" plus its two-digit lowercase
# hex value.
#
# Works on the raw bytes only, so it never asks Ruby to decode the string and
# is usable on byte sequences that are not valid UTF-8. (A regex-based gsub
# over the same string raises on such input.)
def x_escape(str)
  str.bytes.map do |byte|
    (32..126).include?(byte) ? byte.chr : "\\x%02x" % byte
  end.join
end
|
||
# Earlier revision of `variants`: builds a 10-element row for one test string. | ||
# Column order here is description, length, bytesize, raw string, show flag, | ||
# JSON, "\u"-escaped, "\x"-escaped, code points (hex), code points (decimal). | ||
# NOTE(review): another `variants(desc, str, safe)` with a different column | ||
# order appears earlier in this source; as plain Ruby the later definition | ||
# would replace the earlier one -- confirm which revision is meant to be live. | ||
def variants(desc, str, show_raw) | ||
[ "%-40s" % desc, # description | ||
str.length, | ||
str.bytesize, | ||
(show_raw ? str : ''), # naked string | ||
show_raw ? 1 : 0, # 1 if string is showable | ||
safe_json_encode(str), # encoded as JSON | ||
u_escape(str), | ||
x_escape(str), | ||
str.chars.map{|ch| "%04x" % ch.ord }.join(','), # ordinal of each character (hex) | ||
str.chars.map{|ch| ch.ord }.join(','), # ordinal of each character (decimal) | ||
] | ||
end | ||
# Self-check for one generated row: compares the row's recorded length and | ||
# byte size against `str`, and checks that the JSON and escaped forms decode | ||
# or eval back to `str`. Prints a tab-joined warning when any check fails. | ||
# Two signatures and two `tests` constructions appear below because this span | ||
# interleaves two revisions of the method. | ||
def test_variants(str, desc, len, bytesize, utf8p, _, xstr, bytes_dec, safe_json, ustr, chars_hex, chars_dec) | ||
safe = (utf8p == 1) | ||
# Wrap the escaped text in double quotes (escaping embedded quotes) so it | ||
# can be eval'd as a Ruby string literal. | ||
u1 = '"' + ustr.gsub(/\"/, '\\\"') + '"' if safe | ||
# NOTE(review): x1 is built from ustr, exactly like u1, so xstr is never | ||
# evaluated -- looks like a copy-paste slip; confirm before relying on the | ||
# "\x" round-trip check. | ||
x1 = '"' + ustr.gsub(/\"/, '\\\"') + '"' if safe | ||
# puts [u1, x1, eval(u1), eval(x1), str, ] | ||
|
||
def test_variants(str, desc, len, bytes, show_raw, _, safe_json, ustr, xstr, *args) | ||
u1 = '"' + ustr.gsub(/\"/, '\\\"') + '"' | ||
x1 = '"' + ustr.gsub(/\"/, '\\\"') + '"' | ||
# puts [u1, x1, str, eval(u1), eval(x1)] | ||
|
||
tests = [ | ||
len == str.length, | ||
bytes == str.bytesize, | ||
MultiJson.decode('['+safe_json+']').first == str, | ||
eval(u1) == str, | ||
eval(x1) == str, | ||
] | ||
# Conditional form: the JSON check runs only when safe_json is present, and | ||
# the eval checks run only for rows flagged safe. | ||
tests = [] | ||
tests << (len == str.length) | ||
tests << (bytesize == str.bytesize) | ||
tests << (MultiJson.decode(safe_json).first == str) if safe_json.present? | ||
tests << (eval(u1) == str) if safe | ||
tests << (eval(x1) == str) if safe | ||
# | ||
# Force the third check to pass for descriptions containing "null character". | ||
tests[2] = true if (desc =~ /null character/) | ||
warn [desc, tests].flatten.join("\t") unless tests.all? | ||
# NOTE(review): safe_json is wrapped in '[' and ']' again here, while the | ||
# check above decodes it unwrapped -- confirm which form safe_json is in. | ||
warn [desc, tests, str, MultiJson.decode('['+safe_json+']').first, safe_json, u1, x1].flatten.join("\t") unless tests.all? | ||
# SyntaxError is listed explicitly because it is not a StandardError. | ||
rescue StandardError, SyntaxError => err | ||
puts err | ||
puts err, err.backtrace.first, xstr | ||
end | ||
|
||
# Driver: writes one tab-joined row per TEST_STRINGS entry to | ||
# 'string_handling_test.tsv', running test_variants on each row first. | ||
# Two loop headers appear because this span interleaves two revisions. | ||
File.open('string_handling_test.tsv', 'w', encoding: 'UTF-8') do |file| | ||
TEST_STRINGS.each do |desc, str, show_raw=true| | ||
vv = variants(desc, str, show_raw) | ||
# The third element of an entry is optional and defaults to true. | ||
TEST_STRINGS.each do |desc, str, safe=true| | ||
vv = variants(desc, str, safe) | ||
# The value returned by .inspect is not used. | ||
test_variants(str, *vv).inspect | ||
# file.puts "%-40s\t%-60s\t%-60s" % vv.values_at(0,4,5) | ||
file.puts vv.join("\t") | ||
end | ||
end |
Binary file not shown.