Skip to content

Commit

Permalink
Fix collation.
Browse files Browse the repository at this point in the history
  • Loading branch information
arcusfelis committed Jun 20, 2012
1 parent 12becd5 commit b898117
Show file tree
Hide file tree
Showing 17 changed files with 217 additions and 2,711 deletions.
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -8,6 +8,7 @@ __License__: [LGPLv3](http://www.gnu.org/licenses/lgpl-3.0.html)
__License__: [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.html)

__Author__: Uvarov Michael ([`freeakk@gmail.com`](mailto:freeakk@gmail.com))
__Unidata version__: 6.1.0

[Read edoc documentation](https://github.com/freeakk/ux/blob/master/doc/README.md)

Expand Down
1,255 changes: 0 additions & 1,255 deletions priv/UNIDATA/auxiliary/GraphemeBreakProperty.txt

This file was deleted.

Binary file not shown.
361 changes: 0 additions & 361 deletions priv/UNIDATA/auxiliary/GraphemeBreakTest.txt

This file was deleted.

Binary file added priv/UNIDATA/auxiliary/GraphemeBreakTest.txt.gz
Binary file not shown.
1,015 changes: 0 additions & 1,015 deletions priv/UNIDATA/auxiliary/WordBreakProperty.txt

This file was deleted.

Binary file added priv/UNIDATA/auxiliary/WordBreakProperty.txt.gz
Binary file not shown.
Binary file added priv/UNIDATA/auxiliary/WordBreakTest.txt.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion rebar.config
@@ -1,7 +1,7 @@
%% -*- erlang -*-
{erl_opts, [
% {d, 'UNIDATA_DEBUG'},
% {d, 'SLOW_TESTS'},
{d, 'SLOW_TESTS'},
debug_info]}.
{deps, [
%% We need the reloader from Mochiweb
Expand Down
8 changes: 4 additions & 4 deletions src/uca/ux_uca.hrl
Expand Up @@ -60,11 +60,11 @@
% Hangul & UCA
-define(COL_HANGUL_LBASE, 12603).
-define(COL_HANGUL_VBASE, 12729).
-define(COL_HANGUL_TBASE, 12799).
-define(COL_HANGUL_TBASE, 12799). % 31FF

-define(COL_HANGUL_LLAST, ?COL_HANGUL_LBASE + ?HANGUL_LCOUNT).
-define(COL_HANGUL_VLAST, ?COL_HANGUL_VBASE + ?HANGUL_VCOUNT).
-define(COL_HANGUL_TLAST, ?COL_HANGUL_TBASE + ?HANGUL_TCOUNT).
-define(COL_HANGUL_LLAST, (?COL_HANGUL_LBASE + ?HANGUL_LCOUNT)).
-define(COL_HANGUL_VLAST, (?COL_HANGUL_VBASE + ?HANGUL_VCOUNT)).
-define(COL_HANGUL_TLAST, 12850).

% Weight on level 1 (L1) is L1 of Hangul jamo L.
-define(IS_L1_OF_HANGUL_L(W), (
Expand Down
18 changes: 8 additions & 10 deletions src/uca/ux_uca_extract.erl
Expand Up @@ -419,16 +419,14 @@ do_extract1([]=_S, _MFn, _Key, _OldCCC, Skipped, Res)
% Range 3: Ideographic AND NOT Unified_Ideograph
% -----------------------------------------------------------------------------
do_implicit(H)
when ?CHAR_IS_UNIFIED_IDEOGRAPH(H)
and (?CHAR_IS_CJK_COMPATIBILITY_IDEOGRAPH(H)
or ?CHAR_IS_CJK_UNIFIED_IDEOGRAPH(H)) ->
implicit_weight(H, 16#FB40);

do_implicit(H)
when ?CHAR_IS_UNIFIED_IDEOGRAPH(H)
and (not (?CHAR_IS_CJK_COMPATIBILITY_IDEOGRAPH(H)
or ?CHAR_IS_CJK_UNIFIED_IDEOGRAPH(H))) ->
implicit_weight(H, 16#FB80);
when ?CHAR_IS_UNIFIED_IDEOGRAPH(H) ->
if
(?CHAR_IS_CJK_COMPATIBILITY_IDEOGRAPH(H)
or ?CHAR_IS_CJK_UNIFIED_IDEOGRAPH(H)) ->
implicit_weight(H, 16#FB40);
true ->
implicit_weight(H, 16#FB80)
end;

do_implicit(H) ->
implicit_weight(H, 16#FBC0).
Expand Down
22 changes: 22 additions & 0 deletions src/uca/ux_uca_utils.erl
Expand Up @@ -9,8 +9,30 @@
split_levels/3,
get_reassign_function/2]).

%% For debugging only
-export([hangul_type/1,
implicit_type/1]).

-include("ux.hrl").
-include("ux_uca.hrl").


%% Debugging helper: classify a level-1 weight by the Hangul jamo class
%% whose ?IS_L1_OF_HANGUL_* check it satisfies.
%% The checks are tried in order L, V, T; the first one that holds wins.
%% Returns `l', `v' or `t' accordingly, and `x' when none of them holds.
hangul_type(Weight) ->
    if
        ?IS_L1_OF_HANGUL_L(Weight) -> l;
        ?IS_L1_OF_HANGUL_V(Weight) -> v;
        ?IS_L1_OF_HANGUL_T(Weight) -> t;
        true -> x
    end.


%% Debugging helper: report which of three classes a code point falls into.
%%   base1 - ?CHAR_IS_UNIFIED_IDEOGRAPH holds and the code point also
%%           satisfies ?CHAR_IS_CJK_COMPATIBILITY_IDEOGRAPH or
%%           ?CHAR_IS_CJK_UNIFIED_IDEOGRAPH;
%%   base2 - ?CHAR_IS_UNIFIED_IDEOGRAPH holds but neither CJK check does;
%%   base3 - everything else.
%% Clause order matters: the more specific base1 guard must be tried
%% before the plain unified-ideograph guard.
implicit_type(Char)
    when ?CHAR_IS_UNIFIED_IDEOGRAPH(Char),
         (?CHAR_IS_CJK_COMPATIBILITY_IDEOGRAPH(Char)
          or ?CHAR_IS_CJK_UNIFIED_IDEOGRAPH(Char)) ->
    base1;
implicit_type(Char) when ?CHAR_IS_UNIFIED_IDEOGRAPH(Char) ->
    base2;
implicit_type(_) ->
    base3.

%%
%% Helpers
%%
Expand Down
7 changes: 6 additions & 1 deletion src/unidata/ux_unidata_parser.erl
Expand Up @@ -50,7 +50,12 @@ check({FileType, DataTypes, FileName}) ->

check_filename(FileName) ->
% File exists?
{ok, _Info} = file:read_file_info(FileName),
case file:read_file_info(FileName) of
{ok, _Info} -> ok;
Error ->
error_logger:error_msg(?MODULE_STRING ++ ": File ~s not found.", [FileName]),
erlang:error(Error)
end,
ok.

file_format(Mod) ->
Expand Down
92 changes: 51 additions & 41 deletions src/ux.hrl
Expand Up @@ -52,10 +52,10 @@
-define(HANGUL_NCOUNT, 588).
-define(HANGUL_SCOUNT, 11172).

-define(HANGUL_SLAST, ?HANGUL_SBASE + ?HANGUL_SCOUNT).
-define(HANGUL_LLAST, ?HANGUL_LBASE + ?HANGUL_LCOUNT).
-define(HANGUL_VLAST, ?HANGUL_VBASE + ?HANGUL_VCOUNT).
-define(HANGUL_TLAST, ?HANGUL_TBASE + ?HANGUL_TCOUNT).
-define(HANGUL_SLAST, (?HANGUL_SBASE + ?HANGUL_SCOUNT)).
-define(HANGUL_LLAST, (?HANGUL_LBASE + ?HANGUL_LCOUNT)).
-define(HANGUL_VLAST, (?HANGUL_VBASE + ?HANGUL_VCOUNT)).
-define(HANGUL_TLAST, (?HANGUL_TBASE + ?HANGUL_TCOUNT)).

-define(CHAR_IS_HANGUL_L(Ch), (
(Ch>=?HANGUL_LBASE) and (Ch=<?HANGUL_LLAST)
Expand All @@ -73,53 +73,63 @@




% True when A =< X =< B (both bounds inclusive).
% Every argument is parenthesised so arbitrary expressions can be passed.
% Uses `and', so both comparisons are always evaluated.
-define(CHECK_RANGE(X, A, B), (((X) >= (A)) and ((X) =< (B)))).
% True when X is exactly equal (=:=, no numeric coercion) to A.
-define(CHECK_VALUE(X, A), (((X) =:= (A)))).

% CJK_Unified_Ideograph and CJK_Compatibility_Ideographs from
% http://www.unicode.org/Public/UNIDATA/Blocks.txt
%
% grep "CJK Unified Ideograph" priv/UNIDATA/Blocks.txt
% 3400..4DBF; CJK Unified Ideographs Extension A
% 4E00..9FFF; CJK Unified Ideographs
% 20000..2A6DF; CJK Unified Ideographs Extension B
% 2A700..2B73F; CJK Unified Ideographs Extension C
% 2B740..2B81F; CJK Unified Ideographs Extension D
-define(CHAR_IS_CJK_UNIFIED_IDEOGRAPH(Ch), (
(Ch >= 16#4E00) and (Ch =< 16#9FFF) % CJK Unified Ideographs
?CHECK_RANGE(Ch, 16#4E00, 16#9FFF)
% or ?CHECK_RANGE(Ch, 16#3400, 16#4DBF)
% or ?CHECK_RANGE(Ch, 16#20000, 16#2A6DF)
% or ?CHECK_RANGE(Ch, 16#2A700, 16#2B73F)
% or ?CHECK_RANGE(Ch, 16#2B740, 16#2B81F)
)).

% grep "CJK Compatibility Ideograph" priv/UNIDATA/Blocks.txt
% F900..FAFF; CJK Compatibility Ideographs
% 2F800..2FA1F; CJK Compatibility Ideographs Supplement
-define(CHAR_IS_CJK_COMPATIBILITY_IDEOGRAPH(Ch), (
(Ch >= 16#F900) and (Ch =< 16#FAFF) % CJK Compatibility Ideographs
?CHECK_RANGE(Ch, 16#F900, 16#FAFF)
% or ?CHECK_RANGE(Ch, 16#2F800, 16#2FA1F)
)).

% Unified_Ideograph from http://unicode.org/Public/UNIDATA/PropList.txt
% grep Unified PropList.txt
% 3400..4DB5 ; Unified_Ideograph # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5
% 4E00..9FCC ; Unified_Ideograph # Lo [20941] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FCC
% FA0E..FA0F ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPATIBILITY IDEOGRAPH-FA0F
% FA11 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA11
% FA13..FA14 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA13..CJK COMPATIBILITY IDEOGRAPH-FA14
% FA1F ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA1F
% FA21 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA21
% FA23..FA24 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA23..CJK COMPATIBILITY IDEOGRAPH-FA24
% FA27..FA29 ; Unified_Ideograph # Lo [3] CJK COMPATIBILITY IDEOGRAPH-FA27..CJK COMPATIBILITY IDEOGRAPH-FA29
% 20000..2A6D6 ; Unified_Ideograph # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
% 2A700..2B734 ; Unified_Ideograph # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
% 2B740..2B81D ; Unified_Ideograph # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
-define(CHAR_IS_UNIFIED_IDEOGRAPH(Ch), (
% [6582] CJK UNIFIED IDEOGRAPH-3400..4DB5
((Ch >= 16#3400) and (Ch =< 16#4DB5))

% [20940] CJK UNIFIED IDEOGRAPH-4E00..9FCB
or ((Ch >= 16#4E00) and (Ch =< 16#9FCB))
% FIXED: Error: [55296,33] lower [40908,98]
% CJK Unified Ideographs
%or ((Ch >= 16#4E00) and (Ch =< 16#9FFF))

% [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..FA0F
or ((Ch >= 16#FA0E) and (Ch =< 16#FA0F))

or ((Ch == 16#FA11) ) % CJK COMPATIBILITY IDEOGRAPH-FA11

% [2] CJK COMPATIBILITY IDEOGRAPH-FA13..FA14
or ((Ch >= 16#FA13) and (Ch =< 16#FA14))

or ((Ch == 16#FA1F) ) % CJK COMPATIBILITY IDEOGRAPH-FA1F
or ((Ch == 16#FA21) ) % CJK COMPATIBILITY IDEOGRAPH-FA21

% [2] CJK COMPATIBILITY IDEOGRAPH-FA23..FA24
or ((Ch >= 16#FA23) and (Ch =< 16#FA24))

% [3] CJK COMPATIBILITY IDEOGRAPH-FA27..FA29
or ((Ch >= 16#FA27) and (Ch =< 16#FA29))

% [42711] CJK UNIFIED IDEOGRAPH-20000..2A6D6
or ((Ch >= 16#20000) and (Ch =< 16#2A6D6))

% [4149] CJK UNIFIED IDEOGRAPH-2A700..2B734
or ((Ch >= 16#2A700) and (Ch =< 16#2B734))

% [222] CJK UNIFIED IDEOGRAPH-2B740..2B81D
or ((Ch >= 16#2B740) and (Ch =< 16#2B81D))
?CHECK_RANGE(Ch, 16#3400, 16#4DB5)
or ?CHECK_RANGE(Ch, 16#4E00, 16#9FCC)
or ?CHECK_RANGE(Ch, 16#FA0E, 16#FA0F)
or ?CHECK_VALUE(Ch, 16#FA11)
or ?CHECK_VALUE(Ch, 16#FA13)
or ?CHECK_VALUE(Ch, 16#FA14)
or ?CHECK_VALUE(Ch, 16#FA1F)
or ?CHECK_VALUE(Ch, 16#FA21)
or ?CHECK_VALUE(Ch, 16#FA23)
or ?CHECK_VALUE(Ch, 16#FA24)
or ?CHECK_RANGE(Ch, 16#FA27, 16#FA29)
or ?CHECK_RANGE(Ch, 16#20000, 16#2A6D6)
or ?CHECK_RANGE(Ch, 16#2A700, 16#2B734)
or ?CHECK_RANGE(Ch, 16#2B740, 16#2B81D)
)).


Expand Down
3 changes: 2 additions & 1 deletion src/ux_uca.erl
Expand Up @@ -258,7 +258,8 @@ check_const_test_() ->
,?_assertEqual(l1([?HANGUL_LBASE]), [?COL_HANGUL_LBASE])
,?_assertEqual(l1([?HANGUL_VBASE]), [?COL_HANGUL_VBASE])
,?_assertEqual(l1([?HANGUL_TBASE]), [?COL_HANGUL_TBASE])
]
,?_assertEqual(l1([?HANGUL_TLAST]), [?COL_HANGUL_TLAST])
]
end,
{timeout, 60,
{setup, fun() -> l1("0") end, Cases}}.
Expand Down
8 changes: 4 additions & 4 deletions src/ux_unidata.erl
Expand Up @@ -99,9 +99,9 @@ get_source_file('norm_props') ->
get_source_file('unidata') ->
get_dir('ucd') ++ "/UnicodeData.txt.gz";
get_source_file('grapheme_break_property') ->
get_dir('ucd') ++ "/auxiliary/GraphemeBreakProperty.txt";
get_dir('ucd') ++ "/auxiliary/GraphemeBreakProperty.txt.gz";
get_source_file('word_break_property') ->
get_dir('ucd') ++ "/auxiliary/WordBreakProperty.txt".
get_dir('ucd') ++ "/auxiliary/WordBreakProperty.txt.gz".



Expand Down Expand Up @@ -130,9 +130,9 @@ get_test_file('collation_test_non_ignorable') ->


get_test_file('grapheme_break_test') ->
get_dir('ucd') ++ "/auxiliary/GraphemeBreakTest.txt";
get_dir('ucd') ++ "/auxiliary/GraphemeBreakTest.txt.gz";
get_test_file('word_break_test') ->
get_dir('ucd') ++ "/auxiliary/WordBreakTest.txt".
get_dir('ucd') ++ "/auxiliary/WordBreakTest.txt.gz".


open_test_file(Id) ->
Expand Down

0 comments on commit b898117

Please sign in to comment.