Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

874 lines (847 sloc) 27.799 kB
{CaseFolding.bra
Read CaseFolding-7.0.0.txt and convert to Full and Simple tables in C}
X=
(CaseFoldingFileName="CaseFolding-7.0.0.txt")
(UnicodeDataDate="10-03-2014 18:20:00")
(TurkicN=)
(SimpleN=)
(FullN=)
(CommonN=)
(TurkicMin=)
(SimpleMin=)
(UpperMin=)
(LowerMin=)
(COMBN=)
(COMBMin=)
(FullMin=)
(COMB=)
(Small2Capital=)
(Common=)
(Full=)
(Simple=)
(Turkic=)
(Upper=)
(Lower=)
{readFile: read the case folding file whose name is the argument and
 distribute its mappings over (its.Full), (its.Simple), (its.Turkic) and
 (its.Common), selected by the status letter F, S, T or C of each line.
 Every stored term has the form (code point.folded code points.comment),
 with all code points converted from hexadecimal text to numbers.
 Lines that start with a hash sign are ignored. Entries whose comment
 contains CAPITAL and whose folding is a single code point are also
 recorded in (its.Small2Capital) as (folded.unfolded.comment).
 A line with any other status letter is printed and the program waits
 for input.}
( readFile
= file hexIn type hexOut comment 16210 off
. get$(!arg,STR):?file
& out$fileRead
{off maps the character code of a hexadecimal letter A..F onto 10..15}
& 10+-1*asc$A:?off
& 0
: ?(its.Full)
: ?(its.Simple)
: ?(its.Turkic)
: ?(its.Common)
{16210: convert a string of hexadecimal digits to a number.
 Characters not greater than a space are skipped.}
& ( 16210
= n k
. 0:?nn
& whl
' ( @(!arg:%@?k ?arg)
& ( !k:>" "
& 16*!nn:?16n
& 16*!nn+(!k:#|!off+asc$!k)
: ?nn
|
)
)
& !nn
)
{main loop: take one line at a time from the file contents}
& whl
' ( @(!file:%?line \n ?file)
& ( @(!line:"#" ?)
| @(!line:?hexIn "; " ?type "; " ?hexOut "; #" ?comment)
& 16210$!hexIn:?decIn
& :?decOut
{collect the space separated folded code points in decOut}
& whl
' ( @( !hexOut
: %?hex (" " ?hexOut|:?hexOut)
)
& !decOut 16210$!hex:?decOut
)
{Foldings of two or more code points are not recorded in Small2Capital.
 The variable tested here must be decOut; the former spelling decout
 named an unbound variable, so the test always failed.}
& ( !decOut:% %
| @(!comment:? CAPITAL ?)
& (!decOut.!decIn.!comment)+!(its.Small2Capital)
: ?(its.Small2Capital)
|
)
& !type
: ( F
& (!decIn.!decOut.!comment)+!(its.Full)
: ?(its.Full)
| S
& (!decIn.!decOut.!comment)+!(its.Simple)
: ?(its.Simple)
| T
& (!decIn.!decOut.!comment)+!(its.Turkic)
: ?(its.Turkic)
| C
& (!decIn.!decOut.!comment)+!(its.Common)
: ?(its.Common)
| out$ONBEKEND !line&get'
)
|
)
)
)
{2html: create CaseFolding.html containing one HTML table with the
 contents of (its.COMB). The argument is not used.}
( 2html
=
{writes: turn a list of numbers into a string of HTML numeric
 character references, one per number}
. ( writes
= L
. :?L
& whl
' ( !arg:%?n ?arg
& !L "&#" !n ";":?L
)
& !L
)
{writeList: the argument is the name of a member of this object.
 Appends a heading with that name to CaseFolding.html, followed by a
 table with one row per term of the member. Each term is taken apart as
 (unfolded.folded.cap.comm.full). New rows are put in front of the rows
 collected so far.}
& ( writeList
= unfolded folded comm
. put$(str$("<h1>" !arg "</h1>\n"),"CaseFolding.html",APP)
& !(its.!arg):?arg
& str
$ ( "<table>"
( :?L
& whl
' ( !arg
: (?unfolded.?folded.?cap.?comm.?full)+?arg
& "<tr><td>&#"
!unfolded
";</td><td>&#"
!folded
";</td><td>"
writes$!cap
"</td><td>"
writes$!full
"</td><td>"
!comm
"</td></tr>"
\n
!L
: ?L
)
& !L
)
"</table>"
)
: ?L
& put$(!L,"CaseFolding.html",APP)
)
{start a new file, write the COMB table and close the document}
& put$("<html><body>
","CaseFolding.html",NEW)
& writeList$COMB
& put$("</body></html>","CaseFolding.html",APP)
)
{UpperLower: build (its.Upper) and (its.Lower) from the Full, Simple and
 Turkic tables. Both results are sums of (code point.comment) terms.
 The argument is not used.}
( UpperLower
= upper lower
. 0:?upper:?lower
{filter: the argument names one of the tables of this object}
& ( filter
= A Z unfolded folded comm
. !(its.!arg):?arg
{shorter: succeeds if the first of two strings compares as less than
 the second. NOTE(review): presumably the denominator of sim with an
 empty string is used as a measure of string length - confirm.}
& ( shorter
= a b A B
. !arg:(?a.?b)
& den$(sim$(!a,)):?A
& den$(sim$(!b,)):?B
& !A+-1*!B:<0
)
& whl
' ( !arg:(?unfolded.%?folded ?.?comm)+?arg
{Only the first folded code point is considered. It is entered in
 lower; if it is already there, the stored comment is replaced only
 when shorter succeeds for the new comment.}
& ( !lower:?A+(!folded.?k)+?Z
& ( shorter$(!comm.!k)
& !A+(!folded.!comm)+!Z:?lower
|
)
| (!folded.!comm)+!lower:?lower
)
{The unfolded code point goes to lower if its comment contains SMALL,
 to upper if it contains CAPITAL, unless it is already present.}
& ( @(!comm:? SMALL ?)
& ( !lower:?+(!unfolded.?)+?
| (!unfolded.!comm)+!lower:?lower
)
| @(!comm:? CAPITAL ?)
& ( !upper:?+(!unfolded.?)+?
| (!unfolded.!comm)+!upper:?upper
)
|
)
)
)
& filter$Full
& filter$Simple
& filter$Turkic
& !upper:?(its.Upper)
& !lower:?(its.Lower)
)
{count: return the number of terms in the sum passed as argument.
 Terms are removed from the front one at a time until none is left.}
( count
= total
. 0:?total
& whl
  ' ( !arg:%+?arg
    & !total+1:?total
    )
& !total
)
{2List: as written, this returns its argument unchanged, because the
 first alternative !arg succeeds and the code after the bar is then not
 evaluated. The code after the bar moves the terms of a sum one by one
 to the front of a space separated list.
 NOTE(review): the short circuit looks like a deliberate switch - confirm
 before removing either alternative.}
( 2List
= C L
. !arg
| :?L
& whl'(!arg:%?C+?arg&!C !L:?L)
& !L
)
{store: search for a modulus under which the first fields of all terms
 of the argument have different remainders. The search starts at the
 number of terms and increases the modulus by one after each failed
 attempt. The current modulus is printed after every failed attempt.
 Returns (modulus.hash table), the hash table being the one filled
 during the last attempt.}
( store
= min all rij n A B n nn occ
. (its.count)$!arg:?min
{occ is 1, so the division leaves min unchanged}
& 1:?occ
& div$(!min,!occ):?min
& whl
' ( !arg:?all
& new$hash:?rij
{The inner loop stops early, leaving terms in all, as soon as a
 remainder is found that is already in the hash table.}
& whl
' ( !all:(?n.?)+?all
& mod$(!n,!min):?adr
& ( (rij..find)$!adr&~`
| (rij..insert)$(!adr.!n)
)
)
{the empty first alternative succeeds, so the hash table is not listed}
& (|(rij..forall)$out&get')
{left over terms mean a collision: try the next modulus}
& !all:~0
& 1+!min:?min
& put$(str$(\r !min))
)
& out$(min !min)
& (!min.!rij)
)
{writeC: write the header letter.h and the first part of letter.cpp.
 Reads (its.COMB) and (its.COMBMin); the argument is not used.
 letter.cpp receives the array Full, holding the distinct full foldings
 padded with zeros to three code points, and the array Letter with
 (its.COMBMin) elements, in which each term of COMB is placed at the
 index given by its code point modulo (its.COMBMin). Unused elements
 are written as all zeros.}
( writeC
= all n N c rij Nfull Nsimple Nupper Nlower
. !(its.COMBMin):?arg
& out$writeC
& !(its.COMB):?all
& 0:?fulls
{Collect the distinct full foldings in fulls as (folding.comment).
 A folding that occurs again gets the further comment appended after a
 comment close and a comment open sequence.}
& whl
' ( !all:(?Unfolded.?Simple.?Capital.?Comment.?Full)+?all
& ( !Full:
| !fulls:?A+(!Full.?c)+?Z
& !A+(!Full.!c "*//*" !Comment)+!Z:?fulls
| (!Full.!Comment)+!fulls:?fulls
)
)
{NFulls is the number of terms in fulls}
& !fulls:?+[?NFulls
& out$(NFulls !NFulls)
{write letter.h}
& put
$ ( str
$ ( "/*
LETTERFUNC - Unicode support
Copyright (C) 2012 Center for Sprogteknologi, University of Copenhagen
This file is part of LETTERFUNC.
LETTERFUNC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
LETTERFUNC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with LETTERFUNC; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef LETTER_H
#define LETTER_H
#define ARRSIZE "
!(its.COMBMin)
"
/*
Based on "
!(X.CaseFoldingFileName)
"
Structures created with CaseFolding.bra
*/
struct tri{unsigned int w[3];};
struct quat{unsigned int Unfolded;unsigned int Simple;unsigned int Capital;struct tri * Full;};
extern struct quat Letter[ARRSIZE];
/*
Based on UnicodeData.txt ("
!(X.UnicodeDataDate)
")
(Does not convert final sigma to non-final sigma)
Structures created with uni.bra
*/
struct ccaseconv {unsigned int L:21;int range:11;unsigned int inc:2;int dif:20;};
extern struct ccaseconv l2u[];
extern struct ccaseconv u2l[];
struct cletter {unsigned int L:21;unsigned int range:11;};
extern struct cletter Cletters[];
#endif
"
)
, "letter.h"
, NEW
)
{start letter.cpp with the licence text and the opening of the array Full}
& put
$ ( str
$ ( "/*
LETTERFUNC - Unicode support
Copyright (C) 2012 Center for Sprogteknologi, University of Copenhagen
This file is part of LETTERFUNC.
LETTERFUNC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
LETTERFUNC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with LETTERFUNC; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
// Based on "
!(X.CaseFoldingFileName)
"
#include \"letter.h\"
struct tri Full["
!NFulls
"]={\n"
)
, "letter.cpp"
, NEW
)
{One initialiser per full folding. Missing second and third code points
 are written as 0. The separator changes to the closing of the array
 when the last term is reached.}
& !fulls:?all
& ", ":?sep
& 0:?i
& whl
' ( !all
: ( %@?w1
( %@?w2
(%@?w3|&0:?w3)
| &0:?w2:?w3
)
. ?Comment
)
+ ?all
& ( !i+1:!NFulls&"};":?sep
|
)
& put
$ ( str
$ ("{{" !w1 "," !w2 "," !w3 "}}" !sep "/*" !Comment "*/" \n)
, "letter.cpp"
, APP
)
& !i+1:?i
)
{rij: the terms of COMB, each with its array index put in front}
& !(its.COMB):?all
& 0:?rij
& whl
' ( !all:(?Unfolded.?rest)+?all
& mod$(!Unfolded,!(its.COMBMin)):?adr
& (!adr.!Unfolded.!rest)+!rij:?rij
)
& put$(str$(\n "struct quat Letter[ARRSIZE]={
"),"letter.cpp",APP)
& 0:?i
& -1:?curr
& ", ":?sep
{Walk over the array indices. curr is the index of the term most
 recently taken from rij. An element is written from that term when the
 running index i equals curr, otherwise an all zero element is written.
 The Full field is written as the array Full plus the position of the
 folding in fulls, or as 0L if the term has no full folding.
 NOTE(review): the second alternative of the loop condition requires rij
 to be non-empty. Verify that the last term of rij is still written when
 its index does not directly follow the index of the term before it.}
& whl
' ( ( !curr:<!i
& !rij
: (?curr.?Unfolded.?Simple.?Capital.?Comment.?Full)+?rij
| !rij:(?.?)+?
)
& ( !i+1:!(its.COMBMin)&"};":?sep
|
)
& ( !curr:!i
& put
$ ( str
$ ( "{"
!Unfolded
","
!Simple
","
(!Capital:~|0)
","
( !fulls:?+[?offset+(!Full.?)+?
& "Full+" !offset
| 0L
)
"}"
!sep
"/*"
!Comment
"*/"
\n
)
, "letter.cpp"
, APP
)
| put$(str$("{0,0,0,0L}" !sep \n),"letter.cpp",APP)
)
& !i+1:?i
)
{fill the rest of the array with all zero elements}
& whl
' ( !i+1:?i:~>!(its.COMBMin)
& ( !i:!(its.COMBMin)&"};":?sep
|
)
put$(str$("{0,0,0,0L}" !sep \n),"letter.cpp",APP)
)
)
{combine: merge the Common, Simple, Turkic and Full tables into one sum
 and return it. The argument is not used. Each term has five fields:
 (unfolded.simple folding.capital.comment.full folding).
 The capital field holds the unfolded code point if the comment contains
 CAPITAL and is empty otherwise. The intermediate result LOW and the
 final result COMB are also listed to files named LOW and COMB.}
( combine
=
. 0:?COMB
& out$combine
{Common mappings: the full folding field stays empty}
& !(its.Common):?L
& whl
' ( !L:(?unf.?fol.?com)+?L
& ( !unf
. !fol
. @(!com:? CAPITAL ?)&!unf
|
. !com
.
)
+ !COMB
: ?COMB
)
{Simple mappings: treated like the Common mappings}
& !(its.Simple):?L
& whl
' ( !L:(?unf.?fol.?com)+?L
& ( !unf
. !fol
. @(!com:? CAPITAL ?)&!unf
|
. !com
.
)
+ !COMB
: ?COMB
)
{Turkic mappings: if the code point is already present with an empty
 full folding field, the Turkic folding is stored in that field.
 Differing comments are reported and the program waits for input.
 Otherwise a new term is added in the same way as for Common.}
& !(its.Turkic):?L
& whl
' ( !L:(?unf.?fol.?com)+?L
& ( !COMB:?A+(!unf.?ifol.?icap.?icom.)+?B
& ( !icom:!com
| out$(different comments !unf !icom and !com)
& get'
)
& !A+(!unf.!ifol.!icap.!com.!fol)+!B:?COMB
| ( !unf
. !fol
. @(!com:? CAPITAL ?)&!unf
|
. !com
.
)
+ !COMB
: ?COMB
)
)
{Full mappings: stored in the fifth field of an existing term. A term
 that already has a full folding, or a differing comment, is reported.
 For a code point not yet present a new term is added in which the
 simple folding field is the code point itself if the comment contains
 SMALL, and empty otherwise.}
& !(its.Full):?L
& whl
' ( !L:(?unf.?folfull.?com)+?L
& ( !COMB:?A+(!unf.?ifol.?icap.?icom.?ifolfull)+?B
& ( !ifolfull:
| out$(twice full !unf)&get'
)
& ( !icom:!com
| out$(different comments !unf !icom and !com)
& get'
)
& !A+(!unf.!ifol.!icap.!com.!folfull)+!B:?COMB
| ( !unf
. @(!com:? SMALL ?)&!unf
|
. @(!com:? CAPITAL ?)&!unf
|
. !com
. !folfull
)
+ !COMB
: ?COMB
)
)
{LOW, first pass: for every term whose comment contains the word CAPITAL
 preceded by a space, add a term for the simple folding itself, mapping
 it onto itself, with that word removed from the comment. Nothing is
 added if the simple folding is already the first field of a term in
 COMB or LOW, or if the full folding field is a single non-empty atom.}
& 0:?LOW
& !COMB:?L
& whl
' ( !L:(?iunf.?ifol.?icap.?icom.?ifolfull)+?L
& ( @(!icom:?A " CAPITAL" ?B)
& ( !COMB:?+(!ifol.?)+?
| !LOW:?+(!ifol.?)+?
| !ifolfull:%@:~
| (!ifol.!ifol.!icap.str$(!A !B).!ifolfull)+!LOW
: ?LOW
)
|
)
)
{LOW, second pass: the same for terms whose full folding field is a
 single number, using that number as the code point of the new term}
& !COMB:?L
& whl
' ( !L:(?iunf.?ifol.?icap.?icom.?ifolfull)+?L
& ( @(!icom:?A " CAPITAL" ?B)
& !ifolfull:#@
& ( !COMB:?+(!ifolfull.?)+?
| !LOW:?+(!ifolfull.?)+?
| (!ifolfull.!ifolfull.!icap.str$(!A !B).!ifolfull)
+ !LOW
: ?LOW
)
|
)
)
& lst$(LOW,LOW,NEW)
& !LOW+!COMB:?COMB
& lst$(COMB,COMB,NEW)
& !COMB
)
{do: process the contents of UnicodeData.txt, which the caller must have
 stored in the variable txt. The argument is not used.
 Results:
 - files K, let, u2l, l2u, letters, l2us, u2ls and one file per letter
   subcategory, each containing a listing of the variable of that name;
 - cletters.cpp with the table Cletters of ranges of consecutive letters
   and the function isAlpha;
 - the tables l2u, u2l and Cletters, appended to letter.cpp.
 Two defects in the text written to cletters.cpp are corrected here:
 - struct cletter is now written with unsigned bit fields, as in the
   declaration that writeC puts in letter.h. With signed 11 bit fields a
   range above 1023 and the end marker 0x1FFFFF could not be represented.
 - isAlpha no longer steps back to index -1 when its argument lies below
   the first range in the table.}
( do
=
. 0:?p
{The following two strings describe the fields of UnicodeData.txt.
 They are evaluated without any effect.}
& "
0 Code value normative Code value. For characters in the range U+0000..U+FFFD the code value uses a 4-digit hexadecimal format; for characters in the range U+10000..U+FFFFD the code value uses a 5-digit hexadecimal format; and for characters in the range U+100000..U+10FFFD the code value uses a 6-digit hexadecimal format.
1 Character name normative These names match exactly the names published in Chapter 14 of the Unicode Standard, Version 3.0.
2 General Category normative/informative This is a useful breakdown into various character"
"types which can be used as a default categorization in implementations. See below for a brief explanation.
3 Canonical Combining Classes normative The classes used for the Canonical Ordering Algorithm in the Unicode Standard. These classes are also printed in Chapter 4 of the Unicode Standard.
4 Bidirectional Category normative See the list below for an explanation of the abbreviations used in this field. These are the categories required by the Bidirectional Behavior Algorithm in the Unicode Standard. These categories are summarized in Chapter 3 of the Unicode Standard.
5 Character Decomposition Mapping normative In the Unicode Standard, not all of the mappings are full (maximal) decompositions. Recursive application of look-up for decompositions will, in all cases, lead to a maximal decomposition. The decomposition mappings match exactly the decomposition mappings published with the character names in the Unicode Standard.
6 Decimal digit value normative This is a numeric field. If the character has the decimal digit property, as specified in Chapter 4 of the Unicode Standard, the value of that digit is represented with an integer value in this field
7 Digit value normative This is a numeric field. If the character represents a digit, not necessarily a decimal digit, the value is here. This covers digits which do not form decimal radix forms, such as the compatibility superscript digits
8 Numeric value normative This is a numeric field. If the character has the numeric property, as specified in Chapter 4 of the Unicode Standard, the value of that character is represented with an integer or rational number in this field. This includes fractions as, e.g., 1/5 for U+2155 VULGAR FRACTION ONE FIFTH Also included are numerical values for compatibility characters such as circled numbers.
9 Mirrored normative If the character has been identified as a mirrored character in bidirectional text, this field has the value Y; otherwise N. The list of mirrored characters is also printed in Chapter 4 of the Unicode Standard.
10 Unicode 1.0 Name informative This is the old name as published in Unicode 1.0. This name is only provided when it is significantly different from the Unicode 3.0 name for the character.
11 10646 comment field informative This is the ISO 10646 comment field. It appears in parentheses in the 10646 names list, or contains an asterisk to mark an Annex P note.
12 Uppercase Mapping informative Upper case equivalent mapping. If a character is part of an alphabet with case distinctions, and has an upper case equivalent, then the upper case equivalent is in this field. See the explanation below on case distinctions. These mappings are always one-to-one, not one-to-many or many-to-one. This field is informative.
13 Lowercase Mapping informative Similar to Uppercase mapping
14 Titlecase Mapping informative Similar to Uppercase mapping
"
& :?K
& :?u2l
& :?l2u
& :?let
{Take txt apart line by line and each line into its semicolon separated
 fields. The forced failure at the end of the first alternative makes
 the pattern matcher go on to the next line.
 u2l and l2u collect (code.mapped code.difference.hex code.hex mapped code)
 for characters that have a lowercase or an uppercase mapping.
 let collects (code.hex code.name) for characters whose general category
 starts with L. K collects the remainders of those category names, and a
 variable named after each remainder collects (hex code.name).}
& @( !txt
: ( ?
( [!p ?line \n [?p ?
& @( !line
: ( ?"Code value"
";"
?"Character name"
";"
?"General Category"
";"
?"Canonical Combining Classes"
";"
?"Bidirectional Category"
";"
?"Character Decomposition Mapping"
";"
?"Decimal digit value"
";"
?"Digit value"
";"
?"Numeric value"
";"
?Mirrored
";"
?"Unicode 1.0 Name"
";"
?"10646 comment field"
";"
?"Uppercase Mapping"
";"
?"Lowercase Mapping"
";"
?"Titlecase Mapping"
& ( !"Lowercase Mapping":
| ( x2d$!"Code value":?V
. x2d$!"Lowercase Mapping":?W
. !W+-1*!V
. !"Code value"
. !"Lowercase Mapping"
)
!u2l
: ?u2l
)
& ( !"Uppercase Mapping":
| ( x2d$!"Code value":?V
. x2d$!"Uppercase Mapping":?W
. !W+-1*!V
. !"Code value"
. !"Uppercase Mapping"
)
!l2u
: ?l2u
)
& ( @(!"General Category":L ?k)
& (x2d$!"Code value".!"Code value".!"Character name")
!let
: ?let
& ( !K:? !k ?
| !k !K:?K&:?!k
)
& (!"Code value".!"Character name") !!k:?!k
& out$!"Code value"
|
)
| ?
)
)
& ~
)
| ?txt
)
)
{list the collected data to files; a failing listing is ignored}
& (lst$(K,K,NEW)|)
& (lst$(let,let,NEW)|)
& (lst$(u2l,u2l,NEW)|)
& (lst$(l2u,l2u,NEW)|)
& :?letters
& :?Cletters
& 0:?blocks
& \n:?nl
{Join consecutive code points in let into ranges. New entries were put
 in front of let, so the first element of a run is its highest code
 point d. D is counted down while the next element has that code point.
 Each range is written as the first code point in hexadecimal and the
 number of further code points in the range. A line break is written
 once for every eight ranges.}
& whl
' ( !let
: (?d:?D&-1+!D:?D.?)
?
( (!D&-1+!D:?D.?) ?
& ~`
| ?let
)
& 1+!D:?D
& (d2x$!D.!d+-1*!D) !letters:?letters
& !nl "{0x" d2x$!D "," !d+-1*!D "}," !Cletters
: ?Cletters
& ( !blocks:7
& 0:?blocks
& \n:?nl
| 1+!blocks:?blocks&:?nl
)
)
{cletters.cpp: struct declaration, table with end marker, and isAlpha}
& put
$ ( str
$ ( "/*
Based on UnicodeData.txt ("
!(X.UnicodeDataDate)
")
(Does not convert final sigma to non-final sigma)
Structures created with CaseFolding.bra
*/
struct cletter {unsigned int L:21;unsigned int range:11;};
struct cletter Cletters[]={"
)
, "cletters.cpp"
, NEW
)
& put$(str$!Cletters,"cletters.cpp",APP)
& put$("{0x1FFFFF,0}};
","cletters.cpp",APP)
& put
$ ( "
bool isAlpha(int a)
{
static int start_i = 0;
int i;
if(a <= 0)
return false;
if((unsigned int)a < Cletters[start_i].L)
{
for(i=0;;++i)
{
if((unsigned int)a < Cletters[i].L)
{
if(i == 0)
return false; /* a precedes the first range; Cletters[-1] must not be read */
start_i = --i;
return (unsigned int)a <= Cletters[i].L+Cletters[i].range;
}
}
}
else
{
for(i=start_i+1;;++i)
{
if((unsigned int)a < Cletters[i].L)
{
start_i = --i;
return (unsigned int)a <= Cletters[i].L+Cletters[i].range;
}
}
}
return false;
}
"
, "cletters.cpp"
, APP
)
& lst$(letters,letters,NEW)
{comp: the argument is (list of case mappings.name of the table).
 Writes the table to letter.cpp as an array of struct ccaseconv and
 returns a list with one term per table row. Mappings that have the
 same difference and whose code points are a constant 1 or 2 apart are
 combined into one row with a range and an increment.}
& ( comp
=
. !arg:(?arg.?name)
& put
$ ( str
$ ( "
/* Based on http://unicode.org/Public/UNIDATA/UnicodeData.txt "
!(X.UnicodeDataDate)
" */
/* structures created with CaseFolding.bra */
struct ccaseconv "
!name
"[]={\n"
)
, ("letter.cpp",APP)
| (str$(!name ".cpp"):?name,NEW)
)
{pat1 and pat2 match the code point that is 1 or 2 below the previously
 matched one. setpat tries both and stores the one that matches, along
 with the step dif, in pat.}
& (pat1=!d1&-1+!d1:?d1)
& (pat2=!d2&-2+!d2:?d2)
& ( setpat
=
' ( $pat1
& 1:?dif
& '$pat1:(=?pat)
| $pat2
& 2:?dif
& '$pat2:(=?pat)
)
: (=?pat)
)
{the first term of the returned list names the fields of the other terms}
& ( "LETTER DECIMAL"
. "CASE CONVERTED LETTER DECIMAL"
. DELTA
. "LETTER HEX"
. "CASE CONVERTED LETTER HEX"
. INCREMENT
. RANGE
. "LAST LETTER HEX"
. "LAST CASE CONVERTED LETTER HEX"
)
: ?LL
& :?conv
& whl
' ( !setpat
& :?dd
& !arg
: ( ?d
& -1+!d:?d1
& -2+!d:?d2
. ?
. ?df
. ?
)
?
( ( ?ndd
. ?
. !df&!ndd:!pat:?dd
. ?
)
?
& ~`
| ?arg
& ( !dd:
& ( !d
. !df+!d
. !df
. d2x$!d:?let
. d2x$(!df+!d)
. 0:?inc
. 0:?range
. d2x$!d
. d2x$(!df+!d)
)
| ( !dd
. !df+!dd
. !df
. d2x$!dd:?let
. d2x$(!df+!dd)
. !dif:?inc
. !d+-1*!dd:?range
. d2x$!d
. d2x$(!df+!d)
)
)
!LL
: ?LL
& "{0x"
!let
","
!range
","
!inc
","
!df
"},\n"
!conv
: ?conv
)
)
& put$(str$!conv,"letter.cpp"|!name,APP)
& put
$ ("{0x1FFFFF,0,0,0}};
","letter.cpp"|!name,APP)
& !LL
)
{write both case conversion tables and list what comp returned}
& comp$(!l2u.l2u):?l2us
& lst$(l2us,l2us,NEW)
& comp$(!u2l.u2l):?u2ls
& lst$(u2ls,u2ls,NEW)
& whl'(!K:%?k ?K&lst$(!k,!k,NEW))
{append the table Cletters to letter.cpp}
& put
$ ( str
$ ( "
/*
Based on UnicodeData.txt ("
!(X.UnicodeDataDate)
")
(Does not convert final sigma to non-final sigma)
Structures created with CaseFolding.bra
*/
struct cletter Cletters[]={"
)
, "letter.cpp"
, APP
)
& put$(str$!Cletters,"letter.cpp",APP)
& put$("{0x1FFFFF,0}};
","letter.cpp",APP)
&
)
{new: run the whole conversion. The tables are reset, the case folding
 file is read, the tables are combined into (its.COMB), the modulus
 (its.COMBMin) is computed, CaseFolding.html, letter.h and letter.cpp are
 written, and finally UnicodeData.txt is read into txt and processed by
 the do method.
 An alternative that starts with a lone tilde fails at once, so that the
 code after the bar is evaluated.}
( new
=
. ~
| 0
: ?(its.Small2Capital)
: ?(its.Full)
: ?(its.Simple)
: ?(its.Turkic)
: ?(its.Common)
& ( ~
{if reading the configured file fails, files of older versions are tried}
| ( (its.readFile)$!(X.CaseFoldingFileName)
| (its.readFile)$"CaseFolding-6.0.0.txt"
| (its.readFile)$"CaseFolding-5.2.0.txt"
| (its.readFile)$"CaseFolding-3.2.0.txt"
)
& out$readFileDone
& (its.combine)$:?(its.COMB)
& (its.count)$!(its.COMB):?(its.COMBN)
& out$("(its.COMBN)" !(its.COMBN))
{only the modulus returned by store is kept}
& (its.store)$!(its.COMB)
: (?(its.COMBMin).?)
& out$("(its.COMBMin)" !(its.COMBMin))
)
& (its.2html)$
& out$2htmlDone
& (its.writeC)$
& get$("UnicodeData.txt",STR):?txt
& (its.do)$
& DONE
);
{r: rewrite this source file from the definitions in memory.
 CaseFolding.bra is loaded with get, an existing CaseFolding.bak is
 removed and the current file is renamed to CaseFolding.bak. A new
 CaseFolding.bra is then written, consisting of the header comment, the
 listings of X and r, and the statement that starts the program.}
r=
get'"CaseFolding.bra"
& rmv$"CaseFolding.bak"
& ren$("CaseFolding.bra"."CaseFolding.bak")
& put
$ ( "{CaseFolding.bra
Read CaseFolding-7.0.0.txt and convert to Full and Simple tables in C}
"
, "CaseFolding.bra"
, NEW
)
& lst'(X,"CaseFolding.bra",APP)
& put'(\n,"CaseFolding.bra",APP)
& lst'(r,"CaseFolding.bra",APP)
& put$(str$("\nnew'" X "&;\n"),"CaseFolding.bra",APP)
& done;
{Start of the program. NOTE(review): presumably this creates an object
 from X and thereby runs the new method of X - confirm.}
new'X&;
Jump to Line
Something went wrong with that request. Please try again.