Skip to content

Commit

Permalink
upgraded to 15.1.0 & added indic_conjunct_break
Browse files Browse the repository at this point in the history
  • Loading branch information
iwsfutcmd committed Oct 18, 2023
1 parent 4ba7cc1 commit 4848bff
Show file tree
Hide file tree
Showing 15 changed files with 24,434 additions and 23,779 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ target/
*13.0.0.*
*14.0.0.*
*15.0.0.*
*15.1.0.*
tests/data

# Clinic scripts
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 15.1.0
- Upgrade to Unicode 15.1.0
- Add indic_conjunct_break property

## 15.0.0-2
- Add age property

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Currently supported additional Unicode properties:
* Script: `script(chr)`
* Script Extensions: `script_extensions(chr)`
* Block: `block(chr)`
* Indic Conjunct Break: `indic_conjunct_break(chr)`
* Indic Positional Category: `indic_positional_category(chr)`
* Indic Syllabic Category: `indic_syllabic_category(chr)`
* Grapheme Cluster Break: `grapheme_cluster_break(chr)`
Expand Down
50 changes: 35 additions & 15 deletions makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "15.0.0"
UNIDATA_VERSION = "15.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
Expand Down Expand Up @@ -114,15 +114,16 @@

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DBF'),
('4E00', '9FFF'),
('20000', '2A6DF'),
('2A700', '2B739'),
('2B740', '2B81D'),
('2B820', '2CEA1'),
('2CEB0', '2EBE0'),
('30000', '3134A'),
('31350', '323AF'),
('3400', '4DBF'), # CJK Ideograph Extension A
('4E00', '9FFF'), # CJK Ideograph
('20000', '2A6DF'), # CJK Ideograph Extension B
('2A700', '2B739'), # CJK Ideograph Extension C
('2B740', '2B81D'), # CJK Ideograph Extension D
('2B820', '2CEA1'), # CJK Ideograph Extension E
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
('30000', '3134A'), # CJK Ideograph Extension G
('31350', '323AF'), # CJK Ideograph Extension H
]


Expand Down Expand Up @@ -978,7 +979,7 @@ def word_key(a):

def makeunicodeprop(unicode, trace):

dummy = (0, 0, 0, 0, 0, 0, 0, 0)
dummy = (0, 0, 0, 0, 0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
Expand All @@ -993,12 +994,13 @@ def makeunicodeprop(unicode, trace):
script = unicode.scripts.index(record.script)
block = unicode.blocks.index(record.block)
script_extensions = unicode.script_extensions.index(record.script_extensions)
indic_conjunct_break = unicode.indic_conjunct_break.index(record.indic_conjunct_break)
indic_positional = unicode.indic_positional.index(record.indic_positional)
indic_syllabic = unicode.indic_syllabic.index(record.indic_syllabic)
grapheme_cluster_break = unicode.grapheme_cluster_break.index(record.grapheme_cluster_break)
vertical_orientation = unicode.vertical_orientation.index(record.vertical_orientation)
age = unicode.age.index(record.age)
item = (script, block, script_extensions, indic_positional, indic_syllabic, grapheme_cluster_break, vertical_orientation, age)
item = (script, block, script_extensions, indic_conjunct_break, indic_positional, indic_syllabic, grapheme_cluster_break, vertical_orientation, age)
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
Expand All @@ -1018,7 +1020,7 @@ def makeunicodeprop(unicode, trace):
fprint("/* a list of unique unicode property sets */")
fprint("static const _PyUnicodePlus_PropertySet _PyUnicodePlus_Property_Sets[] = {")
for item in table:
fprint(" {%d, %d, %d, %d, %d, %d, %d, %d}," % item)
fprint(" {%d, %d, %d, %d, %d, %d, %d, %d, %d}," % item)
fprint("};")
fprint()

Expand All @@ -1041,6 +1043,12 @@ def makeunicodeprop(unicode, trace):
fprint(" NULL")
fprint("};")

fprint("static const char *_PyUnicodePlus_IndicConjunctBreakNames[] = {")
for name in unicode.indic_conjunct_break:
fprint(" \"%s\"," % name)
fprint(" NULL")
fprint("};")

fprint("static const char *_PyUnicodePlus_IndicPositionalCategoryNames[] = {")
for name in unicode.indic_positional:
fprint(" \"%s\"," % name)
Expand Down Expand Up @@ -1419,11 +1427,23 @@ def __init__(self, version, cjk_check=True):
table[i].east_asian_width = widths[i]
self.widths = widths

for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
indic_conjunct_break = ["None"] * 0x110000

for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if propname == "InCB":
indic_conjunct_break[char] = propinfo[0]
continue

if table[char]:
# Some properties (e.g. Default_Ignorable_Code_Point)
# apply to unassigned code points; ignore them
table[char].binary_properties.add(p)
table[char].binary_properties.add(propname)

self.indic_conjunct_break = ["None"] + sorted(set(indic_conjunct_break) - {"None"})

for i in range(0, 0x110000):
if table[i] is not None:
table[i].indic_conjunct_break = indic_conjunct_break[i]

self.property_value_aliases = {}
for prop, value, *aliases in UcdFile(PROPERTY_VALUE_ALIASES, version):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

setup(
name="unicodedataplus",
version="15.0.0-2",
version="15.1.0",
description="Unicodedata with extensions for additional properties.",
ext_modules=[main_module],
author="Ben Yang",
Expand Down
15 changes: 13 additions & 2 deletions tests/test_unicodedataplus.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
expectedchecksum = '232affd2a50ec4bd69d2482aa0291385cbdefaba'

def test_function_checksum(self):
data = []
Expand Down Expand Up @@ -195,6 +195,7 @@ def test_east_asian_width(self):
self.assertEqual(eaw('\U00020000'), 'W')
self.assertEqual(eaw('\U0002B737'), 'W')
self.assertEqual(eaw('\U00031414'), 'W')
self.assertEqual(eaw('\U0002ECCA'), 'W')

def test_east_asian_width_unassigned(self):
eaw = self.db.east_asian_width
Expand Down Expand Up @@ -223,6 +224,7 @@ def test_script(self):
self.assertEqual(self.db.script('\U00011013'), 'Brahmi')
self.assertEqual(self.db.script('\U00010583'), 'Vithkuqi')
self.assertEqual(self.db.script('\U0001E4E0'), 'Nag_Mundari')
self.assertEqual(self.db.script('\U0002EB01'), 'Han')
self.assertEqual(self.db.script('\u1AFF'), 'Unknown')

def test_block(self):
Expand All @@ -231,13 +233,21 @@ def test_block(self):
self.assertEqual(self.db.block('\U00010107'), 'Aegean Numbers')
self.assertEqual(self.db.block('\U00012FE4'), 'Cypro-Minoan')
self.assertEqual(self.db.block('\U0001D2C2'), 'Kaktovik Numerals')
self.assertEqual(self.db.block('\U0002ED32'), 'CJK Unified Ideographs Extension I')
self.assertEqual(self.db.block('\u1AFF'), 'No_Block')

def test_script_extensions(self):
self.assertEqual(self.db.script_extensions('P'), ['Latn'])
self.assertEqual(self.db.script_extensions('\u0640'), ['Adlm', 'Arab', 'Mand', 'Mani', 'Ougr', 'Phlp', 'Rohg', 'Sogd', 'Syrc'])
self.assertEqual(self.db.script_extensions('\u1AFF'), ['Zzzz'])
self.assertEqual(self.db.script_extensions('\U0001E290'), ['Toto'])
self.assertEqual(self.db.script_extensions('\U0002EE11'), ['Hani'])

def test_indic_conjunct_break(self):
self.assertEqual(self.db.indic_conjunct_break('P'), 'None')
self.assertEqual(self.db.indic_conjunct_break('\u0B4D'), 'Linker')
self.assertEqual(self.db.indic_conjunct_break('\u0AB7'), 'Consonant')
self.assertEqual(self.db.indic_conjunct_break('\u089C'), 'Extend')

def test_indic_positional(self):
self.assertEqual(self.db.indic_positional_category('P'), 'NA')
Expand Down Expand Up @@ -297,14 +307,15 @@ def test_age(self):
self.assertEqual(self.db.age('\u0EAC'), '12.0')
self.assertEqual(self.db.age('\U0002A6D9'), '13.0')
self.assertEqual(self.db.age('\u170D'), '14.0')

self.assertEqual(self.db.age('\U0002EBF9'), '15.1')

def test_total_strokes(self):
self.assertEqual(self.db.total_strokes('P'), 0)
self.assertEqual(self.db.total_strokes('\u694A'), 13)
self.assertEqual(self.db.total_strokes('\U0002003E'), 10)
self.assertEqual(self.db.total_strokes('\U0002B736'), 16)
self.assertEqual(self.db.total_strokes('\U0003137B'), 6)
self.assertEqual(self.db.total_strokes('\U0002ED6B'), 8)

def test_emoji(self):
self.assertEqual(self.db.is_emoji('\u00A9'), True)
Expand Down
33 changes: 33 additions & 0 deletions unicodedataplus/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ typedef struct {
_PyUnicodePlus_Block */
const int script_extensions; /* index into
_PyUnicodePlus_Script_Extensions */
const int indic_conjunct_break; /* index into
_PyUnicodePlus_Indic_Conjunct_Break */
const int indic_positional_category; /* index into
_PyUnicodePlus_Indic_Positional_Category */
const int indic_syllabic_category; /* index into
Expand Down Expand Up @@ -92,6 +94,7 @@ typedef struct change_record {
const unsigned char script_changed;
const unsigned char block_changed;
const unsigned char script_extensions_changed;
const unsigned char indic_conjunct_break_changed;
const unsigned char indic_positional_category_changed;
const unsigned char indic_syllabic_category_changed;
const unsigned char grapheme_cluster_break_changed;
Expand Down Expand Up @@ -601,6 +604,34 @@ unicodedata_UCD_script_extensions_impl(PyObject *self, int chr)
return se_list;
}

/*[clinic input]
unicodedata.UCD.indic_conjunct_break
self: self
chr: int(accept={str})
/
Returns the Indic Conjunct Break category of the character chr as string.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_indic_conjunct_break_impl(PyObject *self, int chr)
/*[clinic end generated code: output=0c9e917743dd8ff3 input=e544000ccfd4e991]*/

{
    const Py_UCS4 code = (Py_UCS4)chr;

    /* Start with the index stored in the property set for this code point. */
    int name_idx = (int)_getpropset_ex(code)->indic_conjunct_break;

    /* When UCD_Check(self) holds, the change record returned by
       get_old_record() may replace that index. */
    if (UCD_Check(self)) {
        const change_record *prev = get_old_record(self, code);

        if (prev->category_changed == 0) {
            name_idx = 0; /* unassigned */
        }
        else if (prev->indic_conjunct_break_changed != 0xFF) {
            /* 0xFF leaves the current index untouched -- presumably the
               "no change" marker; confirm against the table generator. */
            name_idx = prev->indic_conjunct_break_changed;
        }
    }

    /* Translate the index into its name and return it as a new str. */
    return PyUnicode_FromString(_PyUnicodePlus_IndicConjunctBreakNames[name_idx]);
}

/*[clinic input]
unicodedata.UCD.indic_positional_category
Expand Down Expand Up @@ -1418,6 +1449,7 @@ is_unified_ideograph(Py_UCS4 code)
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
(0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
}
Expand Down Expand Up @@ -2012,6 +2044,7 @@ static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_SCRIPT_METHODDEF
UNICODEDATA_UCD_BLOCK_METHODDEF
UNICODEDATA_UCD_SCRIPT_EXTENSIONS_METHODDEF
UNICODEDATA_UCD_INDIC_CONJUNCT_BREAK_METHODDEF
UNICODEDATA_UCD_INDIC_POSITIONAL_CATEGORY_METHODDEF
UNICODEDATA_UCD_INDIC_SYLLABIC_CATEGORY_METHODDEF
UNICODEDATA_UCD_GRAPHEME_CLUSTER_BREAK_METHODDEF
Expand Down
29 changes: 28 additions & 1 deletion unicodedataplus/unicodedata.c.37.h
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,33 @@ unicodedata_UCD_script_extensions(PyObject *self, PyObject *arg)
return return_value;
}

/* Docstring and method-table entry for indic_conjunct_break(chr). */
PyDoc_STRVAR(unicodedata_UCD_indic_conjunct_break__doc__,
"indic_conjunct_break($self, chr, /)\n"
"--\n"
"\n"
"Returns the Indic Conjunct Break category of the character chr as string.");

#define UNICODEDATA_UCD_INDIC_CONJUNCT_BREAK_METHODDEF \
{"indic_conjunct_break", (PyCFunction)unicodedata_UCD_indic_conjunct_break, METH_O, unicodedata_UCD_indic_conjunct_break__doc__},

/* Forward declaration of the implementation; its body is not part of this header. */
static PyObject *
unicodedata_UCD_indic_conjunct_break_impl(PyObject *self, int chr);

/* METH_O entry point. Converts `arg` to an int through PyArg_Parse with the
 * "C" format unit and passes it to the _impl function. If the conversion
 * fails, _impl is not called and NULL is returned. */
static PyObject *
unicodedata_UCD_indic_conjunct_break(PyObject *self, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;

/* The text after ':' is the function name handed to PyArg_Parse. */
if (!PyArg_Parse(arg, "C:indic_conjunct_break", &chr)) {
goto exit;
}
return_value = unicodedata_UCD_indic_conjunct_break_impl(self, chr);

exit:
return return_value;
}

PyDoc_STRVAR(unicodedata_UCD_indic_positional_category__doc__,
"indic_positional_category($self, chr, /)\n"
"--\n"
Expand Down Expand Up @@ -818,4 +845,4 @@ unicodedata_UCD_is_extended_pictographic(PyObject *self, PyObject *arg)
exit:
return return_value;
}
/*[clinic end generated code: output=786b068a16dce3a5 input=a9049054013a1b77]*/
/*[clinic end generated code: output=51f29f2b53840394 input=a9049054013a1b77]*/
38 changes: 37 additions & 1 deletion unicodedataplus/unicodedata.c.h
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,42 @@ unicodedata_UCD_script_extensions(PyObject *self, PyObject *arg)
return return_value;
}

/* Docstring and method-table entry for indic_conjunct_break(chr). */
PyDoc_STRVAR(unicodedata_UCD_indic_conjunct_break__doc__,
"indic_conjunct_break($self, chr, /)\n"
"--\n"
"\n"
"Returns the Indic Conjunct Break category of the character chr as string.");

#define UNICODEDATA_UCD_INDIC_CONJUNCT_BREAK_METHODDEF \
{"indic_conjunct_break", (PyCFunction)unicodedata_UCD_indic_conjunct_break, METH_O, unicodedata_UCD_indic_conjunct_break__doc__},

/* Forward declaration of the implementation; its body is not part of this header. */
static PyObject *
unicodedata_UCD_indic_conjunct_break_impl(PyObject *self, int chr);

/* METH_O entry point. Checks that `arg` is a str of length 1, reads its
 * single character into an int and passes it to the _impl function. Every
 * failed check jumps to `exit` while return_value is still NULL. */
static PyObject *
unicodedata_UCD_indic_conjunct_break(PyObject *self, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;

/* Reject arguments that are not str objects. */
if (!PyUnicode_Check(arg)) {
_PyArg_BadArgument("indic_conjunct_break", "argument", "a unicode character", arg);
goto exit;
}
/* A non-zero result from PyUnicode_READY is treated as failure. */
if (PyUnicode_READY(arg)) {
goto exit;
}
/* Reject strings whose length is not exactly 1. */
if (PyUnicode_GET_LENGTH(arg) != 1) {
_PyArg_BadArgument("indic_conjunct_break", "argument", "a unicode character", arg);
goto exit;
}
/* Read the only character of the string. */
chr = PyUnicode_READ_CHAR(arg, 0);
return_value = unicodedata_UCD_indic_conjunct_break_impl(self, chr);

exit:
return return_value;
}

PyDoc_STRVAR(unicodedata_UCD_indic_positional_category__doc__,
"indic_positional_category($self, chr, /)\n"
"--\n"
Expand Down Expand Up @@ -1101,4 +1137,4 @@ unicodedata_UCD_is_extended_pictographic(PyObject *self, PyObject *arg)
exit:
return return_value;
}
/*[clinic end generated code: output=652110d3ec494e7a input=a9049054013a1b77]*/
/*[clinic end generated code: output=e76814022b816b64 input=a9049054013a1b77]*/
28 changes: 27 additions & 1 deletion unicodedataplus/unicodedata.c.pypy.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,32 @@ unicodedata_UCD_script_extensions(PyObject *self, PyObject *arg)
return return_value;
}

/* Docstring and method-table entry for indic_conjunct_break(chr). */
PyDoc_STRVAR(unicodedata_UCD_indic_conjunct_break__doc__,
"indic_conjunct_break($self, chr, /)\n"
"--\n"
"\n"
"Returns the Indic Conjunct Break category of the character chr as string.");

#define UNICODEDATA_UCD_INDIC_CONJUNCT_BREAK_METHODDEF \
{"indic_conjunct_break", (PyCFunction)unicodedata_UCD_indic_conjunct_break, METH_O, unicodedata_UCD_indic_conjunct_break__doc__},

/* Forward declaration of the implementation; its body is not part of this header. */
static PyObject *
unicodedata_UCD_indic_conjunct_break_impl(PyObject *self, int chr);

/* METH_O entry point. Converts `arg` to an int through PyArg_Parse with the
 * "C" format unit and passes it to the _impl function. If the conversion
 * fails, _impl is not called and NULL is returned. */
static PyObject *
unicodedata_UCD_indic_conjunct_break(PyObject *self, PyObject *arg)
{
PyObject *return_value = NULL;
int chr;

/* The text after ':' is the function name handed to PyArg_Parse. */
if (!PyArg_Parse(arg, "C:indic_conjunct_break", &chr))
goto exit;
return_value = unicodedata_UCD_indic_conjunct_break_impl(self, chr);

exit:
return return_value;
}

PyDoc_STRVAR(unicodedata_UCD_indic_positional_category__doc__,
"indic_positional_category($self, chr, /)\n"
"--\n"
Expand Down Expand Up @@ -790,4 +816,4 @@ unicodedata_UCD_is_extended_pictographic(PyObject *self, PyObject *arg)
exit:
return return_value;
}
/*[clinic end generated code: output=b84333aeb6f0ccb5 input=a9049054013a1b77]*/
/*[clinic end generated code: output=9776cf13724786f8 input=a9049054013a1b77]*/

0 comments on commit 4848bff

Please sign in to comment.