# Chinese character lists

In [1]:
import os, re
import pandas as pd

def stats(series):
    """Print a small table of how many entries fall into each level.

    `series` maps an index entry (one per character) to its level. For every
    distinct level, in sorted order, one row is printed with the number of
    entries at that level ("New") and the running sum so far ("Total").

    Raises AssertionError if the index contains duplicate entries.
    """
    print('    | %5s | %6s | %6s |' % ('Level', 'New', 'Total'))
    print('    | ----- | ------ | ------ |')
    # Each index entry must be listed exactly once.
    assert len(set(series.index)) == len(series)
    per_level = series.value_counts().sort_index()
    running = per_level.cumsum()
    for level, new, total in zip(per_level.index, per_level, running):
        print('    | %5s | %6d | %6d |' % (level, new, total))

def word_to_char_list(trad, level):
    """Map every CJK character to the lowest level of any word containing it.

    Parameters
    ----------
    trad : iterable of str
        Words (or single characters), one per entry.
    level : iterable
        Level of each entry, aligned with `trad`. Levels must be mutually
        comparable because entries are processed in ascending level order.

    Returns
    -------
    pandas.Series
        Series named ``level`` whose index (named ``char``) holds the
        characters in order of first appearance, and whose values are the
        lowest level at which each character occurs.

    Notes
    -----
    Code points below 256 and a fixed set of full-width punctuation marks are
    skipped silently. Other characters below U+4E00 are skipped too, but are
    reported once on stdout as ``Ignored: ...`` (sorted, so the message is
    identical on every run).
    """
    res = {}
    ign = set()
    # Visit entries by ascending level (ties broken by original position), so
    # the first time a character is seen is also its lowest level.
    for l, _, w in sorted((l, i, w) for i, (w, l) in enumerate(zip(trad, level))):
        for c in w:
            if ord(c) < 256 or c in '（）。？！…、，＝':
                continue
            if ord(c) < 0x4E00:
                ign.add(c)
                continue
            # Later (higher-level) occurrences never replace the first one.
            res.setdefault(c, l)
    res = pd.Series(res, name='level').rename_axis('char')
    if ign:
        # Sort: iteration order of a set of strings varies between sessions.
        print('Ignored:', ''.join(sorted(ign)))
    return res

# Unihan table with every column read as a string. Later cells use its `char`
# column together with kGradeLevel, kTGH, kTGHZ2013, kJoyoKanji and
# kKoreanEducationHanja.
unihan_df = pd.read_csv('../unihan/unihan.csv',  dtype='str')

## Traditional (TW)

**`tocfl.csv`: characters by TOCFL 2022/2023 wordlist levels**
- TOCFL level at which a character appears in the wordlist.
- 2562 characters over 6 levels L0=Novice(pre-A1), L1..L5=CEFR A1/A2/B1/B2/C1+

In [2]:
df = pd.read_csv('../tocfl/tocfl.csv')
# The level is the second character of each entry's ID.
tocfl = word_to_char_list(df['Traditional'], df['ID'].str[1:2])
tocfl.to_csv('tocfl.csv')
stats(tocfl)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     0 |    375 |    375 |
    |     1 |    247 |    622 |
    |     2 |    267 |    889 |
    |     3 |    447 |   1336 |
    |     4 |    629 |   1965 |
    |     5 |    597 |   2562 |


**`tocfl-2018.csv`: characters by TOCFL 2018 wordlist levels**

In [3]:
df = pd.read_csv('../tocfl/tocfl-2018.csv')
# Bound to its own name so the 2018 list does not silently replace the
# 2022/2023 `tocfl` series built by the previous cell.
# The level is the second character of each entry's ID.
tocfl_2018 = word_to_char_list(df.Traditional, df.ID.str.slice(1, 2))
tocfl_2018.to_csv('tocfl-2018.csv')
stats(tocfl_2018)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     0 |    326 |    326 |
    |     1 |    162 |    488 |
    |     2 |    298 |    786 |
    |     3 |    518 |   1304 |
    |     4 |    633 |   1937 |
    |     5 |    618 |   2555 |


**`cccc.csv`: characters by CCCC 2022 wordlist levels.**

In [4]:
df = pd.read_csv('../tocfl/cccc.csv')
# The level is the second character of each entry's ID.
cccc = word_to_char_list(df['Traditional'], df['ID'].str[1:2])
cccc.to_csv('cccc.csv')
stats(cccc)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     0 |    462 |    462 |
    |     1 |    299 |    761 |
    |     2 |    285 |   1046 |


**`tbcl-chars.csv`: characters by TBCL character level**
  - From their official character list: https://coct.naer.edu.tw/TBCL/
  - 3133 characters (including variants) / 3100 excl. variants in original list:

In [5]:
df = pd.read_csv('../tbcl/chars-expanded.csv', dtype='str')
# Only the first character of the Level column is used as the level.
tbcl_chars = word_to_char_list(df['char'], df['Level'].str[:1])
tbcl_chars.to_csv('tbcl-chars.csv')
stats(tbcl_chars)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    249 |    249 |
    |     2 |    262 |    511 |
    |     3 |    300 |    811 |
    |     4 |    507 |   1318 |
    |     5 |    604 |   1922 |
    |     6 |    607 |   2529 |
    |     7 |    604 |   3133 |


**`tbcl-words.csv`: characters by TBCL wordlist level**
  - TBCL word level at which a character appears in a wordlist.
  - Note it's a bit different from their official character lists.

In [6]:
df = pd.read_csv('../tbcl/tbcl-expanded.csv', dtype='str')
# Only the first character of the Level column is used as the level.
tbcl_words = word_to_char_list(df['Traditional'], df['Level'].str[:1]).to_frame()
# 1/0 flag: is the character also present in the TBCL character list?
tbcl_words['in_tbcl_chars'] = tbcl_words.index.isin(tbcl_chars.index).astype(int)
tbcl_words.to_csv('tbcl-words.csv')
stats(tbcl_words['level'])

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    361 |    361 |
    |     2 |    263 |    624 |
    |     3 |    264 |    888 |
    |     4 |    532 |   1420 |
    |     5 |    647 |   2067 |
    |     6 |    764 |   2831 |
    |     7 |    743 |   3574 |


In [7]:
# Characters that are in the TBCL character list but never occur in the TBCL
# wordlist. Sorted so the displayed string is the same on every run: the
# iteration order of a set of strings is not stable across kernel sessions.
''.join(sorted(set(tbcl_chars.index) - set(tbcl_words.index)))

'豚喬鶯膿曹綵霞菇値痔橡痘麪芹趾蠶痰荔璧渣藻茉遛昌飩栗猿踪饗劉勘鋁弘妮鑒埔褒槳亭糯萍蟬甄弧蝸洛婷萱鈣嘛閩嘍簿呂韓峯淤蕾齋棕鳩橙絮涮范朶鑄睏矇盧贓鮪刨蔣糗耶鉢眞粵凱芋矢蛀芭餡磁楓昭趙尼笛硯蜀雛餛却痣擧娟烟燴垢郭欸礁鷄億蜘莉躱擂缽疹烘嬸澳蓓楊孟洲篆魷啓俄吳鵲柚菠纜岔愼蘇粧蚵荐强牀魏跛臘砥菲礪箏鯨砂彥'

**`dangdai.csv`, `modernchinese.csv`, `pavc.csv`: characters in Taiwanese Mandarin textbooks**

In [8]:
df = pd.read_csv('../dangdai/dangdai.csv')
# The level is the second character of each entry's ID.
dangdai = word_to_char_list(df['Traditional'], df['ID'].str[1:2])
dangdai.to_csv('dangdai.csv')
stats(dangdai)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    491 |    491 |
    |     2 |    382 |    873 |
    |     3 |    396 |   1269 |
    |     4 |    318 |   1587 |
    |     5 |    231 |   1818 |
    |     6 |    235 |   2053 |


In [9]:
df = pd.read_csv('../modernchinese/modernchinese.csv')
# The level is the second character of each entry's ID.
modernchinese = word_to_char_list(df['Traditional'], df['ID'].str[1:2])
modernchinese.to_csv('modernchinese.csv')
stats(modernchinese)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    656 |    656 |
    |     2 |    401 |   1057 |
    |     3 |    323 |   1380 |
    |     4 |    297 |   1677 |


In [10]:
df = pd.read_csv('../pavc/pavc.csv')
# Keep only entries whose Meaning carries a `PAVC-<x>` tag.
# `na=False` treats a missing Meaning as "no tag" instead of producing a NaN
# mask, and `.copy()` makes the filtered frame independent so the column
# assignment below does not write into a slice (SettingWithCopyWarning).
df = df[df.Meaning.str.contains('PAVC-', na=False)].copy()
# An entry may carry several tags; use the smallest tag character as level.
df['level'] = [min(re.findall('PAVC-(.)', s)) for s in df.Meaning]
pavc = word_to_char_list(df.Traditional, df.level)
pavc.to_csv('pavc.csv')
stats(pavc)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    346 |    346 |
    |     2 |    336 |    682 |
    |     3 |    692 |   1374 |
    |     4 |    454 |   1828 |
    |     5 |    433 |   2261 |


**`tw.csv`: characters by Taiwanese school textbook grade**
  - From https://web.archive.org/web/20111215143140/http://residence.educities.edu.tw/wei3128/currinstruc/wordclause/generwordgrd9.htm
  - https://chinese.stackexchange.com/questions/6200/table-of-chinese-characters-taught-in-primary-school-grouped-by-grade
  - Based on an analysis of textbook content; there is no official/government source for this grading.
  - 5568 characters over 9 grades
  - Related study: [國立臺中教育大學語文教育研究所碩士論文](https://ntcuir.ntcu.edu.tw/bitstream/987654321/6530/1/098NTCTC461033-001.pdf).

In [11]:
src = 'data/tw-wei3128.txt'
if os.path.exists(src):
    tw = {}   # char -> grade level (1-based)
    lv = 0    # level of the section currently being read
    k = 0     # sum of the character counts announced by the section headers

    # Context manager so the file is always closed; explicit encoding so that
    # parsing does not depend on the platform locale.
    # NOTE(review): assumes the source file is UTF-8 — confirm.
    with open(src, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if m := re.search('第.級通用字彙.*共([0-9]+)個', line):
                # Section header: start the next level and record its count.
                lv += 1
                print(line)
                k += int(m[1])
            elif lv >= 1:
                # Character lines: skip code points below 256 and separators.
                for c in line:
                    if ord(c) < 256 or c in ' 、': continue
                    # A character may only be listed once across all levels.
                    assert c not in tw, (line,c)
                    tw[c] = lv

    # Total announced by the headers, printed for comparison with the table.
    print('expected %d\n' % k)

    tw = pd.Series(tw, name='level').rename_axis('char')
    tw.to_csv('tw.csv')

    stats(tw)

第一級通用字彙【共198個】
第二級通用字彙【共166個】
第三級通用字彙【共267個】
第四級通用字彙【共351個】
第五級通用字彙【共421個】
第六級通用字彙【共548個】
第七級通用字彙【共616個】
第八級通用字彙【共956個】
第九級通用字彙【共2042個】
expected 5565

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    198 |    198 |
    |     2 |    175 |    373 |
    |     3 |    266 |    639 |
    |     4 |    350 |    989 |
    |     5 |    420 |   1409 |
    |     6 |    547 |   1956 |
    |     7 |    615 |   2571 |
    |     8 |    955 |   3526 |
    |     9 |   2042 |   5568 |


## Traditional (HK)

**`hk-unihan.csv`: characters by Hong Kong primary grade level from Unihan's kGradeLevel (incomplete)**
- https://www.unicode.org/reports/tr38/#kGradeLevel
- Note: Unihan's list is very incomplete, e.g. 書 is missing.
- Current reference seems to be "香港小學學習字詞表" with 3171 characters.
  - https://www.edbchinese.hk/lexlist_ch/
  - Only two levels per char? KS1 / KS2
  - Non graded list https://zh-tw.sayjack.com/chinese/traditional-chinese/hk3171/level:3171/

In [12]:
# Only Unihan rows that have a kGradeLevel value.
df = unihan_df[unihan_df['kGradeLevel'].notna()]
hk = word_to_char_list(df['char'], df['kGradeLevel'])
hk.to_csv('hk-unihan.csv')
stats(hk)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    460 |    460 |
    |     2 |    510 |    970 |
    |     3 |    543 |   1513 |
    |     4 |    598 |   2111 |
    |     5 |    259 |   2370 |
    |     6 |    262 |   2632 |


**`hk-3171.csv`: current HK primary school list**
  - 香港小學學習字詞表
  - https://www.edbchinese.hk/lexlist_ch/
  - https://zh-tw.sayjack.com/chinese/traditional-chinese/hk3171/level:3171/
  - 3171 characters, no levels

In [13]:
# Existing list, read back with the character as index, just to print counts.
df = pd.read_csv('hk-3171.csv', index_col='char')
stats(df['level'])

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     0 |   3171 |   3171 |


## Simplified

**`tgh.csv`: [Table of General Standard Chinese Characters](https://en.wikipedia.org/wiki/Table_of_General_Standard_Chinese_Characters) ([通用规范汉字表](https://www.gov.cn/gzdt/att/att/site1/20130819/tygfhzb.pdf), Tōngyòng Guīfàn Hànzì Biǎo).**
  - Official standard list of common simplified characters published by the PRC's Ministry of Education (2013); currently the latest such standard.
  - Data from [Unihan](https://www.unicode.org/reports/tr38/#kTGH) database, including pinyin readings.

In [14]:
# Build the TGH table (with pinyin) from the Unihan rows that have a kTGH value.
df = unihan_df.loc[unihan_df['kTGH'].notna(), ['char', 'kTGH', 'kTGHZ2013', 'kIRG_GSource']].copy()
# kTGH has the form "20YY:<index>"; keep the index as an integer.
df['tgh_index'] = df['kTGH'].str.extract('^20[0-9]{2}:([1-9][0-9]{0,3})$', expand=False).astype(int)
# Level 1: indices up to 3500, level 2: up to 6500, level 3: everything above.
df['level'] = 1
df.loc[df['tgh_index'] > 3500, 'level'] = 2
df.loc[df['tgh_index'] > 6500, 'level'] = 3
# Replace each "ddd.ddd[,ddd.ddd...]:" prefix in kTGHZ2013 with a space, leaving the readings.
df['pinyin'] = df['kTGHZ2013'].map(
    lambda s: re.sub(r' *[0-9]{3}\.[0-9]{3}(,[0-9]{3}\.[0-9]{3})*:', ' ', s).strip()
)
df = df[['char', 'level', 'tgh_index', 'pinyin']].sort_values('tgh_index').reset_index(drop=True)
# The table must contain exactly the indices 1..8105, in order.
assert len(df) == 8105
assert list(df['tgh_index']) == list(range(1, 8105+1))
tgh_df = df
tgh = tgh_df.set_index('char')['level']
tgh_df.to_csv('tgh.csv', index=False)
stats(tgh)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |   3500 |   3500 |
    |     2 |   3000 |   6500 |
    |     3 |   1605 |   8105 |


**`hsk30.csv`: characters in HSK 3.0 wordlist by level at which they first appear**


In [15]:
df = pd.read_csv('../hsk30/hsk30-expanded.csv')
# Level at which each simplified character first appears in the wordlist.
hsk30 = word_to_char_list(df['Simplified'], df['Level'])
hsk30.to_csv('hsk30.csv')
stats(hsk30)

Ignored: 〇
    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    300 |    300 |
    |     2 |    300 |    600 |
    |     3 |    300 |    900 |
    |     4 |    300 |   1200 |
    |     5 |    300 |   1500 |
    |     6 |    300 |   1800 |
    |   7-9 |   1171 |   2971 |


In [16]:
# Almost all HSK 3.0 characters come from TGH level 1:
# look up the TGH level of every HSK character and count per level.
hsk30_df = hsk30.to_frame().assign(
    tgh_level=lambda frame: [tgh[c] for c in frame.index]
)
hsk30_df['tgh_level'].value_counts()

tgh_level
1    2938
2      33
Name: count, dtype: int64

**`cn2460.csv`: random unofficial list "小学1-6年级生字表" from the internet**
  - Simplified characters by 6 primary school grades.
  - Mostly TGH level 1 characters with a few dozen level 2 characters.

In [17]:
src = 'data/cn2460.txt'
if os.path.exists(src):
    xx = {}   # char -> grade (1..6)
    lv = 0    # grade currently being read
    k = 0     # sum of the per-semester counts announced in the headers
    tgh_py = tgh_df.set_index('char').pinyin.to_dict()
    # Context manager so the file is always closed; explicit encoding so that
    # parsing does not depend on the platform locale.
    # NOTE(review): assumes the source file is UTF-8 — confirm.
    with open(src, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if m := re.match('(.)年级(上|下).*： *([0-9]+)个.*',line):
                # Semester header: a new grade begins with its 上 semester.
                if m[2] == '上': lv += 1
                k += int(m[3])
            elif lv >= 1:
                # Drop the leading "<n>、" numbering and make sure every
                # "char(pinyin)" item is separated by whitespace.
                line = re.sub('^[0-9]+ *、', '', line).strip()
                line = line.replace(')', ') ')
                for part in line.split():
                    m = re.match(r'^(.)\(([^) ]+)\)$', part)
                    assert m, line
                    c = m[1]
                    if c in xx and xx[c] != lv:
                        # Already recorded for an earlier grade: keep that one.
                        continue
                    if c not in tgh_py:
                        # Not a TGH character: leave it out.
                        continue
                    xx[c] = lv

    xx = pd.Series(xx, name='level').sort_values().rename_axis('char')
    xx.to_csv('cn2460.csv')

    print('Distribution by TGH level:')
    print(pd.Series([tgh.loc[c] for c in set(xx.index)]).value_counts(), '\n')

    stats(xx)

Distribution by TGH level:
1    2424
2      43
Name: count, dtype: int64 

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    347 |    347 |
    |     2 |    640 |    987 |
    |     3 |    594 |   1581 |
    |     4 |    398 |   1979 |
    |     5 |    297 |   2276 |
    |     6 |    191 |   2467 |


## Misc

**`kanji.csv`: Japanese kanji by school grade level.**
- Not exactly Chinese, but close to traditional characters, with levels specified by the government.
- Grade levels (2020) from [Wikipedia](https://en.wikipedia.org/wiki/List_of_j%C5%8Dy%C5%8D_kanji), characters cross-referenced against Unihan's kJoyoKanji.
- 2136 characters ([jōyō kanji](https://en.wikipedia.org/wiki/List_of_j%C5%8Dy%C5%8D_kanji)): 1026 over 6 elementary school grades ([kyōiku kanji](https://en.wikipedia.org/wiki/Ky%C5%8Diku_kanji)) + 1110 secondary school kanji (stored as level 7).
- https://en.wikipedia.org/wiki/List_of_j%C5%8Dy%C5%8D_kanji
- https://en.wikipedia.org/wiki/Ky%C5%8Diku_kanji

In [18]:
# The jōyō kanji table is the third table parsed from the Wikipedia page.
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_j%C5%8Dy%C5%8D_kanji')[2]
df = df.rename(columns={
    'New (Shinjitai)': 'shinjitai',
    'Old (Kyūjitai)': 'kyujitai',
    'Grade': 'level',
})
# Keep only the text before the first non-breaking space in each cell.
df['shinjitai'] = df.shinjitai.str.replace('\xa0.*', '',regex=True)
df['kyujitai'] = df.kyujitai.fillna('').str.replace('\xa0.*', '',regex=True)
# Grade "S" is stored as level 7 so that every level is a digit.
df['level'] = df.level.str.replace('S', '7', regex=False)
# The table must list exactly the characters Unihan marks as kJoyoKanji 2010.
assert set(df.shinjitai) == set(unihan_df[lambda X: X.kJoyoKanji == '2010'].char)
df['char'] = df['shinjitai']
df = df[['char', 'kyujitai', 'level']].set_index('char')
df.to_csv('kanji.csv')
kanji = df.level
stats(kanji)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |     80 |     80 |
    |     2 |    160 |    240 |
    |     3 |    200 |    440 |
    |     4 |    202 |    642 |
    |     5 |    193 |    835 |
    |     6 |    191 |   1026 |
    |     7 |   1110 |   2136 |


**`hanja.csv`: Korean educational hanja.**
  - https://en.wikipedia.org/wiki/Basic_Hanja_for_educational_use
  - Characters from Unihan's kKoreanEducationHanja, grades and some variant characters from [Wikipedia](https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD_%EC%A4%91%EA%B3%A0%EB%93%B1%ED%95%99%EA%B5%90_%EA%B8%B0%EC%B4%88%ED%95%9C%EC%9E%90_%EB%AA%A9%EB%A1%9D)
  - 1800 characters over two levels (grade 7-9 / 10-12)

In [19]:
# Second table parsed from the Korean Wikipedia page; one column per school level.
df = pd.read_html('https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD_%EC%A4%91%EA%B3%A0%EB%93%B1%ED%95%99%EA%B5%90_%EA%B8%B0%EC%B4%88%ED%95%9C%EC%9E%90_%EB%AA%A9%EB%A1%9D')[1]
# Concatenate each column into one string of characters, dropping the
# non-breaking spaces, spaces and middle dots between them.
l1, l2 = (
    re.sub('[\xa0 ·]', '', ''.join(df[column].fillna('')))
    for column in ('중학교용', '고등학교용')
)
# 900 distinct characters per column, and no character shared between them.
assert len(l1) == len(set(l1)) == 900
assert len(l2) == len(set(l2)) == 900
assert len(set(l1+l2)) == 1800
l1, l2 = set(l1), set(l2)

# Characters that Unihan marks as kKoreanEducationHanja 2007.
unihan = set(unihan_df.loc[unihan_df['kKoreanEducationHanja'] == '2007', 'char'])
assert len(unihan) == 1800

# Show where the two sources disagree.
print('wikipedia missing:', unihan-(l1|l2))
print('wikipedia extra:  ', (l1|l2)-unihan)

wikipedia missing: {'茲', '戲', '産', '晚'}
wikipedia extra:   {'晩', '玆', '戱', '產'}


In [20]:
# Unihan character -> the variant form used by the Wikipedia table instead.
variants = {'茲':'玆', '晚':'晩', '戲':'戱', '産':'產'}

rows = []
for c in sorted(unihan):
    variant = variants.get(c, '')
    in_l1 = c in l1 or variant in l1
    in_l2 = c in l2 or variant in l2
    # Every Unihan character must be found, directly or through its variant.
    assert in_l1 or in_l2
    rows.append({'char': c, 'variant': variant, 'level': 1 if in_l1 else 2})

df = pd.DataFrame(rows).sort_values(['level', 'char']).set_index('char')
df.to_csv('hanja.csv')
hanja = df['level']
stats(hanja)

    | Level |    New |  Total |
    | ----- | ------ | ------ |
    |     1 |    900 |    900 |
    |     2 |    900 |   1800 |


**kanji/hanja vs trad/simp**

In [21]:
# Union of all characters found in the traditional-character lists written above.
trad = set()
for name in ['tocfl', 'tocfl-2018', 'cccc', 'tbcl-chars', 'tbcl-words', 'dangdai', 'modernchinese', 'pavc', 'tw']:
    trad.update(pd.read_csv(f'{name}.csv').char)
simp = set(pd.read_csv('tgh.csv').char)
# Split into traditional-only, simplified-only and common characters.
cmn = trad & simp
trad.difference_update(cmn)
simp.difference_update(cmn)
print('trad %d / simp %d / common %d' % (len(trad), len(simp), len(cmn)))
# For each list, count how its characters fall into those three groups (or none).
for fmt, series in [
    ('kanji: trad %d / simp %d / common %d / other %d', kanji),
    ('hanja: trad %d / simp %d / common %d / other %d', hanja),
]:
    chars = set(series.index)
    print(fmt % (len(chars & trad), len(chars & simp), len(chars & cmn), len(chars - (trad | simp | cmn))))

trad 2113 / simp 4592 / common 3513
kanji: trad 505 / simp 104 / common 1290 / other 237
hanja: trad 632 / simp 4 / common 1129 / other 35
