# Chinese numbers listening practice deck

An Anki deck of 1000 numbers for practicing recognizing them from audio. Most (70%) are in the 0-999 range, with some bigger numbers as well.

Includes multiple variations you could encounter in the real world, e.g. 二百 / 兩百, 三百一 / 三百一十, 五百六十四 / 五六四.

Good quality neural TTS audio with a variety of voices.

Release link: https://ankiweb.net/shared/info/1692023103

---

This notebook generates an initial deck without audio. Use the [HyperTTS](https://ankiweb.net/shared/info/111623432) add-on to add audio after importing the deck into Anki.

In [1]:
!pip install -q genanki

import genanki
import pandas as pd
import re
import random

# Fixed seed for the random.Random instance that samples which generated
# numbers end up in the deck, so repeated runs pick the same cards.
SEED = 42

# Translation table for the numeral characters whose traditional form differs
# from the simplified one.
_TRADITIONAL_TO_SIMPLIFIED = str.maketrans({'兩': '两', '萬': '万', '億': '亿'})


def to_simplified(hanzi):
    """Return `hanzi` with 兩/萬/億 replaced by their simplified forms.

    Every other character is passed through unchanged.
    """
    return hanzi.translate(_TRADITIONAL_TO_SIMPLIFIED)

# Translation table for the numeral characters whose simplified form differs
# from the traditional one.
_SIMPLIFIED_TO_TRADITIONAL = str.maketrans({'两': '兩', '万': '萬', '亿': '億'})


def to_traditional(hanzi):
    """Return `hanzi` with 两/万/亿 replaced by their traditional forms.

    Every other character is passed through unchanged.
    """
    return hanzi.translate(_SIMPLIFIED_TO_TRADITIONAL)

# Pinyin reading for every numeral character used by the deck. Both the
# traditional and the simplified forms of 兩/萬/億 are listed so the function
# works on either script.
_PINYIN_BY_HANZI = {
    '零': 'líng',
    '一': 'yī',
    '二': 'èr',
    '兩': 'liǎng',
    '两': 'liǎng',
    '三': 'sān',
    '四': 'sì',
    '五': 'wǔ',
    '六': 'liù',
    '七': 'qī',
    '八': 'bā',
    '九': 'jiǔ',
    '十': 'shí',
    '百': 'bǎi',
    '千': 'qiān',
    '萬': 'wàn',
    '万': 'wàn',
    '億': 'yì',
    '亿': 'yì',
}


def to_pinyin(hanzi):
    """Return the pinyin for a string of Chinese numeral characters.

    The syllables are concatenated without separators.

    Raises:
        KeyError: if `hanzi` contains a character that is not a known numeral.
    """
    return ''.join(_PINYIN_BY_HANZI[c] for c in hanzi)

In [2]:
# number => hanzi reading => accumulated sampling weight (float). The sampling
# step compares each weight against a random draw to decide inclusion.
g_hanzi = {}

def gen(pattern, template, sample=1.0, cnt=None):
    """Generate hanzi readings for all numbers matching `pattern`.

    Every integer in the scanned range (0-999,999 when `template` contains
    '萬', otherwise 0-9,999) whose decimal string fully matches `pattern` is
    rendered through `template`, and the result is accumulated into the
    module-level `g_hanzi` table. A one-line summary is printed.

    Args:
        pattern: regular expression with one capture group per placeholder in
            `template`; each group must capture a value in 0-10.
        template: output string. '#' is replaced by the numeral for the next
            capture group. '$' does the same, but when the group is '2' a
            second variant is produced in which every such '$' becomes 兩
            instead of 二. All other characters are copied verbatim.
        sample: weight added to `g_hanzi` for each generated reading.
        cnt: if given (and non-zero), overrides `sample` with
            `cnt / number_of_readings`, so the weights of this call sum to
            `cnt`.

    A pattern that matches nothing leaves `g_hanzi` untouched and only prints
    a notice (previously this raised ZeroDivisionError or IndexError).
    """
    regex = re.compile(pattern)
    numerals = '零一二三四五六七八九十'
    limit = 1000000 if '萬' in template else 10000
    readings = set()

    for num in range(limit):
        match = regex.fullmatch(str(num))
        if not match:
            continue

        # Render twice: once with 二 everywhere, once with 兩 for '$' == 2.
        # When no '$' captures a 2 both passes agree and the set dedupes them.
        for use_liang in (False, True):
            parts = []
            group = 1
            for c in template:
                if c not in ('#', '$'):
                    parts.append(c)
                    continue
                value = match[group]
                group += 1
                if c == '$' and value == '2' and use_liang:
                    parts.append('兩')
                else:
                    parts.append(numerals[int(value)])
            readings.add((num, ''.join(parts)))

    output = sorted(readings)
    if not output:
        # Nothing matched: avoid dividing by zero / indexing an empty list.
        print('%5d %s  (no matches)' % (0, pattern))
        return

    if cnt:
        sample = cnt / len(output)

    for num, hanzi in output:
        weights = g_hanzi.setdefault(num, {})
        weights[hanzi] = weights.get(hanzi, 0.0) + sample

    # Summary: count, pattern, the median reading as an example, expected total.
    print('%5d %s  %s => %.1f' % (len(output), pattern, output[len(output)//2], len(output)*sample))

# Every gen() invocation as (pattern, template, keyword arguments).
# The list order is the call order.
GEN_SPECS = [
    # 0-9,999: regular readings, including abbreviated trailing units
    ('([0-9]|10)', '#', {}),
    ('1([1-9])', '十#', {}),
    ('([2-9])0', '#十', {}),
    ('([2-9])([1-9])', '#十#', {}),
    ('([1-9])00', '$百', {}),
    ('([1-9])0([1-9])', '$百零#', {'cnt': 50}),
    ('([1-9])([1-9])0', '$百#', {'cnt': 50}),
    ('([1-9])([1-9])0', '$百#十', {'cnt': 50}),
    ('([1-9])([1-9])([1-9])', '$百#十#', {'cnt': 450}),
    ('([1-9])00([1-9])', '$千零#', {'cnt': 25}),
    ('([1-9])0([1-9])0', '$千零#十', {'cnt': 25}),
    ('([1-9])([1-9])00', '$千#', {}),
    ('([1-9])([1-9])00', '$千$百', {'sample': .5}),
    ('([1-9])([1-9])0([1-9])', '$千$百零#', {'cnt': 25}),

    # Some direct digit-by-digit pronunciation samples
    ('([1-9])([0-9])([1-9])', '###', {'cnt': 25}),
    ('([1-9])([0-9])([0-9])([0-9])', '####', {'cnt': 25}),

    # Mid scale 10k-1M examples with 萬 and 1-2 nonzero digits
    ('([1-9]|10)0000', '$萬', {'cnt': 3}),
    ('([1-9]|10)([1-9])000', '$萬#', {'cnt': 5}),
    ('([1-9]|10)([1-9])000', '$萬#千', {'cnt': 5}),
    ('([1-9])000([1-9])', '$萬零#', {'cnt': 5}),
    ('([1-9])00([1-9])0', '$萬零#十', {'cnt': 5}),
    ('([1-9])0([1-9])00', '$萬零#百', {'cnt': 5}),
    ('([2-9])([1-9])0000', '#十#萬', {'cnt': 5}),
    ('([2-9])0([1-9])000', '#十萬#', {'cnt': 5}),
]

for spec_pattern, spec_template, spec_kwargs in GEN_SPECS:
    gen(spec_pattern, spec_template, **spec_kwargs)

# Large scale numbers 1M-1B: just the base unit at each scale for introduction.
# These are entered directly with weight 1.0 instead of going through gen().
g_hanzi.update({
    1000000: {'一百萬': 1.0},
    10000000: {'一千萬': 1.0},
    100000000: {'一億': 1.0},
    1000000000: {'十億': 1.0},
})

   11 ([0-9]|10)  (5, '五') => 11.0
    9 1([1-9])  (15, '十五') => 9.0
    8 ([2-9])0  (60, '六十') => 8.0
   72 ([2-9])([1-9])  (61, '六十一') => 72.0
   10 ([1-9])00  (500, '五百') => 10.0
   90 ([1-9])0([1-9])  (501, '五百零一') => 50.0
   90 ([1-9])([1-9])0  (510, '五百一') => 50.0
   90 ([1-9])([1-9])0  (510, '五百一十') => 50.0
  810 ([1-9])([1-9])([1-9])  (511, '五百一十一') => 450.0
   90 ([1-9])00([1-9])  (5001, '五千零一') => 25.0
   90 ([1-9])0([1-9])0  (5010, '五千零一十') => 25.0
   90 ([1-9])([1-9])00  (5100, '五千一') => 90.0
   98 ([1-9])([1-9])00  (5200, '五千二百') => 49.0
  882 ([1-9])([1-9])0([1-9])  (5201, '五千二百零一') => 25.0
  810 ([1-9])([0-9])([1-9])  (551, '五五一') => 25.0
 9000 ([1-9])([0-9])([0-9])([0-9])  (5500, '五五零零') => 25.0
   11 ([1-9]|10)0000  (50000, '五萬') => 3.0
   99 ([1-9]|10)([1-9])000  (55000, '五萬五') => 5.0
   99 ([1-9]|10)([1-9])000  (55000, '五萬五千') => 5.0
   90 ([1-9])000([1-9])  (50001, '五萬零一') => 5.0
   90 ([1-9])00([1-9])0  (50010, '五萬零一十') => 5.0
   90 ([1-9])0([1-9])00  (50100, '五萬零一百') => 5.0

In [3]:
# Sample numbers from generated lists

def _sampled_rows(weights_by_number, rng):
    """Yield one note row per (number, reading) pair that survives sampling.

    Numbers are visited in ascending order and readings in the order stored
    in `weights_by_number`; a pair is kept when a fresh `rng.random()` draw
    is below its weight.
    """
    for number in sorted(weights_by_number):
        for reading, weight in weights_by_number[number].items():
            if not rng.random() < weight:
                continue
            yield {
                'Arabic': str(number),
                'Traditional': reading,
                'Simplified': to_simplified(reading),
                'Pinyin': to_pinyin(reading),
                'Audio': '',  # to be filled with hypertts
            }


rng = random.Random(SEED)

df = pd.DataFrame(list(_sampled_rows(g_hanzi, rng)))
df.to_csv('numbers.csv', index=False)
print(len(df))

1000


In [4]:
# Note fields, in the same order as the columns of `df`.
FIELD_NAMES = ('Arabic', 'Traditional', 'Simplified', 'Pinyin', 'Audio')

# Front of the card: the audio, with a {{^Audio}} section wrapping the pinyin.
QUESTION_TEMPLATE = '{{Audio}} {{^Audio}}{{Pinyin}}{{/Audio}}'

# Back of the card: the Arabic number (reformatted by the script through
# Intl.NumberFormat('en-US')), the traditional hanzi and the pinyin.
ANSWER_TEMPLATE = '''{{FrontSide}}

<hr id=answer>

<div id="arabic">{{Arabic}}</div>
<script>
document.getElementById('arabic').innerText =
  new Intl.NumberFormat('en-US').format(
    document.getElementById('arabic').innerText);
</script>
<br>
<div>{{Traditional}}</div>
<div style="color: #9999">[{{Pinyin}}]</div>
'''

model = genanki.Model(
    1693900001,
    'Numbers',
    fields=[{'name': field_name} for field_name in FIELD_NAMES],
    templates=[{
        'name': 'Numbers',
        'qfmt': QUESTION_TEMPLATE,
        'afmt': ANSWER_TEMPLATE,
    }],
)

deck = genanki.Deck(1693900002, name='Numbers')
for record in df.to_records(index=False):
    note = genanki.Note(model=model, fields=record)
    deck.add_note(note)

package = genanki.Package(deck)
package.write_to_file('numbers-noaudio.apkg')

print(len(deck.notes))

# f 6200 5517

1000
