## Cloze deck from example sentences

A cloze deck built from the example sentences. Each card shows a sentence in which the term being introduced is clozed out, plays audio of the full sentence, and offers an optional hint that reveals the meaning of the missing word.

In [1]:
import os, re, glob, random, json
import pandas as pd
import genanki

def norm_text(text):
    """Return *text* with spaces, common punctuation and speaker labels removed.

    Strips the characters space, ``？``, ``。``, ``，``, ``！``, ``.`` and ``,``
    as well as the dialogue prefixes ``A:`` and ``B:``, so that differently
    punctuated copies of a sentence compare equal.
    """
    noise = re.compile('[ ？。，！.,]|[AB]:')
    return noise.sub('', text)

# Table exported from HyperTTS; the code below relies on its 'Text' and 'Hash' columns.
hypertts_df = pd.read_csv('../Anki2/hypertts.tsv', sep='\t')

# Lookup from normalized sentence text to the list of row labels carrying that text.
normalized_text = hypertts_df.Text.apply(norm_text)
indexed_df = hypertts_df.assign(idx=hypertts_df.index)
hypertts_mp = indexed_df.groupby(normalized_text).idx.apply(list)

# Directory holding the generated hypertts-<Hash>.mp3 files.
hypertts_dir = '../Anki2/tts/collection.media'

# Fixed seed so the choice between several recordings is reproducible.
rnd = random.Random(42)

def tts_file_for_text(text):
    """Return the path of an existing TTS recording for *text*, or None.

    The text is normalized with ``norm_text`` and looked up in ``hypertts_mp``.
    Every matching row yields a candidate ``hypertts-<Hash>.mp3`` path under
    ``hypertts_dir``; candidates missing on disk are dropped. When several
    remain, one is picked with the seeded generator ``rnd``.
    """
    key = norm_text(text)
    candidates = []
    for i in hypertts_mp.get(key, []):
        path = "%s/hypertts-%s.mp3" % (hypertts_dir, hypertts_df.loc[i, 'Hash'])
        if os.path.exists(path):
            candidates.append(path)
    if not candidates:
        return None
    return rnd.choice(candidates)

#tts_file_for_text('這隻小鳥會飛嗎？')

In [3]:
# Load the vocabulary list; every column is read as text and missing cells become ''.
mc_df = pd.read_csv('modernchinese.csv', dtype='str').fillna('')
# Media bundled with the deck: starts with the font, audio files are appended below.
cloze_media = ['data/media/_MoeStandardKai.ttf']
text_len = 0  # total number of characters over all accepted example sentences
rows = []

# Letters appended to the row ID when a row carries more than one example.
ex_suffixes = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

for row in mc_df.itertuples():
    if not row.Examples:
        continue

    # The terms to mask depend only on the row, so compute them once per row
    # instead of re-parsing the Variants JSON for every example.
    if row.Variants:
        # NOTE(review): assumes Variants is a JSON list whose items have the
        # written form at index 0 - confirm against the CSV producer.
        variants = json.loads(row.Variants)
        base_terms = {v[0] for v in variants}
    else:
        variants = []
        base_terms = {row.Traditional}
    # An empty term is a substring of every sentence and would put a cloze
    # marker at every position, so it must never be treated as a match.
    base_terms.discard('')

    row_examples = row.Examples.split('<br>')
    for ex_i, ex in enumerate(row_examples):
        ex = ex.strip()
        masked = base_terms
        found = any(s in ex for s in masked)
        # Separable words ('-sep' in POS) may appear split in two inside the
        # sentence: accept the example when the first character and the rest
        # each occur exactly once, in that order, and cloze both parts.
        if (not found and '-sep' in row.POS and len(variants) <= 1
                and len(row.Traditional) >= 2):
            a, b = row.Traditional[0], row.Traditional[1:]
            if ex.count(a) == 1 and ex.count(b) == 1 and ex.index(a) < ex.index(b):
                masked = [a, b]
                found = True
        if not found:
            continue

        text_len += len(ex)

        # Longest alternatives first so a longer term wins over its own prefix.
        # Terms are escaped because they are literal text, not regex syntax.
        pattern = '|'.join(
            re.escape(s) for s in sorted(masked, key=len, reverse=True)
        )
        clozed = re.sub('(%s)' % pattern, '{{c1::\\1}}', ex)

        ex_id = row.ID
        if len(row_examples) != 1:
            if ex_i >= len(ex_suffixes):
                raise ValueError(
                    'row %s has more than %d examples; cannot build a unique ID suffix'
                    % (row.ID, len(ex_suffixes))
                )
            ex_id += ex_suffixes[ex_i]

        # Speaker labels are not looked up as part of the sentence audio text.
        tts_text = re.sub(' *[AB]: *', '', ex)
        audio_file = tts_file_for_text(tts_text)
        if audio_file:
            cloze_media.append(audio_file)
            audio = '[sound:%s]' % os.path.basename(audio_file)
        else:
            audio = ''

        rows.append({
            'ID': ex_id,
            'Text': clozed,
            'Traditional': row.Traditional,
            'Pinyin': row.Pinyin,
            'POS': row.POS,
            'Meaning': row.Meaning,
            'Audio': audio,
        })

cloze_df = pd.DataFrame(rows)
cloze_df.to_csv('cloze.csv', index=False)
print('%d examples, %d chars total' % (len(cloze_df), text_len))

# Note fields, in the order they are stored on every note.
cols = ['ID', 'Text', 'Traditional', 'Pinyin', 'POS', 'Meaning', 'Audio']

# Read the shared stylesheet through a context manager so the file handle is
# closed right away instead of being left to the garbage collector.
with open('../dangdai/dangdai.css') as css_file:
    base_css = css_file.read()

cloze_model = genanki.Model(
    1696565464,
    name='ModernChineseCloze',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'ModernChineseCloze',
        # Front: clozed sentence, audio, hint with the meaning, type-in box.
        'qfmt': '''
<div class="hanzi" style="font-size: 35px">{{cloze:Text}}</div>
<br>
<div>{{Audio}}</div>
<br>
<div>{{hint::Meaning}}</div>
<br>
<div>{{type:cloze:Text}}</div>
'''.strip(),
        # Back: revealed sentence plus the term, pinyin, part of speech and
        # meaning. The last <div> is now closed; it was left open before.
        'afmt': '''
<div class="hanzi" style="font-size: 35px">{{cloze:Text}}</div>
<br>
<div>{{Audio}}</div>
<br>
<div>{{type:cloze:Text}}</div>
<hr>
<div>{{Traditional}} [{{Pinyin}}]</div>
<div>{{#POS}}({{POS}}) {{/POS}}{{Meaning}}</div>
'''.strip()
    }],
    css=base_css + '''
.cloze {
 color: blue;
}
.nightMode .cloze {
 color: lightblue;
}
''',
    model_type=1,  # cloze
)

cloze_deck = genanki.Deck(1696565465, name='modernchinese-cloze', description='Modern Chinese examples cloze deck')

# One note per generated example. The GUID is derived from the example ID so
# that regenerating the deck yields the same GUID for the same example.
for row in cloze_df.to_dict(orient='records'):
    note = genanki.Note(
        model=cloze_model,
        fields=[row[c] for c in cols],
        guid=genanki.guid_for('modernchinese-cloze', row['ID'])
    )
    cloze_deck.add_note(note)

# Delete any package left over from a previous run (IPython shell escape).
!rm -f cloze.apkg
# Write the deck to cloze.apkg together with the media listed in cloze_media
# (the font plus the audio files collected while building the rows).
genanki.Package(cloze_deck, media_files=cloze_media).write_to_file('cloze.apkg')

3217 examples, 77273 chars total
