##  Whisper experiments to split audio

In [None]:
%%bash -e
# Make sure downloads/ exists: link to the copy one directory up when it is
# available, otherwise create an empty local directory.
if [[ ! -d downloads/ ]]; then
  if [[ -d ../downloads/modernchinese ]]; then
    ln -s ../downloads/modernchinese downloads
  else
    mkdir -p downloads
  fi
fi

# Collect the audio files used below into downloads/shengzi/. One expected
# file doubles as the "already copied" marker.
if [[ ! -f downloads/shengzi/B4-16-2-2.mp3 ]]; then
  mkdir -p downloads/shengzi
  /bin/cp -f downloads/B?L??/B1-00-13.mp3 downloads/shengzi/
  /bin/cp -f downloads/B?L??/B?*-?-2.mp3 downloads/shengzi/
fi

In [None]:
# Install the packages used below: Whisper (speech-to-text) and librosa
# (audio loading and silence detection). The Whisper install runs with a
# raised pip timeout.
!pip install --default-timeout=10000 openai-whisper
!pip install librosa

In [None]:
import os, re, glob, json, sys
import pandas as pd
import numpy as np
import librosa
import opencc
# Let pandas print long frames in full when inspecting results.
pd.options.display.max_rows = 1000

# OpenCC converter built from the 'tw2s' configuration.
opencc_tw2s = opencc.OpenCC('tw2s')

# Master term table; later cells read its ID, Traditional, Simplified, Pinyin
# and Variants columns.
mc_df = pd.read_csv('modernchinese.tsv', sep='\t')

# Recordings to transcribe: the 144 per-lesson files in sorted order, with the
# book 1 lesson 0 recording placed first.
shengzi_mp3 = sorted(glob.glob('downloads/shengzi/B[1234]-*-?-2.mp3'))
assert len(shengzi_mp3) == 144
shengzi_mp3.insert(0, 'downloads/shengzi/B1-00-13.mp3')

# cn_numbers[n] is the Chinese numeral for n, for every n from 0 to 99.
# The tens digit is omitted for 10-19 and the ones digit for round tens.
digits = '零一二三四五六七八九'
cn_numbers = list(digits)
for tens in range(1, 10):
    tens_part = ('' if tens == 1 else digits[tens]) + '十'
    for ones in range(10):
        cn_numbers.append(tens_part + ('' if ones == 0 else digits[ones]))

In [None]:
import whisper
model = whisper.load_model("medium")

# Transcribe each recording once; the raw Whisper result is cached as JSON
# next to the mp3, and files that already have a cache are skipped.
for mp3_path in shengzi_mp3:
    json_path = mp3_path.replace('.mp3', '.json')
    if not os.path.exists(json_path):
        print(mp3_path)
        result = model.transcribe(mp3_path, language='zh')
        with open(json_path, 'w') as out:
            json.dump(result, out)

In [None]:
import collections
# Two-way index between written forms and lower-cased pinyin, built from
# three dictionaries: h2py maps a form to its readings, py2h maps a reading
# to every form (simplified and traditional) that carries it.
h2py = collections.defaultdict(set)
py2h = collections.defaultdict(set)

for fn in ['../cedict/cedict.csv', '../dangdai/dangdai.tsv', 'modernchinese.tsv']:
    delimiter = '\t' if '.tsv' in fn else ','
    for row in pd.read_csv(fn, sep=delimiter).itertuples():
        pinyin = row.Pinyin.lower()
        for hanzi in (row.Traditional, row.Simplified):
            h2py[hanzi].add(pinyin)
        py2h[pinyin].update((row.Simplified, row.Traditional))

def find_homonyms(h):
    """Return every written form sharing at least one reading with *h*.

    The readings recorded for ``h`` are looked up in ``h2py``; the result is
    the union of the forms listed under each of those readings in ``py2h``.
    An unknown ``h`` yields an empty set.
    """
    return {form for reading in h2py[h] for form in py2h[reading]}

# Spot check: as the cell's last expression, the resulting set is displayed.
find_homonyms('他')

In [None]:
# Manual corrections for recurring mis-transcriptions. Each line of the table
# is "<transcribed text> <accepted replacement> ...". Every value is a list
# because the matching loop feeds it to ``variants.extend(...)``.
transcription_extra = '''
三聚不离本行 三句不离本行
大兜 大都
衣衣 一一
'''.strip().split('\n')
transcription_extra = {s.split()[0]: s.split()[1:] for s in transcription_extra}
# Keys containing a space cannot be expressed in the whitespace-split table
# above. Their values must be lists as well: a bare string would be spread
# into single characters by ``extend`` and never match the intended term.
transcription_extra['刀 刀子'] = ['刀（子）']
transcription_extra['叉 叉子'] = ['叉（子）']

In [None]:
# Running totals over all transcripts: expected terms vs. terms recognised.
tot_terms = tot_rec = 0

# One dict per recognised term; later cells add further keys to these dicts.
rec_terms = []

def srt_timestamp(seconds: float) -> str:
    """Format a time offset given in seconds as ``H:MM:SS,mmm``.

    The value is rounded to the nearest millisecond first. Hours are not
    zero-padded; minutes and seconds use two digits, milliseconds three.

    Args:
        seconds: Non-negative offset from the start of the recording.

    Returns:
        The formatted timestamp, e.g. ``"1:01:01,500"`` for ``3661.5``.
    """
    total_ms = round(seconds * 1000.0)
    # One hour is 3,600,000 ms; the previous code subtracted only 360,000 per
    # hour, which corrupted every timestamp at or beyond the one-hour mark.
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)
    return f"{hours}:{minutes:02d}:{secs:02d},{millis:03d}"

# Align every cached Whisper transcript with the term list of its lesson part,
# collect the matched segments in rec_terms, and write an annotated .srt file
# next to each transcript.
for fidx, filename in enumerate(sorted(glob.glob('downloads/shengzi/*.json'))):
    # Context manager so the transcript's file handle is closed right away.
    with open(filename, 'r', encoding='utf-8') as fjson:
        transcribed = json.load(fjson)

    # Lesson-part key (e.g. 'B1L03-1') that prefixes the matching IDs in mc_df.
    if 'B1-00-1' in filename:
        lesson_part = 'B1L00-1'
    else:
        # Raw string with an escaped dot so only a literal ".json" matches.
        m = re.match(r'B([1234])-(.*)-(.)-2\.json', os.path.basename(filename))
        assert m, filename
        lesson_part = 'B%sL%.2d-%s' % (m[1], int(m[2]), m[3])
    terms_df = mc_df[mc_df.ID.str.extract('^(B.L..-.)')[0] == lesson_part]
    terms = list(terms_df.Simplified)
    term_variants = list(terms_df.Variants.fillna(''))
    print('\n%s %s %s' % (filename, len(terms), terms))

    book_idx = 0            # last item number recognised in the recording
    book_idx_pref = ''      # first letter of the last "phrases"/"names" header
    term_idx = -1           # position in `terms` of the last matched term
    book_line_tokens = 0    # non-empty segments seen since the last item number
    book_line_matched = 0   # how many of those segments matched a term
    rec = 0                 # matched segments in this file

    for segment in transcribed['segments']:
        text = segment['text']
        # Skip a two-character segment starting with 生/声 before any item
        # number has been seen.
        if book_idx == 0 and len(text) == 2 and text[0] in '生声': continue

        text = text.replace('.', ' . ')
        prefixed = 0
        # Look for the next item number (digits or Chinese numeral) at the
        # start of the segment, tolerating up to two skipped numbers. No new
        # number is accepted while the current one has had no content yet.
        for skip in range(3):
            if book_idx > 0 and book_line_tokens == 0: break
            k = book_idx + 1 + skip
            if text.startswith(str(k)) or text.startswith(cn_numbers[k]):
                if book_idx > 0:
                    if book_line_matched == 0:
                        print('  [UNMATCHED; NEXT: %s]' % terms[term_idx+1:min(len(terms),term_idx+2)], end='')
                    print()
                print('%3d) %s; ' % (k, text), end='')
                book_idx = k
                book_line_tokens = 0
                book_line_matched = 0
                if text.startswith(str(k)):
                    pref = str(k)
                else:
                    pref = cn_numbers[k]
                # Drop the number and any separator following it.
                text = text[len(pref):].strip()
                text = re.sub('^[.,。;] *', '', text)
                prefixed = 1
                break

        # 'tracers' is treated as a mis-hearing of the "phrases" header.
        if text.lower() == 'tracers':
            text = 'phrases'
        # A section header restarts the item numbering under a new prefix.
        if re.match('(phrases|names)', text.lower().strip()):
            print('\n%s' % text)
            book_idx = 0
            book_idx_pref = text.upper()[0]
            continue

        if text:
            book_line_tokens += 1
            # Only the next five terms after the last match are candidates.
            cand_a = term_idx + 1
            cand_b = min(len(terms), term_idx + 6)
            # Spellings to try, in priority order: raw text, text without
            # trailing punctuation, OpenCC-converted text, manual corrections,
            # and finally homonyms from the pinyin index.
            variants = [text]
            variants.append(re.sub(' *[?!？！.。)]$', '', text))
            variants.append(opencc_tw2s.convert(text))
            variants.extend(transcription_extra.get(text, []))
            if term_idx+1 < len(terms):
                variants.extend(list(find_homonyms(text)))

            found = -1
            for var_i, variant in enumerate(variants):
                for j in range(cand_a, cand_b):
                    if variant == terms[j]:
                        found = j
                        break
                    if term_variants[j] != '':
                        for trad in term_variants[j].split(' / '):
                            # Keep only the part before any '[' in the variant.
                            trad = trad.split('[')[0].strip()
                            if variant == trad or variant == opencc_tw2s.convert(trad):
                                found = j
                                break
                    if found >= 0:
                        # The break above only leaves the variant-spelling
                        # loop; stop scanning here too so the earliest
                        # candidate wins instead of a later one overwriting it.
                        break
                if found >= 0:
                    break

            if found >= 0:
                term_idx = found
                print('%s{%d}' % (text, 1 + term_idx), end='; ')
                rec += 1
                book_line_matched += 1
                row = {
                    'ID': terms_df.ID.iloc[term_idx],
                    'Traditional': terms_df.Traditional.iloc[term_idx],
                    # Previously filled from the Traditional column by mistake.
                    'Simplified': terms_df.Simplified.iloc[term_idx],
                    'Pinyin':  terms_df.Pinyin.iloc[term_idx],
                    'BookIndex': book_idx_pref + str(book_idx),
                    'Source': os.path.basename(filename).replace('.json', '.mp3'),
                    'Start': segment['start'],
                    'End': segment['end'],
                    'Transcribed': text,
                    'Prefix': prefixed,
                }
                rec_terms.append(row)
                # Remembered on the segment so the .srt below can show the match.
                segment['rec'] = f"{row['ID']} {row['Traditional']} / {row['Simplified']} [{row['Pinyin']}]"
            else:
                print(text, end='; ')
    print('')

    print('%d/%d found' % (rec, len(terms)))
    tot_terms  += len(terms)
    tot_rec += rec

    # Subtitle-style dump of all segments. Times are written as plain seconds
    # with two decimals rather than through srt_timestamp(). The encoding is
    # explicit because the content is Chinese text.
    with open(filename.replace('.json', '.srt'), 'w', encoding='utf-8') as fsrt:
        for i, segment in enumerate(transcribed['segments']):
            #fsrt.write(f"{i+1}\n{srt_timestamp(segment['start'])} --> {srt_timestamp(segment['end'])}\n")
            fsrt.write('%d\n%.2f --> %.2f\n' % (i+1, segment['start'], segment['end']))
            fsrt.write(f"{segment['text'].replace('-->', '->').strip()}\n")
            if 'rec' in segment:
                fsrt.write('{%s}\n' % segment['rec'])
            fsrt.write('\n')

rec_df = pd.DataFrame(rec_terms)
rec_df.to_csv('whisper.tsv', sep='\t', index=False)

In [None]:
# Overall recognition summary across all transcripts.
missing = tot_terms - tot_rec
print('Total: %d/%d found, %d missing' % (tot_rec, tot_terms, missing))

In [None]:
# Tighten each recognised segment around its audible part and store the
# result as AdjStart / AdjEnd (seconds from the start of the source file).
MARGIN = 0.25  # padding in seconds kept on each side of the detected audio

wave_filename = ''

for row in rec_terms:
    # rec_terms was filled file by file, so the waveform only needs reloading
    # when the source changes.
    if row['Source'] != wave_filename:
        wave_filename = row['Source']
        wave, sr = librosa.load(f"downloads/shengzi/{row['Source']}", sr=None)

    seg = wave[int(row['Start']*sr):int(row['End']*sr)]
    # Defaults keep the Whisper segment unchanged.
    adj_start = 0
    adj_end = row['End'] - row['Start']

    # Despite the name, `sil` holds the non-silent intervals of the segment
    # (sample offsets relative to `seg`), measured against the peak of the
    # whole file rather than of the segment.
    sil = librosa.effects.split(seg, ref=np.max(wave), frame_length=5000, hop_length=1000)
    #print(row['ID'], len(sil), 'p=',row['Prefix'], '%.2f' % row['Start'], '\t', sil.tolist())

    if row['Prefix'] == 1:
        # The segment started with an item number: when there are several
        # audible intervals, the first is skipped and the second is kept.
        if len(sil) == 0:
            print('%s no segments\n' % row['ID'])
        elif len(sil) == 1:
            adj_start = sil[0][0]/sr - MARGIN
            adj_end = sil[0][1]/sr + MARGIN
        else:
            adj_start = sil[1][0]/sr - MARGIN
            adj_end = sil[1][1]/sr + MARGIN
    else:
        if len(sil) > 0:
            adj_start = sil[0][0]/sr - MARGIN
            adj_end = sil[0][1]/sr + MARGIN

    # Subtracting MARGIN can push the start before the beginning of the file;
    # clamp at 0 so the value passed to ffmpeg's -ss is never negative.
    row['AdjStart'] = max(0.0, row['Start'] + adj_start)
    row['AdjEnd'] = row['Start'] + adj_end

rec_df = pd.DataFrame(rec_terms)
rec_df.to_csv('whisper.csv', index=False, float_format='%.2f')

In [None]:
import os, shutil
import pandas as pd

!mkdir -p data/audio

for row in pd.read_csv('whisper.csv', dtype='str').itertuples():
    dst = f'data/audio/modernchinese-{row.ID}.mp3'
    assert os.path.exists(f'downloads/shengzi/{row.Source}')
    cmd = f"ffmpeg -v error -i 'downloads/shengzi/{row.Source}' -ss {row.AdjStart} -to {row.AdjEnd} -c copy -vn -sn -dn -y '{dst}'"
    print(cmd)
    ret = os.system(cmd)
    assert ret == 0, cmd

In [None]:
import random

# Re-transcribe every extracted clip, priming Whisper with a shuffled glossary
# of the five most recent headwords (the current one included).
# NOTE(review): the result is stored under 'PostTranscription2', while the
# flagging cell below reads 'PostTranscription' and discards
# 'PostTranscription2'. Nothing in the visible source assigns
# 'PostTranscription' - confirm which key is intended.
recent = []
for row in rec_terms:
    # Headword only: drop everything from the first '/' or '（' onwards.
    headword = re.sub('[/（].*', '', row['Traditional']).strip()
    recent = (recent + [headword])[-5:]
    dst = f"data/audio/modernchinese-{row['ID']}.mp3"
    glossary = list(recent)
    random.shuffle(glossary)
    prompt = 'Glossary: ' + ', '.join(glossary) + '. 繁體中文'
    res = model.transcribe(dst, initial_prompt=prompt, prepend_punctuations='', append_punctuations='', language='zh')
    # Strip punctuation before storing the text.
    text = re.sub('[。？！/?!.,（）]', '', res['text'])
    row['PostTranscription2'] = text

In [None]:
# Flag every clip whose re-transcription does not equal one of the accepted
# spellings of its Traditional entry, then rewrite whisper.csv.
k = 0
for row in rec_terms:
    heard = row['PostTranscription']
    trad = row['Traditional']
    accepted = {
        trad,                                # the entry as written
        *trad.split('/'),                    # any '/'-separated alternative
        re.sub('[（） ]|/.*', '', trad),      # brackets/spaces and '/...' removed
        re.sub('（.*）', '', trad),           # bracketed part removed
    }
    # (A homonym-based acceptance rule exists only as a disabled comment in
    # the original cell and is not applied.)
    row['Flagged'] = 0 if heard in accepted else 1
    k += row['Flagged']
    row.pop('PostTranscription2', None)

print('%d flagged' % k)

rec_df = pd.DataFrame(rec_terms)
# Take Simplified from the master table, looked up by ID.
rec_df.Simplified = list(mc_df.set_index('ID').loc[rec_df.ID].Simplified)
rec_df.to_csv('whisper.csv', index=False, float_format='%.2f')

In [None]:
# Preview the first rows of the file just written.
pd.read_csv('whisper.csv').head()

In [None]:
# Join the master term table with the clip data from whisper.csv on ID and
# save the result as audio.csv.
term_cols = mc_df[['ID', 'Traditional', 'Simplified', 'Pinyin']].set_index('ID')
clip_cols = pd.read_csv('whisper.csv').set_index('ID')[[
    'BookIndex', 'Source', 'AdjStart', 'AdjEnd', 'PostTranscription', 'Flagged'
]]
audio_df = pd.concat([term_cols, clip_cols], axis=1)

# OK is 1 only for rows that have a clip and were not flagged, else 0.
ok_mask = audio_df.Flagged.notnull() & (audio_df.Flagged == 0)
audio_df['OK'] = [int(ok) for ok in ok_mask]
audio_df = audio_df.rename(columns={'AdjStart': 'Start', 'AdjEnd': 'End', 'PostTranscription': 'Transcribed'})
audio_df = audio_df.drop(columns=['Flagged'])

# Give every row the Source of its lesson part (the first one found in that
# group), so rows without a clip also know their recording.
pp = audio_df.index.str.extract('(B.L..-[0-9])')[0]
source_by_part = audio_df.reset_index().groupby(pp).Source.first().to_dict()
audio_df['Source'] = list(pp.map(source_by_part))
audio_df.to_csv('audio.csv')
audio_df

In [None]:
# IDs listed in the first column of sel.txt ('#' lines are comments), then the
# subset of them that is currently marked OK == 1 in audio_df.
sel_df = pd.read_csv('sel.txt', sep='\t', comment='#', header=None)
anki_flagged = sorted(set(sel_df[0]))
len(anki_flagged)
audio_df.loc[anki_flagged][lambda X: X.OK == 1]

In [None]:
# Number of rows in audio_df whose OK value is 0.
sum(audio_df.OK == 0)

In [None]:
import os, pandas as pd

!mkdir -p data/audio
for row in pd.read_csv('audio.csv', dtype='str').fillna('').itertuples():
    dst = f'data/audio/modernchinese-{row.ID}.mp3'
    assert os.path.exists(f'downloads/shengzi/{row.Source}')
    if row.Start:
        cmd = f"ffmpeg -v error -i 'downloads/shengzi/{row.Source}' -ss {row.Start} -to {row.End} -c copy -vn -sn -dn -y '{dst}'"
        assert os.system(cmd) == 0, cmd

## Whisper session 2

In [None]:
# Widen segments: snap every accepted clip (OK == '1') to the full non-silent
# interval(s) it overlaps in the source recording, plus MARGIN on both sides.
# Rows changed by more than 0.1 s are marked OK = '2'. The result is written
# to audio2.csv.

MARGIN = 0.25
wave_filename = ''

audio_df = pd.read_csv('audio.csv', dtype='str').set_index('ID')

for row in audio_df.reset_index().to_dict(orient='records'):
    if row['OK'] != '1':
        continue

    # Load the waveform and its non-silent intervals once per source file.
    if row['Source'] != wave_filename:
        wave_filename = row['Source']
        print(wave_filename)
        wave, sr = librosa.load(f"downloads/shengzi/{row['Source']}", sr=None)
        splits = librosa.effects.split(wave, frame_length=5000, hop_length=1000).tolist()

    # Clip boundaries in samples.
    t1, t2 = float(row['Start']), float(row['End'])
    t1, t2 = int(t1*sr), int(t2*sr)
    # Every non-silent interval that touches or overlaps [t1, t2].
    match = [s for s in splits if t1 <= s[0] <= t2 or t1 <= s[1] <= t2 or s[0] <= t1 <= s[1] or s[0] <= t2 <= s[1]]
    if not match:
        # No audible interval overlaps the clip: report it and leave the row
        # untouched. Indexing match[0] below would raise IndexError.
        print('No splits: ', row, t1, t2)
        continue
    if len(match) > 1:
        print('Multiple splits: ', row, t1, t2, match)

    # Span from the first overlapping interval's start to the last one's end.
    a = '%.2f' % max(0.0, match[0][0]/sr - MARGIN)
    b = '%.2f' % max(0.0, match[-1][1]/sr + MARGIN)

    if abs(float(row['Start']) - float(a)) > 0.1 or abs(float(row['End']) - float(b)) > 0.1:
        print(row['ID'], row['Traditional'], row['Pinyin'], row['Start'], row['End'], '-->', a, b)
        audio_df.loc[row['ID'], 'Start'] = a
        audio_df.loc[row['ID'], 'End'] = b
        audio_df.loc[row['ID'], 'OK'] = '2'

audio_df.to_csv('audio2.csv')

In [None]:
audio_df = pd.read_csv('audio.csv').set_index('ID')

records = audio_df.reset_index().to_dict(orient='records')
id_to_range = {}

for i, row in enumerate(records):
    t1 = 0
    j = i
    while j > 0 and records[j-1]['Source'] == records[j]['Source']:
        j -= 1
        if records[j]['OK'] == 1:
            t1 = records[j]['End'] + 0.1
            break

    t2 = 9999
    j = i
    while j+1 < len(records) and records[j+1]['Source'] == records[j]['Source']:
        j += 1
        if records[j]['OK'] == 1:
            t2 = records[j]['Start'] - 0.1
            break

    id_to_range[row['ID']] = (t1, t2)

!mkdir -p downloads/shengzi-splits
splits_mp = {}

In [None]:
audio_df = pd.read_csv('audio.csv').set_index('ID')

MARGIN = 0.25
wave_filename = ''

# For every term still at OK == 0, cut each non-silent interval that falls in
# the term's search window into its own file and register the term as a
# candidate owner of that split in splits_mp.
for row in audio_df.reset_index().to_dict(orient='records'):
    if row['OK'] != 0:
        continue

    source = row['Source']
    if source != wave_filename:
        wave_filename = source
        wave, sr = librosa.load(f"downloads/shengzi/{row['Source']}", sr=None)
        splits = librosa.effects.split(wave, top_db=40, frame_length=2000, hop_length=1000).tolist()

    # Search window from id_to_range, widened by 2 s before and 30 s after,
    # converted to samples.
    lo, hi = id_to_range[row['ID']]
    t1, t2 = int((lo - 2)*sr), int((hi + 30)*sr)
    cand_splits = [s for s in splits if t1 <= s[0] <= t2 or t1 <= s[1] <= t2 or s[0] <= t1 <= s[1] or s[0] <= t2 <= s[1]]

    for first, last in cand_splits:
        # Padded boundaries in seconds, as the strings used in the file name.
        a = '%.2f' % max(0.0, first/sr - MARGIN)
        b = '%.2f' % max(0.0, last/sr + MARGIN)
        dst = f"downloads/shengzi-splits/{row['Source']}:{a}:{b}.mp3"
        if not os.path.exists(dst):
            cmd = f"ffmpeg -v error -i 'downloads/shengzi/{row['Source']}' -ss {a} -to {b} -c copy -vn -sn -dn -y '{dst}'"
            assert os.system(cmd) == 0, cmd

        entry = splits_mp.setdefault(dst, {'Split': dst, 'Source': row['Source'], 'a': a, 'b': b, 'ids': []})
        entry['ids'].append(row['ID'])

In [None]:
import random
mc_df = pd.read_csv('modernchinese.tsv', sep='\t').set_index('ID').fillna('')

# Store each split's candidate IDs as one space-separated string. Entries that
# were already converted are left alone, so the cell can be re-run.
for split in splits_mp.values():
    ids = split['ids']
    if type(ids) is list:
        split['ids'] = ' '.join(ids)

# Transcribe every split that has no transcription yet, priming Whisper with a
# shuffled glossary of its candidate terms.
for split_filename, split in splits_mp.items():
    if 'Transcribed' in split:
        continue

    terms = []
    for term_id in split['ids'].split():
        variants_field = mc_df.loc[term_id, 'Variants']
        if variants_field:
            # First whitespace-separated token of every ' / '-separated variant.
            terms.extend(variant.split()[0] for variant in variants_field.split(' / '))
        else:
            terms.append(mc_df.loc[term_id, 'Traditional'])
    random.shuffle(terms)
    prompt = 'Glossary: ' + ', '.join(terms) + '. 繁體中文'
    res = model.transcribe(split_filename, initial_prompt=prompt, prepend_punctuations='', append_punctuations='', language='zh')
    text = re.sub('[。？！/?!.,（）]', '', res['text'])
    print(split, prompt); print(text)
    split['Transcribed'] = text
    # Rewritten after every split so progress is saved as the loop runs.
    pd.DataFrame(list(splits_mp.values())).to_csv('whisper-splits.csv', index=False)

In [None]:
# Assign a split to a still-unresolved term (OK == '0') when the split's
# transcription equals the term's Traditional form or appears as a whole
# space-delimited token in its Variants. Resolved rows get OK = '3'.
audio_df = pd.read_csv('audio.csv', dtype='str').fillna('').set_index('ID')
mc_df = pd.read_csv('modernchinese.tsv', sep='\t').fillna('').set_index('ID')

for split_filename in sorted(splits_mp.keys()):
    split = splits_mp[split_filename]
    heard = split['Transcribed']
    if not heard.strip():
        # An empty transcription would pass the Variants test below for every
        # term whose Variants field is empty ('  ' in '  '), wrongly marking
        # the term as resolved with this clip.
        continue
    for term_id in split['ids'].split():
        if audio_df.loc[term_id, 'OK'] != '0':
            continue
        if (heard == audio_df.loc[term_id, 'Traditional'] or
                ' '+heard+' ' in ' '+mc_df.loc[term_id, 'Variants']+' '):
            audio_df.loc[term_id, 'Transcribed'] = heard
            audio_df.loc[term_id, 'Start'] = split['a']
            audio_df.loc[term_id, 'End'] = split['b']
            assert audio_df.loc[term_id, 'Source'] == split['Source']
            audio_df.loc[term_id, 'OK'] = '3'

audio_df.to_csv('audio.csv')
# Number of terms still unresolved.
len(audio_df[audio_df.OK == '0'])