In [1]:
import os
from os.path import join
import numpy as np
import pandas as pd
import jieba
from bopomofo.main import trans_sentense

np.random.seed(42)
DATA_DIR = '/home/jiazhi/Dataset/common-voice_zh-TW_43h_2019-06-12'

In [2]:
def kaldi_gender(gender):
    ''' Alter CommonVoice gender into kaldi format. '''
    if gender == 'male':
        return 'm'
    elif gender == 'female':
        return 'f'
    else:
        return 'm'
    
def path2utt(row):
    ''' Convert audio path into utterance id. '''
    utt_id = row.path[:-4].split('_')[-1]
    prefix = row.spk_id
    return f'{prefix}_{utt_id}'

def is_chinese(char):
    ''' Check if character is chinese. '''
    return u'\u4e00' <= char <= u'\u9fff'

def fix_char(sent):
    ''' Fix unusual chinese characters. '''
    sent = sent.replace('内', '內')
    sent = sent.replace('爲', '為')
    sent = sent.replace('柺', '拐')
    sent = sent.replace('庄', '莊')
    sent = sent.replace('麽', '麼')
    sent = sent.replace('污', '汙')
    sent = sent.replace('値', '值')
    return sent
    
def jieba_cut(sent):
    ''' Chinese segmentation with punctuations removed. '''
    return [c for c in jieba.cut(sent) if is_chinese(c)]

def contains_no_eng(text):
    ''' Check if text contains no english. '''
    for char in text:
        if 'a' <= char <= 'z' or \
           'A' <= char <= 'Z' or \
           u'\uff21' <= char <= u'\uff3a' or \
           u'\uff41' <= char <= u'\uff5a':
            return False
    return True

def zhuyin2phones(zhuyin, use_tone, sep):
    ''' Convert zhuyin of a charcter to phonemes. '''
    if any(is_chinese(c) for c in zhuyin):
        return zhuyin
    if zhuyin[0] == u'\u02d9':  # Neutral(fifth) tone 
        phones = sep.join([c for c in zhuyin][1:])
        tone = zhuyin[0]
    else:
        phones = sep.join([c for c in zhuyin][:-1])
        tone = zhuyin[-1]
    if use_tone:
        phones = f'{phones}{tone}'
    return phones

def fix_phones(phones, use_tone):
    ''' Fix broken phonemes. '''
    phones = phones.replace('一', 'ㄧ')
    if use_tone:
        phones = phones.replace('勳', 'ㄒ ㄩ ㄣ-')
        phones = phones.replace('艷', 'ㄧ ㄢˋ')
        phones = phones.replace('曬', 'ㄕ ㄞˋ')
    else:
        phones = phones.replace('勳', 'ㄒ ㄩ ㄣ')
        phones = phones.replace('艷', 'ㄧ ㄢ')
        phones = phones.replace('曬', 'ㄕ ㄞ')
    return phones

def word2phones(word, use_tone, sep=' '):
    ''' Convert a chinese word to zhuyin phonemes. '''
    if word == '曬':  # special case
        if use_tone:
            return 'ㄕ ㄞˋ'
        else:
            return 'ㄕ ㄞ'
    zhuyins = trans_sentense(word).split()
    phones = ' '.join([zhuyin2phones(z, use_tone, sep) for z in zhuyins])
    phones = fix_phones(phones, use_tone)
    return phones

def sent2phones(cut_sent, use_tone):
    ''' Convert a segmented sentence into phonemes. '''
    cut_phones, pos = [], 0
    sent = ''.join(cut_sent)
    phones = word2phones(sent, use_tone, sep='').split()
    for word in cut_sent:
        word_len = len(word)
        word_phones = ' '.join(phones[pos:pos + word_len])
        cut_phones.append(word_phones)
        pos += word_len
    return cut_phones

In [3]:
# Create directory structures
try:
    os.mkdir('../data')
    os.mkdir('../data/train')
    os.mkdir('../data/test')
    os.mkdir('../data/local')
    os.mkdir('../data/local/dict')
except:
    pass

In [4]:
# Configuration
use_tone = False
jieba.set_dictionary('dict.txt.big')
jieba.initialize()

Building prefix dict from /home/jiazhi/Workshop/common-voice-zh-tw/scripts/dict.txt.big ...
Loading model from cache /tmp/jieba.ufe63d437a4894b8c7d3a0a0158031718.cache
Loading model cost 0.882 seconds.
Prefix dict has been built succesfully.


In [5]:
# Read information of validated audios
full_tsv = join(DATA_DIR, 'validated.tsv')
full_df = pd.read_csv(full_tsv, sep='\t')

# Exclude audios with english
full_df = full_df[full_df.sentence.apply(contains_no_eng)]

In [6]:
print(f'full_df.shape: {full_df.shape}')
full_df.head()

full_df.shape: (40608, 8)


Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,0d70b77b4bfb288e0414fe7ee3b1b01a299a0c28a5b321...,common_voice_zh-TW_18500863.mp3,在黑暗中進行,2,0,,,
1,27bfe1c6426a4c18981b1d0628978660d11c8e8b4f3a4c...,common_voice_zh-TW_17850420.mp3,報名費太貴了,2,1,,,
2,29538175d3522a88a07183782b856ed7aff10011ff1706...,common_voice_zh-TW_18386149.mp3,不然肯定骨折,2,0,,,
3,2d29001b46ce5d4e81355bec8fa3fc9b54e7ac47c655a8...,common_voice_zh-TW_17636629.mp3,他需要是自己決定要去哪裡,2,0,,,
4,31e95e153e298d4f555648ea62d1032cf077801bd1f31c...,common_voice_zh-TW_18501281.mp3,我曾經擁有過,2,0,,,


# Acoustic Model

In [7]:
%%time
# Prepare spk_id, gender and utt_id for all audios
client_spk = full_df[['client_id']].drop_duplicates()
client_spk['spk_id'] = range(1, 1 + len(client_spk))
client_spk.spk_id = client_spk.spk_id.apply(lambda x: str(x).zfill(3))
full_df = full_df.merge(client_spk)
full_df.gender = full_df.gender.apply(kaldi_gender)
full_df['utt_id'] = full_df.apply(path2utt, axis=1)

# Fix unusual character and do text segmentation
full_df.sentence = full_df.sentence.apply(fix_char)
full_df.sentence = full_df.sentence.apply(jieba_cut)

# Drop useless columns
drop_columns = ['client_id', 'up_votes', 'down_votes', 'age', 'accent']
full_df.drop(columns=drop_columns, inplace=True)

CPU times: user 2.24 s, sys: 24.8 ms, total: 2.27 s
Wall time: 2.28 s


In [8]:
# Split processed dataset to train/test set
train_idx = np.random.rand(len(full_df)) < 0.8
train_df = full_df[train_idx]
test_df = full_df[~train_idx]

# Sort train/test set by utt_id for kaldi-mfcc
train_df = train_df.sort_values(by='utt_id')
test_df = test_df.sort_values(by='utt_id')

In [9]:
print(f'train_df.shape: {train_df.shape}')
print(f'test_df.shape:  {test_df.shape}')
test_df.head()

train_df.shape: (32523, 5)
test_df.shape:  (8085, 5)


Unnamed: 0,path,sentence,gender,spk_id,utt_id
1,common_voice_zh-TW_17850420.mp3,"[報名費, 太貴, 了]",m,2,002_17850420
7,common_voice_zh-TW_17585443.mp3,"[講座, 間, 禁止, 錄音, 錄影]",m,8,008_17585443
11,common_voice_zh-TW_18500712.mp3,"[勞動, 底層]",m,12,012_18500712
12,common_voice_zh-TW_17580966.mp3,"[畢竟, 他們, 是, 借住, 在, 別人, 屋簷, 之下]",m,13,013_17580966
33,common_voice_zh-TW_17589393.mp3,"[我們, 就, 沒, 辦法, 改善]",m,31,031_17589393


### Write files for kaldi usage

In [10]:
%%time
# Write acoustic data of train/test set
for split, df in zip(['train', 'test'], [train_df, test_df]):
    
    # spk2gender
    spk2gender = df[['spk_id', 'gender']].drop_duplicates()
    spk2gender = spk2gender.sort_values(by='spk_id')
    with open(join('../data', split, 'spk2gender'), 'w', encoding='UTF-8') as f:
        for _, row in spk2gender.iterrows():
            f.write(f'{row.spk_id} {row.gender}\n')
            
    # wav.scp
    with open(join('../data', split, 'wav.scp'), 'w', encoding='UTF-8') as f:
        for _, row in df.iterrows():
            mp3_path = join(DATA_DIR, 'clips', row.path)
            f.write(f'{row.utt_id} sox {mp3_path} -t wav -r 16000 - |\n')
    
    # text
    with open(join('../data', split, 'text'), 'w', encoding='UTF-8') as f:
        for _, row in df.iterrows():
            text = ' '.join(row.sentence)
            f.write(f'{row.utt_id} {text}\n')
            
    # utt2spk
    with open(join('../data', split, 'utt2spk'), 'w', encoding='UTF-8') as f:
        for _, row in df.iterrows():
            f.write(f'{row.utt_id} {row.spk_id}\n')

CPU times: user 11.9 s, sys: 8.35 ms, total: 11.9 s
Wall time: 11.9 s


# Language Model

In [11]:
# Build corpus to train language model
corpus = full_df.sentence.apply(' '.join).tolist()

# Build lexicon with zhuyin package
sents = full_df.sentence.tolist()
sents_phones = [sent2phones(sent, use_tone) for sent in sents]
lexicon = set()
for sent, sent_phones in zip(sents, sents_phones):
    for i, zhuyins in enumerate(sent_phones):
        phonemes = []
        for zhuyin in zhuyins.split():
            if use_tone:
                phones = [c for c in zhuyin[:-2]]
                phones.append(zhuyin[-2:])
            else:
                phones = [c for c in zhuyin]
            phones = ' '.join(phones)
            phonemes.append(phones)
        phonemes = ' '.join(phonemes)
        sent_phones[i] = phonemes
    lexicon.update(zip(sent, sent_phones))
    
# Build phone set from lexicon
phone_set = set()
for _, phones in lexicon:
    phone_set.update(phones.split())

In [12]:
corpus

['在 黑暗 中 進行',
 '報名費 太貴 了',
 '不然 肯定 骨折',
 '他 需要 是 自己 決定 要 去 哪裡',
 '我 曾經 擁有 過',
 '他們 聚在一起',
 '再用 補助',
 '講座 間 禁止 錄音 錄影',
 '不 做 又 不行',
 '你 慢慢 看',
 '繞回 公車站',
 '勞動 底層',
 '畢竟 他們 是 借住 在 別人 屋簷 之下',
 '留職停薪 復職 者',
 '有 若干 事項 需要 討論',
 '在 報告書 中 加以 說明',
 '妳 回來 了',
 '昨日 回報',
 '叫做 天佑 花蓮',
 '其他人 看到 你',
 '從 第一座 開始 慢慢 努力',
 '現在 提出 二零 五零 的 計畫',
 '最 簡單 的 方式 就是',
 '中午 都 逛 完 了 才 找到',
 '黃土 水 畢業 自 師範 科',
 '待會 去 吃 冰',
 '我 沒有 想過',
 '小 的 果然 太弱',
 '最後 還說 謝謝 大家 的 配合',
 '一直 到 了 前兩天',
 '不料 好景不常',
 '修理 好會 再 出來',
 '近三年 離職 率 一直 在 降',
 '我們 就 沒 辦法 改善',
 '純粹 休息 時間 一到 就 出發',
 '傻 啊 擔心 什麼',
 '修理 好會 再 出來',
 '難道 以後 我 出門',
 '面對 迷惘 時 重要 的 課題',
 '看見 台灣 近代 美術 的 啟蒙',
 '這就 叫 人 為 因素 操作 不當',
 '請出 示 身份證 或 學生證',
 '柿子 也 不錯',
 '下次 給我 注意 一點',
 '差異 並 沒有 大到 你 想像 的 那樣',
 '一棟 房 不過 十二三萬',
 '先 在 遊客 中心 休息',
 '是 金牌 熊 讚 還是 海洋 熊 讚',
 '我 已經 找 你們 找 了 好久',
 '藍蔭鼎出 生於 宜蘭 羅東',
 '好好 的 來 檢查 你 的 身體',
 '我 在 找 飲水機',
 '回家 後 才 開始 煮飯',
 '看到 有 摩斯',
 '正巧 母親 往外 探頭',
 '開到 站 有 位子 坐',
 '我 沒 辦法 相信 你',
 '你 是 什麼 人',
 '也許 是 因為 後悔',
 '返校日 真的 拍得 很 有趣',
 '在 病床 上 劇烈 扭曲 著',
 '注意 

In [13]:
lexicon

{('搶人', 'ㄑ ㄧ ㄤ- ㄖ ㄣˊ'),
 ('撿', 'ㄐ ㄧ ㄢˇ'),
 ('迷惘', 'ㄇ ㄧˊ ㄨ ㄤˇ'),
 ('蟲叫', 'ㄔ ㄨ ㄥˊ ㄐ ㄧ ㄠˋ'),
 ('重返', 'ㄔ ㄨ ㄥˊ ㄈ ㄢˇ'),
 ('想成', 'ㄒ ㄧ ㄤˇ ㄔ ㄥˊ'),
 ('功勞', 'ㄍ ㄨ ㄥ- ㄌ ㄠˊ'),
 ('這裡會', 'ㄓ ㄜˋ ㄌ ㄧˇ ㄏ ㄨ ㄟˋ'),
 ('淒涼', 'ㄑ ㄧ- ㄌ ㄧ ㄤˊ'),
 ('過不去', 'ㄍ ㄨ ㄛˋ ㄅ ㄨˋ ㄑ ㄩˋ'),
 ('一台', 'ㄧ- ㄊ ㄞ-'),
 ('講了', 'ㄐ ㄧ ㄤˇ ㄌ ㄧ ㄠˇ'),
 ('信', 'ㄒ ㄧ ㄣˋ'),
 ('自造', 'ㄗˋ ㄘ ㄠˋ'),
 ('伴', 'ㄅ ㄢˋ'),
 ('時程', 'ㄕˊ ㄔ ㄥˊ'),
 ('河流', 'ㄏ ㄜˊ ㄌ ㄧ ㄡˊ'),
 ('轉化', 'ㄓ ㄨ ㄢˇ ㄏ ㄨ ㄚˋ'),
 ('而已', 'ㄦˊ ㄧˇ'),
 ('真得', 'ㄓ ㄣ- ㄉ ㄟˇ'),
 ('相當於', 'ㄒ ㄧ ㄤ- ㄉ ㄤ- ㄩˊ'),
 ('安排', 'ㄢ- ㄆ ㄞˊ'),
 ('馬上', 'ㄇ ㄚˇ ㄕ ㄤˋ'),
 ('衣角', 'ㄧ- ㄐ ㄧ ㄠˇ'),
 ('料理', 'ㄌ ㄧ ㄠˋ ㄌ ㄧˇ'),
 ('累', 'ㄌ ㄟˋ'),
 ('書局', 'ㄕ ㄨ- ㄐ ㄩˊ'),
 ('台中市', 'ㄊ ㄞ- ㄓ ㄨ ㄥ- ㄕˋ'),
 ('講講', 'ㄐ ㄧ ㄤˇ ㄐ ㄧ ㄤˇ'),
 ('逃脫', 'ㄊ ㄠˊ ㄊ ㄨ ㄛ-'),
 ('酷航', 'ㄎ ㄨˋ ㄏ ㄤˊ'),
 ('長', 'ㄔ ㄤˊ'),
 ('北洋', 'ㄅ ㄟˇ ㄧ ㄤˊ'),
 ('生物科技', 'ㄕ ㄥ- ㄨˋ ㄎ ㄜ- ㄐ ㄧˋ'),
 ('於是', 'ㄩˊ ㄕˋ'),
 ('關機', 'ㄍ ㄨ ㄢ- ㄐ ㄧ-'),
 ('法官', 'ㄈ ㄚˇ ㄍ ㄨ ㄢ-'),
 ('告訴', 'ㄍ ㄠˋ ㄙ ㄨˋ'),
 ('母親', 'ㄇ ㄨˇ ㄑ ㄧ ㄣ-'),
 ('萬和路', 'ㄨ ㄢˋ ㄏ ㄜˊ ㄌ ㄨˋ'),
 ('飲食', 'ㄧ ㄣˇ ㄕˊ'),
 ('明白', 'ㄇ ㄧ ㄥˊ ㄅ ㄞ˙'),
 ('內涵', 'ㄋ ㄟˋ ㄏ ㄢˊ'),
 ('不像', 'ㄅ ㄨˋ 

In [14]:
phone_set

{'ㄅ',
 'ㄆ',
 'ㄇ',
 'ㄈ',
 'ㄉ',
 'ㄊ',
 'ㄋ',
 'ㄌ',
 'ㄍ',
 'ㄎ',
 'ㄏ',
 'ㄐ',
 'ㄑ',
 'ㄒ',
 'ㄓ',
 'ㄓ-',
 'ㄓˇ',
 'ㄓˊ',
 'ㄓˋ',
 'ㄔ',
 'ㄔ-',
 'ㄔˇ',
 'ㄔˊ',
 'ㄔˋ',
 'ㄕ',
 'ㄕ-',
 'ㄕˇ',
 'ㄕˊ',
 'ㄕˋ',
 'ㄕ˙',
 'ㄖ',
 'ㄖˋ',
 'ㄗ',
 'ㄗ-',
 'ㄗˇ',
 'ㄗˋ',
 'ㄗ˙',
 'ㄘ',
 'ㄘˇ',
 'ㄘˊ',
 'ㄘˋ',
 'ㄙ',
 'ㄙ-',
 'ㄙˇ',
 'ㄙˋ',
 'ㄙ˙',
 'ㄚ-',
 'ㄚˇ',
 'ㄚˊ',
 'ㄚˋ',
 'ㄚ˙',
 'ㄛ-',
 'ㄛˇ',
 'ㄛˊ',
 'ㄛˋ',
 'ㄛ˙',
 'ㄜ-',
 'ㄜˇ',
 'ㄜˊ',
 'ㄜˋ',
 'ㄜ˙',
 'ㄝ-',
 'ㄝˇ',
 'ㄝˊ',
 'ㄝˋ',
 'ㄝ˙',
 'ㄞ-',
 'ㄞˇ',
 'ㄞˊ',
 'ㄞˋ',
 'ㄞ˙',
 'ㄟ-',
 'ㄟˇ',
 'ㄟˊ',
 'ㄟˋ',
 'ㄟ˙',
 'ㄠ-',
 'ㄠˇ',
 'ㄠˊ',
 'ㄠˋ',
 'ㄠ˙',
 'ㄡ-',
 'ㄡˇ',
 'ㄡˊ',
 'ㄡˋ',
 'ㄡ˙',
 'ㄢ-',
 'ㄢˇ',
 'ㄢˊ',
 'ㄢˋ',
 'ㄢ˙',
 'ㄣ-',
 'ㄣˇ',
 'ㄣˊ',
 'ㄣˋ',
 'ㄣ˙',
 'ㄤ-',
 'ㄤˇ',
 'ㄤˊ',
 'ㄤˋ',
 'ㄤ˙',
 'ㄥ-',
 'ㄥˇ',
 'ㄥˊ',
 'ㄥˋ',
 'ㄥ˙',
 'ㄦˇ',
 'ㄦˊ',
 'ㄦˋ',
 'ㄧ',
 'ㄧ-',
 'ㄧˇ',
 'ㄧˊ',
 'ㄧˋ',
 'ㄧ˙',
 'ㄨ',
 'ㄨ-',
 'ㄨˇ',
 'ㄨˊ',
 'ㄨˋ',
 'ㄨ˙',
 'ㄩ',
 'ㄩ-',
 'ㄩˇ',
 'ㄩˊ',
 'ㄩˋ',
 'ㄩ˙'}

### Write for kaldi usage

In [15]:
# corpus.txt
with open('../data/local/corpus.txt', 'w', encoding='UTF-8') as f:
    for sent in corpus:
        f.write(f'{sent}\n')
        
# lexicon.txt
with open('../data/local/dict/lexicon.txt', 'w', encoding='UTF-8') as f:
    f.write('!SIL sil\n')
    f.write('<UNK> spn\n')
    for word, phones in lexicon:
        f.write(f'{word} {phones}\n')
        
# nonsilence_phones.txt
with open('../data/local/dict/nonsilence_phones.txt', 'w', encoding='UTF-8') as f:
    for phone in phone_set:
        f.write(f'{phone}\n')

# silence_phones.txt
with open('../data/local/dict/silence_phones.txt', 'w', encoding='UTF-8') as f:
    f.write('sil\nspn\n')
    
# optional_silence.txt
with open('../data/local/dict/optional_silence.txt', 'w', encoding='UTF-8') as f:
    f.write('sil\n')