In [1]:
import os
from os.path import join
import pandas as pd
import jieba
from bopomofo.main import trans_sentense

# Directory holding train.tsv, test.tsv and the clips/ folder.
# Set the COMMON_VOICE_DIR environment variable to use another location;
# the original hard-coded path is kept as the fallback.
DATA_DIR = os.environ.get(
    'COMMON_VOICE_DIR',
    '/home/jiazhi/Dataset/common-voice_zh-TW_43h_2019-06-12',
)

In [2]:
def kaldi_gender(gender):
    ''' Map a CommonVoice gender label onto a single-letter code.

    Only the exact string 'female' gives 'f'; 'male' and every other value
    (other labels, missing values) give 'm'.
    '''
    return 'f' if gender == 'female' else 'm'
    
def path2utt(row):
    ''' Build the utterance id "<spk_id>_<clip suffix>" for one metadata row.

    The clip suffix is whatever follows the last underscore of row.path once
    its final four characters (e.g. ".mp3") have been cut off.
    '''
    stem = row.path[:-4]
    clip_suffix = stem.rsplit('_', 1)[-1]
    return f'{row.spk_id}_{clip_suffix}'

def is_chinese(char):
    ''' Tell whether char sorts inside the range U+4E00..U+9FFF.

    This is a plain string comparison, so a multi-character string is
    compared lexicographically rather than character by character.
    '''
    lowest, highest = u'\u4e00', u'\u9fff'
    return lowest <= char and char <= highest

def fix_char(sent):
    ''' Swap a fixed list of variant Chinese characters for their replacements. '''
    # (variant, replacement) pairs, applied in this order.
    replacements = (
        ('内', '內'),
        ('爲', '為'),
        ('柺', '拐'),
        ('庄', '莊'),
        ('麽', '麼'),
        ('污', '汙'),
        ('値', '值'),
    )
    for variant, replacement in replacements:
        sent = sent.replace(variant, replacement)
    return sent
    
def jieba_cut(sent):
    ''' Segment sent with jieba, keeping only the tokens that pass is_chinese. '''
    return list(filter(is_chinese, jieba.cut(sent)))

def contains_no_eng(text):
    ''' Return True when text holds no ASCII or full-width Latin letter. '''
    letter_ranges = (
        ('a', 'z'),
        ('A', 'Z'),
        (u'\uff21', u'\uff3a'),  # full-width capital letters
        (u'\uff41', u'\uff5a'),  # full-width small letters
    )
    return not any(
        low <= char <= high
        for char in text
        for low, high in letter_ranges
    )

def zhuyin2phones(zhuyin, use_tone, sep):
    ''' Convert the zhuyin of one character into phonemes joined by sep.

    Input that still holds a Chinese character is handed back untouched.
    The tone mark is read from the front when it is the neutral-tone dot
    (U+02D9) and from the end otherwise; it is appended to the result only
    when use_tone is true.
    '''
    for symbol in zhuyin:
        if is_chinese(symbol):
            return zhuyin
    neutral_tone = zhuyin[0] == u'\u02d9'  # Neutral(fifth) tone is written first
    tone = zhuyin[0] if neutral_tone else zhuyin[-1]
    symbols = zhuyin[1:] if neutral_tone else zhuyin[:-1]
    phones = sep.join(symbols)
    return f'{phones}{tone}' if use_tone else phones

def fix_phones(phones, use_tone, sep=' '):
    ''' Fix broken phonemes.

    Replaces the Chinese characters that may still be present in a phoneme
    string (presumably ones the zhuyin converter could not handle) with
    their phonemes.

    Args:
        phones: phoneme string, possibly still containing Chinese characters.
        use_tone: when true, the substituted phonemes carry their tone mark.
        sep: separator between the phonemes of one character. It must match
            the separator used to build `phones`; otherwise a whitespace
            split yields several tokens for one character. Defaults to ' ',
            which reproduces the previous output.

    Returns:
        The phoneme string with the known characters replaced.
    '''
    # The numeral U+4E00 stands in for the look-alike zhuyin symbol U+3127.
    phones = phones.replace('\u4e00', '\u3127')
    # (character, phoneme symbols, tone mark)
    leftovers = (
        ('勳', 'ㄒㄩㄣ', '-'),
        ('艷', '\u3127ㄢ', 'ˋ'),
        ('曬', 'ㄕㄞ', 'ˋ'),
    )
    for char, symbols, tone in leftovers:
        fixed = sep.join(symbols)
        if use_tone:
            fixed += tone
        phones = phones.replace(char, fixed)
    return phones

def word2phones(word, use_tone, sep=' '):
    ''' Convert a chinese word to zhuyin phonemes.

    Args:
        word: Chinese text to convert.
        use_tone: when true, each character's phonemes end with its tone mark.
        sep: separator between the phonemes of one character. Characters are
            always separated from each other by a single space.

    Returns:
        One space-separated group of phonemes per character.
    '''
    if word == '曬':  # special case: answered from the fix table, honouring sep
        return fix_phones(word, use_tone, sep)
    zhuyins = trans_sentense(word).split()
    phones = ' '.join(zhuyin2phones(z, use_tone, sep) for z in zhuyins)
    return fix_phones(phones, use_tone, sep)

def sent2phones(cut_sent, use_tone):
    ''' Convert a segmented sentence into one phoneme string per word.

    The whole sentence is converted in one call, then the resulting tokens
    are handed out to the words by character count, i.e. one token is
    consumed for every character of a word.
    '''
    syllables = word2phones(''.join(cut_sent), use_tone, sep='').split()
    grouped = []
    start = 0
    for token in cut_sent:
        stop = start + len(token)
        grouped.append(' '.join(syllables[start:stop]))
        start = stop
    return grouped

In [3]:
# Create directory structures
# makedirs(exist_ok=True) builds missing parents and accepts directories that
# already exist, so a partially created tree is completed on re-run, while
# genuine failures (e.g. permission errors) are still raised.
for out_dir in ('../data/train', '../data/test', '../data/local/dict'):
    os.makedirs(out_dir, exist_ok=True)

In [4]:
# Configuration
# use_tone: passed to the phoneme helpers; when True they append tone marks.
use_tone = False
# Point jieba at the dictionary file 'dict.txt.big' (a relative path, so it is
# looked up from the working directory), then load it right away.
jieba.set_dictionary('dict.txt.big')
jieba.initialize()

Building prefix dict from /home/jiazhi/Workshop/common-voice-zh-tw/scripts/dict.txt.big ...
Loading model from cache /tmp/jieba.ufe63d437a4894b8c7d3a0a0158031718.cache
Loading model cost 0.843 seconds.
Prefix dict has been built succesfully.


In [5]:
# Load the tab-separated metadata of both splits
train_tsv, test_tsv = (join(DATA_DIR, name) for name in ('train.tsv', 'test.tsv'))
train_df, test_df = (pd.read_csv(tsv, sep='\t') for tsv in (train_tsv, test_tsv))

# Keep only the rows whose sentence passes contains_no_eng
train_keep = train_df['sentence'].apply(contains_no_eng)
test_keep = test_df['sentence'].apply(contains_no_eng)
train_df = train_df[train_keep]
test_df = test_df[test_keep]

# Stack the splits, train rows first and test rows after them
full_df = pd.concat([train_df, test_df])

In [6]:
# Report the shape of both splits after filtering, then preview the first training rows
print(f'train_df.shape: {train_df.shape}')
print(f'test_df.shape:  {test_df.shape}')
train_df.head()

train_df.shape: (1697, 8)
test_df.shape:  (1535, 8)


Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377831.mp3,我們特別回鄉下,2,0,thirties,male,
1,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377841.mp3,是歷史上的第二次,2,0,thirties,male,
2,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377844.mp3,簡單來說,2,1,thirties,male,
3,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377846.mp3,在田裡也需幫忙,2,0,thirties,male,
4,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377848.mp3,婚後一年生了個女嬰,2,1,thirties,male,


# Acoustic Model

In [7]:
# Prepare spk_id, gender and utt_id for all audios
# Remember the sizes first: full_df is rebuilt below and the train/test split
# is recovered from row positions afterwards.
n_test = len(test_df)
n_total = len(full_df)

# One zero-padded speaker id per distinct client_id, in order of first appearance
client_spk = full_df[['client_id']].drop_duplicates()
client_spk['spk_id'] = [str(i).zfill(3) for i in range(1, 1 + len(client_spk))]

# A left join keeps full_df's row order (train rows first, then test rows),
# which the positional split relies on; validate rejects duplicated speaker keys.
full_df = full_df.merge(client_spk, on='client_id', how='left', validate='many_to_one')
assert len(full_df) == n_total, 'speaker merge changed the number of utterances'

full_df['gender'] = full_df['gender'].apply(kaldi_gender)
full_df['utt_id'] = full_df.apply(path2utt, axis=1)

# Fix unusual character and do text segmentation
full_df['sentence'] = full_df['sentence'].apply(fix_char).apply(jieba_cut)

# Drop useless columns
drop_columns = ['client_id', 'up_votes', 'down_votes', 'age', 'accent']
full_df = full_df.drop(columns=drop_columns)

# Split processed dataset to train/test set.
# An explicit boundary is used because slicing with -len(test_df) breaks when
# the test split is empty (-0 selects nothing for train and everything for test).
split_at = len(full_df) - n_test
train_df = full_df.iloc[:split_at]
test_df = full_df.iloc[split_at:]

# Sort train/test set by utt_id for kaldi-mfcc
train_df = train_df.sort_values(by='utt_id')
test_df = test_df.sort_values(by='utt_id')

In [8]:
# Report the shape of both processed splits, then preview the first training rows
print(f'train_df.shape: {train_df.shape}')
print(f'test_df.shape:  {test_df.shape}')
train_df.head()

train_df.shape: (1697, 5)
test_df.shape:  (1535, 5)


Unnamed: 0,path,sentence,gender,spk_id,utt_id
0,common_voice_zh-TW_17377831.mp3,"[我們, 特別, 回鄉, 下]",m,1,001_17377831
1,common_voice_zh-TW_17377841.mp3,"[是, 歷史, 上, 的, 第二次]",m,1,001_17377841
2,common_voice_zh-TW_17377844.mp3,"[簡單, 來說]",m,1,001_17377844
3,common_voice_zh-TW_17377846.mp3,"[在, 田裡, 也, 需, 幫忙]",m,1,001_17377846
4,common_voice_zh-TW_17377848.mp3,"[婚後, 一年生, 了, 個, 女嬰]",m,1,001_17377848


### Write files for kaldi usage

In [9]:
# Write acoustic data of train/test set
for split, df in (('train', train_df), ('test', test_df)):
    out_dir = join('../data', split)

    # spk2gender: "<spk_id> <gender>", one line per distinct pair, ordered by spk_id
    spk2gender = df[['spk_id', 'gender']].drop_duplicates().sort_values(by='spk_id')
    with open(join(out_dir, 'spk2gender'), 'w', encoding='UTF-8') as f:
        f.writelines(
            f'{spk_id} {gender}\n'
            for spk_id, gender in zip(spk2gender['spk_id'], spk2gender['gender'])
        )

    # wav.scp: "<utt_id> <sox command ending in a pipe>" that emits 16 kHz wav on stdout
    with open(join(out_dir, 'wav.scp'), 'w', encoding='UTF-8') as f:
        for utt_id, clip in zip(df['utt_id'], df['path']):
            mp3_path = join(DATA_DIR, 'clips', clip)
            f.write(f'{utt_id} sox {mp3_path} -t wav -r 16000 - |\n')

    # text: "<utt_id> <space-separated words>"
    with open(join(out_dir, 'text'), 'w', encoding='UTF-8') as f:
        for utt_id, words in zip(df['utt_id'], df['sentence']):
            text = ' '.join(words)
            f.write(f'{utt_id} {text}\n')

    # utt2spk: "<utt_id> <spk_id>"
    with open(join(out_dir, 'utt2spk'), 'w', encoding='UTF-8') as f:
        f.writelines(
            f'{utt_id} {spk_id}\n'
            for utt_id, spk_id in zip(df['utt_id'], df['spk_id'])
        )

# Language Model

In [10]:
# Corpus for the language model: each segmented sentence joined by spaces
corpus = [' '.join(words) for words in full_df.sentence]

# Lexicon: every distinct (word, phonemes) pair seen in any sentence
sents = list(full_df.sentence)
sents_phones = [sent2phones(sent, use_tone) for sent in sents]
lexicon = {
    entry
    for sent, sent_phones in zip(sents, sents_phones)
    for entry in zip(sent, sent_phones)
}

# Phone set: the individual symbols of every syllable in the lexicon.
# With tones, the last two characters of a syllable are kept together as one phone.
phone_set = set()
for _, zhuyins in lexicon:
    for zhuyin in zhuyins.split():
        if use_tone:
            phone_set.update(zhuyin[:-2])
            phone_set.add(zhuyin[-2:])
        else:
            phone_set.update(zhuyin)

In [11]:
# Display the space-joined sentences collected for the language model
corpus

['我們 特別 回鄉 下',
 '是 歷史 上 的 第二次',
 '簡單 來說',
 '在 田裡 也 需 幫忙',
 '婚後 一年生 了 個 女嬰',
 '途中 雪景 又 是 一絕',
 '管制 都市 汙染 源',
 '經過 自動 電 扶梯',
 '迷惘 的 眼神',
 '增加 僱用 弱勢 的 動機',
 '一陣子 沒 看到 妳',
 '旁邊 就 有 哈密瓜',
 '弄錯 蠻多 東西 的',
 '又 是 一個 付費 的 全民 公測',
 '不 知道 為何 陷入 孤獨',
 '他 是 我 父親',
 '報 中央 目的 事業 主管機關 備查',
 '下車 後 我 看 了 看錶',
 '讓 世界 多 了 幾許 笑容',
 '一定 可以 擺脫困境',
 '把 可能 產生 的 損害 降到 最低',
 '想要 去 金石 堂',
 '即使 免費 服務',
 '聽說 要架 腳架 才行',
 '很快 就 沒事 了',
 '她 怕 大家 也 變得 不 方便',
 '補好 衣服 的 破洞',
 '永遠 不會 疲倦 的 旅程',
 '真的 是 有 八七 分像',
 '她 不假思索 地 答應 了',
 '大 姑姑 說服 兒子',
 '就 好比 一間 披薩 店',
 '借住 在 別人 屋簷 下',
 '半個 月 就 不 去 了',
 '在 準備 妥適 後',
 '更是 不能 怠慢',
 '我 感到 驚訝 萬分',
 '應該 要 拿 更 大 的 皮箱 出門',
 '學 了 又 用 不到',
 '一定 要 想個辦法',
 '我 背後 的 這幅 畫',
 '她 來不及 反應',
 '這也 是 我們 臺灣人 的 真情',
 '聽到 了 有點 罪惡感',
 '隔天 早起 一邊 讀書',
 '妻子 受不了 而 選擇 離婚',
 '簡易 行政 工作',
 '反正 孫女 也 出生 了',
 '它 基本上 就是 一個 氣球',
 '妥善處理 垃圾 及 資源 回收',
 '他 知道 她 動怒 了',
 '還是 只能 搭 終電',
 '一直 覺得 文筆 不錯',
 '直奔 去 吃螃蟹 吃 到 飽',
 '巴士 來 了 就 趕快 上車',
 '希望 能 啟發 團隊',
 '臺北 市長 柯文 哲',
 '你 有沒有 帶 貼紙',
 '更 應該 加強 平日 的 危機意識',
 '且 未來 若 進行 工程 需先 討論',


In [12]:
# Display the set of (word, phonemes) pairs
lexicon

{('請加', 'ㄑㄧㄥ ㄐㄧㄚ'),
 ('體系', 'ㄊㄧ ㄒㄧ'),
 ('討好', 'ㄊㄠ ㄏㄠ'),
 ('今晚', 'ㄐㄧㄣ ㄨㄢ'),
 ('具', 'ㄐㄩ'),
 ('這種', 'ㄓㄟ ㄓㄨㄥ'),
 ('名稱', 'ㄇㄧㄥ ㄔㄥ'),
 ('擺脫困境', 'ㄅㄞ ㄊㄨㄛ ㄎㄨㄣ ㄐㄧㄥ'),
 ('十二三萬', 'ㄕ ㄦ ㄙㄢ ㄨㄢ'),
 ('人命', 'ㄖㄣ ㄇㄧㄥ'),
 ('欠佳', 'ㄑㄧㄢ ㄐㄧㄚ'),
 ('一邊', 'ㄧ ㄅㄧㄢ'),
 ('主辦', 'ㄓㄨ ㄅㄢ'),
 ('福利', 'ㄈㄨ ㄌㄧ'),
 ('快熱死', 'ㄎㄨㄞ ㄖㄜ ㄙ'),
 ('看不到', 'ㄎㄢ ㄅㄨ ㄉㄠ'),
 ('完後就', 'ㄨㄢ ㄏㄡ ㄐㄧㄡ'),
 ('除夕', 'ㄔㄨ ㄒㄧ'),
 ('生態', 'ㄕㄥ ㄊㄞ'),
 ('津貼', 'ㄐㄧㄣ ㄊㄧㄝ'),
 ('高峰', 'ㄍㄠ ㄈㄥ'),
 ('把', 'ㄅㄚ'),
 ('合計', 'ㄏㄜ ㄐㄧ'),
 ('店員', 'ㄉㄧㄢ ㄩㄢ'),
 ('發想', 'ㄈㄚ ㄒㄧㄤ'),
 ('先幫', 'ㄒㄧㄢ ㄅㄤ'),
 ('全幅', 'ㄑㄩㄢ ㄅㄧ'),
 ('來到', 'ㄌㄞ ㄉㄠ'),
 ('重心', 'ㄓㄨㄥ ㄒㄧㄣ'),
 ('國有', 'ㄍㄨㄛ ㄧㄡ'),
 ('依本', 'ㄧ ㄅㄣ'),
 ('期滿', 'ㄑㄧ ㄇㄢ'),
 ('奶香', 'ㄋㄞ ㄒㄧㄤ'),
 ('廣告', 'ㄍㄨㄤ ㄍㄠ'),
 ('最終', 'ㄗㄨㄟ ㄓㄨㄥ'),
 ('運用', 'ㄩㄣ ㄩㄥ'),
 ('輾轉', 'ㄓㄢ ㄓㄨㄢ'),
 ('武田', 'ㄨ ㄊㄧㄢ'),
 ('可', 'ㄎㄜ'),
 ('小朋友', 'ㄒㄧㄠ ㄆㄥ ㄧㄡ'),
 ('想必', 'ㄒㄧㄤ ㄅㄧ'),
 ('交接', 'ㄐㄧㄠ ㄐㄧㄝ'),
 ('不回', 'ㄈㄡ ㄏㄨㄟ'),
 ('民宿', 'ㄇㄧㄣ ㄒㄧㄡ'),
 ('將與', 'ㄐㄧㄤ ㄩ'),
 ('忘', 'ㄨㄤ'),
 ('痛', 'ㄊㄨㄥ'),
 ('掛念', 'ㄍㄨㄚ ㄋㄧㄢ'),
 ('省下', 'ㄕㄥ ㄒㄧㄚ'),
 ('家裡', 'ㄐㄧㄚ ㄌㄧ'),
 ('年輕', 'ㄋㄧㄢ ㄑㄧㄥ'),
 ('薪資', 'ㄒㄧㄣ ㄗ'),
 ('椅子', 'ㄧ ㄗ'),

In [13]:
# Display the set of phones derived from the lexicon
phone_set

{'ˋ',
 'ㄅ',
 'ㄆ',
 'ㄇ',
 'ㄈ',
 'ㄉ',
 'ㄊ',
 'ㄋ',
 'ㄌ',
 'ㄍ',
 'ㄎ',
 'ㄏ',
 'ㄐ',
 'ㄑ',
 'ㄒ',
 'ㄓ',
 'ㄔ',
 'ㄕ',
 'ㄖ',
 'ㄗ',
 'ㄘ',
 'ㄙ',
 'ㄚ',
 'ㄛ',
 'ㄜ',
 'ㄝ',
 'ㄞ',
 'ㄟ',
 'ㄠ',
 'ㄡ',
 'ㄢ',
 'ㄣ',
 'ㄤ',
 'ㄥ',
 'ㄦ',
 'ㄧ',
 'ㄨ',
 'ㄩ'}

### Write files for Kaldi usage

In [14]:
# corpus.txt: one space-segmented sentence per line
with open('../data/local/corpus.txt', 'w', encoding='UTF-8') as f:
    f.writelines(f'{sent}\n' for sent in corpus)

# lexicon.txt: "<word> <phone> <phone> ..."
# Each lexicon entry stores one token per syllable, whereas nonsilence_phones.txt
# lists single symbols (plus the tone-carrying final when use_tone is set).
# Every syllable is therefore expanded with the same rule used to build
# phone_set, so each phone written here also appears in the phone list.
# Entries are sorted to make the file reproducible across runs.
with open('../data/local/dict/lexicon.txt', 'w', encoding='UTF-8') as f:
    f.write('!SIL sil\n')
    f.write('<UNK> spn\n')
    for word, syllables in sorted(lexicon):
        word_phones = []
        for zhuyin in syllables.split():
            if use_tone:
                word_phones.extend(zhuyin[:-2])
                word_phones.append(zhuyin[-2:])
            else:
                word_phones.extend(zhuyin)
        f.write(f"{word} {' '.join(word_phones)}\n")

# nonsilence_phones.txt: one phone per line, sorted for reproducibility
with open('../data/local/dict/nonsilence_phones.txt', 'w', encoding='UTF-8') as f:
    for phone in sorted(phone_set):
        f.write(f'{phone}\n')

# silence_phones.txt (every line, including the last, ends with a newline)
with open('../data/local/dict/silence_phones.txt', 'w', encoding='UTF-8') as f:
    f.write('sil\nspn\n')

# optional_silence.txt
with open('../data/local/dict/optional_silence.txt', 'w', encoding='UTF-8') as f:
    f.write('sil\n')