In [1]:
import os
from os.path import join
import pandas as pd

import jieba
# Point jieba at the 'dict.txt.big' dictionary file (given as a relative
# path, so it must be reachable from the working directory) and build the
# prefix dictionary right away instead of on the first segmentation call.
jieba.set_dictionary('dict.txt.big')
jieba.initialize()

# Root directory of the extracted Common Voice zh-TW release (the code below
# reads train.tsv / test.tsv and the clips/ folder from it). Set the
# COMMON_VOICE_DIR environment variable to use a different location; the
# original hard-coded path remains the default.
DATA_DIR = os.environ.get(
    'COMMON_VOICE_DIR',
    '/home/jiazhi/Dataset/common-voice_zh-TW_43h_2019-06-12',
)

Building prefix dict from /home/jiazhi/Workshop/common-voice-zh-tw/dict.txt.big ...
Loading model from cache /tmp/jieba.u508758312a92fec15e548748b1e3d333.cache
Loading model cost 0.862 seconds.
Prefix dict has been built succesfully.


In [2]:
def kaldi_gender(gender):
    ''' Map a CommonVoice gender label onto a one-letter Kaldi gender code.

    Only the exact label 'female' yields 'f'. 'male' and every other value
    (unrecognised labels, missing values, ...) fall back to 'm'.
    '''
    return 'f' if gender == 'female' else 'm'
    
def path2utt(path):
    ''' Convert audio path into utterance id.

    The utterance id is whatever follows the last underscore once the file
    extension has been removed, e.g.
    'common_voice_zh-TW_17377831.mp3' -> '17377831'.

    The extension is stripped with os.path.splitext rather than a fixed
    four-character slice, so extensions of any length (or no extension at
    all) are handled.
    '''
    stem = os.path.splitext(path)[0]
    return stem.rsplit('_', 1)[-1]

def jieba_cut(sent):
    ''' Segment a sentence with jieba and keep only the Chinese tokens.

    Returns a list of tokens; tokens rejected by the range check below
    (e.g. punctuation) are dropped.
    '''
    def is_chinese(token):
        # The whole token is compared as a string against the bounds
        # U+4E00..U+9FFF, so for multi-character tokens the comparison is
        # lexicographic and is decided mostly by the first character.
        return u'\u4e00' <= token <= u'\u9fff'
    return list(filter(is_chinese, jieba.cut(sent)))

def contains_no_eng(text):
    ''' Return True when text holds no Latin letters, False otherwise.

    Both the ASCII letters and their full-width forms count as English.
    '''
    # Inclusive character ranges: a-z, A-Z, full-width A-Z, full-width a-z
    latin_ranges = (
        ('a', 'z'),
        ('A', 'Z'),
        (u'\uff21', u'\uff3a'),
        (u'\uff41', u'\uff5a'),
    )
    return not any(
        low <= char <= high
        for char in text
        for low, high in latin_ranges
    )

In [3]:
# Create directory structures
# os.makedirs(..., exist_ok=True) creates the missing parents ('data',
# 'data/local') and does not fail when a directory is left over from an
# earlier run, so this cell can be executed repeatedly.
for kaldi_dir in ('data/train', 'data/test', 'data/local/dict'):
    os.makedirs(kaldi_dir, exist_ok=True)

In [4]:
# Locate and load the train/test metadata tables (tab-separated)
train_tsv, test_tsv = (join(DATA_DIR, name) for name in ('train.tsv', 'test.tsv'))
train_df = pd.read_csv(train_tsv, sep='\t')
test_df = pd.read_csv(test_tsv, sep='\t')

# Keep only the rows whose sentence is free of English letters
train_df = train_df.loc[train_df['sentence'].apply(contains_no_eng)]
test_df = test_df.loc[test_df['sentence'].apply(contains_no_eng)]

# Stack both splits (train rows first) so they can be processed together
full_df = pd.concat([train_df, test_df])

In [5]:
# Report the size of each filtered split, then preview the training rows
print(
    f'train_df.shape: {train_df.shape}',
    f'test_df.shape:  {test_df.shape}',
    sep='\n',
)
train_df.head()

train_df.shape: (1697, 8)
test_df.shape:  (1535, 8)


Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377831.mp3,我們特別回鄉下,2,0,thirties,male,
1,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377841.mp3,是歷史上的第二次,2,0,thirties,male,
2,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377844.mp3,簡單來說,2,1,thirties,male,
3,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377846.mp3,在田裡也需幫忙,2,0,thirties,male,
4,729aa31a7cb19fbf96fb390f0e7e74dd408a2579ce811a...,common_voice_zh-TW_17377848.mp3,婚後一年生了個女嬰,2,1,thirties,male,


# Acoustic

In [6]:
# Remember the split sizes now: train_df / test_df are rebound at the end of
# this cell, and slicing with a negative length breaks when the test set is
# empty (full_df[:-0] is an empty frame).
n_train, n_test = len(train_df), len(test_df)

# Prepare gender and speaker id for all CommonVoice clients
full_df['gender'] = full_df['gender'].apply(kaldi_gender)
# One row per distinct client, numbered 1..N in order of first appearance
client_spk = full_df[['client_id']].drop_duplicates()
client_spk['spk_id'] = range(1, 1 + len(client_spk))
# Left join on the explicit key: keeps full_df's row order (train rows first),
# which the positional split at the end of this cell relies on.
full_df = full_df.merge(client_spk, on='client_id', how='left')
assert len(full_df) == n_train + n_test, 'merge changed the number of rows'

# Prepare utterance id and text segmentation for all audios
full_df['utt_id'] = full_df['path'].apply(path2utt)
full_df['sentence'] = full_df['sentence'].apply(jieba_cut)

# Drop useless columns
drop_columns = ['client_id', 'up_votes', 'down_votes', 'age', 'accent']
full_df = full_df.drop(columns=drop_columns)

# Split processed dataset to train/test set (by position: train rows first)
train_df = full_df.iloc[:n_train]
test_df = full_df.iloc[n_train:]

In [7]:
# Report the size of each processed split, then preview the training rows
print(
    f'train_df.shape: {train_df.shape}',
    f'test_df.shape:  {test_df.shape}',
    sep='\n',
)
train_df.head()

train_df.shape: (1697, 5)
test_df.shape:  (1535, 5)


Unnamed: 0,path,sentence,gender,spk_id,utt_id
0,common_voice_zh-TW_17377831.mp3,"[我們, 特別, 回鄉, 下]",m,1,17377831
1,common_voice_zh-TW_17377841.mp3,"[是, 歷史, 上, 的, 第二次]",m,1,17377841
2,common_voice_zh-TW_17377844.mp3,"[簡單, 來說]",m,1,17377844
3,common_voice_zh-TW_17377846.mp3,"[在, 田裡, 也, 需, 幫忙]",m,1,17377846
4,common_voice_zh-TW_17377848.mp3,"[婚後, 一年生, 了, 個, 女嬰]",m,1,17377848


### Write files for Kaldi usage

In [8]:
# Write acoustic data of train/test set
# Every file is opened with an explicit UTF-8 encoding so the Chinese
# transcripts are written the same way regardless of the system locale.
# NOTE(review): rows are written in DataFrame order; Kaldi tools usually
# expect these files sorted by key -- confirm whether a sorting/fix-up step
# runs afterwards.
for split, df in zip(['train', 'test'], [train_df, test_df]):
    
    # spk2gender: one "<spk_id> <gender>" line per distinct speaker
    spk2gender = df[['spk_id', 'gender']].drop_duplicates()
    with open(join('data', split, 'spk2gender'), 'w', encoding='utf-8') as f:
        for _, row in spk2gender.iterrows():
            f.write(f'{row.spk_id} {row.gender}\n')
            
    # wav.scp: "<utt_id> <command> |" -- a sox command that decodes the mp3
    # clip and writes wav data to stdout
    with open(join('data', split, 'wav.scp'), 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            mp3_path = join(DATA_DIR, 'clips', row.path)
            f.write(f'{row.utt_id} sox {mp3_path} -t wav - |\n')
    
    # text: "<utt_id> <space-separated segmented words>"
    with open(join('data', split, 'text'), 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            text = ' '.join(row.sentence)
            f.write(f'{row.utt_id} {text}\n')
            
    # utt2spk: "<utt_id> <spk_id>"
    with open(join('data', split, 'utt2spk'), 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            f.write(f'{row.utt_id} {row.spk_id}\n')

# Language