# 日文预处理函数

## pykakasi自带分词和转换

In [34]:
from pykakasi import kakasi, wakati
kakasi_ins = kakasi()
kakasi_ins.setMode("H","a") # Hiragana to ascii, default: no conversion
kakasi_ins.setMode("K","a") # Katakana to ascii, default: no conversion
kakasi_ins.setMode("J","a") # Japanese to ascii, default: no conversion
kakasi_ins.setMode("r","Hepburn") # default: use Hepburn Roman table
kakasi_ins.setMode("s", True) # add space, default: no separator
# kakasi_ins.setMode("C", True) # capitalize, default: no capitalize
conv_pykakasi = kakasi_ins.getConverter()

wakati_ins = wakati()
sep_pykakasi = wakati_ins.getConverter()

def pykakasi_sep(text):
    text = text.replace("「","")
    text = text.replace("」","")
    text = sep_pykakasi.do(text)
    return text

def pykakasi_conv(text):
    text = text.replace("「","")
    text = text.replace("」","")
    text = conv_pykakasi.do(text)
    return text

## mecab分词+pykakasi转换

In [35]:
import romkan
import MeCab
from unidecode import unidecode
wakati_mecab = MeCab.Tagger("-Owakati")
# yomi = MeCab.Tagger("-Oyomi")
# chasen = MeCab.Tagger("-Ochasen")

kakasi_ins2 = kakasi()
kakasi_ins2.setMode("H","a") # Hiragana to ascii, default: no conversion
kakasi_ins2.setMode("K","a") # Katakana to ascii, default: no conversion
kakasi_ins2.setMode("J","a") # Japanese to ascii, default: no conversion
kakasi_ins2.setMode("r","Hepburn") # default: use Hepburn Roman table
# kakasi_ins2.setMode("C", True) # capitalize, default: no capitalize
conv_pykakasi_nosep = kakasi_ins2.getConverter()

def mecab_sep(text):
    text = text.replace("「","")
    text = text.replace("」","")
    text = wakati_mecab.parse(text)
    return text

def mecab_conv(text):
    text = text.replace("「","")
    text = text.replace("」","")
    text = conv_pykakasi_nosep.do(wakati_mecab.parse(text))
    return text

In [36]:
text = "津田は自分の都合を善く考えてから日取をきめる事にして室外に出た。章、終り。"
target = "tsuda wa jibun no tsugo- wo yoku kangae te kara hi to wo kimeru koto ni shi te shitsugai ni de ta 。 akira 、 owari 。"
text = "「腸まで続いているとすると、癒りっこないんですか」「そんな事はありません」"
target = "cho- made tsuzui te iru to suru to 、 nao ri k ko nai n desu ka son'na koto wa ari mase n"
print(pykakasi_sep(text))
print(pykakasi_conv(text))
print(mecab_sep(text))
print(mecab_conv(text))

腸 まで 続い ているとすると、 癒 りっこないんですかそんな 事 はありません
chou made tsuzui teirutosuruto、 yu rikkonaindesukasonna koto haarimasen
腸 まで 続い て いる と する と 、 癒り っこ ない ん です か そんな 事 は あり ませ ん 

chou made tsuzui te iru to suru to 、 yuri kko nai n desu ka sonna koto ha ari mase n 



# 中文预处理函数

In [1]:
# from pypinyin import lazy_pinyin, Style, slug
import re, jieba, pinyin
import pandas as pd
import glob,os
import sys
sys.path.append('../')
from utils.text import to_pinyin, _phonemize, _phonemize_zh,remove_punctuation

def to_pinyin_pre(text):
    return ' '.join([pinyin.get(x) for x in jieba.cut(text.rstrip())])



In [38]:
s="上期主要讲了人类的直立行走，一个因自然环境变化。"
print(to_pinyin_pre(s))
print(to_pinyin(s,style="Style.TONE3"))
print(_phonemize_zh(s))

shàngqī zhǔyào jiǎng le rénlèi de zhílì xíngzǒu ， yīgè yīn zìránhuánjìng biànhuà 。
shang4qi1 zhu3yao4 jiang3 le ren2lei4 de zhi2li4 xing2zou3， yi2ge4 yin1 zi4ran2huan2jing4 bian4hua4。
s.ɑŋ4 tɕhi1 ts.u3 jɑu4 tɕiɑŋ3 lə ʐon2 lei4 tə ts.i.2 li4 ɕiŋ2 tsou3 ， ji2 ko4 jin1 tsi̪4 ʐɑn2 xwɑn2 tɕiŋ4 piɑn4 xwɑ4 。


# 数据库读取数据集

In [39]:
import pandas as pd 
import os
from sqlalchemy import create_engine
from tqdm.notebook import tqdm
import sys
from multiprocessing import Pool
engine = create_engine("mysql+mysqlconnector://root:123456@172.21.0.182:3300/Voice_data")

aishell数据集质量不好，aidatatang质量参差不齐，LibriTTS说话人太多

In [7]:
columns = "a.relative_path,b.voice_id, b.speaker, b.language, b.voice_path, b.voice_text"
chart ="dataset_registry a, dataset_voice_data b"

condition="""
a.dataset_id = b.dataset_id
AND b.dataset_id IN ('ShantouTV_rs22k','css10_zh','thchs30_rs22k','LJSpeech','kss_rs22k','css10_ja')
AND `voice_duration` BETWEEN '1' AND '13'
"""

df=pd.read_sql(f"select {columns} from {chart} where {condition}",con=engine)
df.shape

(49678, 6)

In [8]:
pd.set_option('display.max_rows', None)
print(df['language'].value_counts())
print('The number of speakers:',df['speaker'].nunique())

zh    16887
en    13100
ko    12852
ja     6839
Name: language, dtype: int64
The number of speakers: 65


In [9]:
def gen_metadata(dataframe, file_path):
        with open(file_path,'a+') as f:
            item = dict(dataframe[1])
            absolute_path = os.path.join('/data/jerik/VoiceProject/voice_database_v2/',item['relative_path'])
            audio_path = os.path.join(absolute_path,item['voice_path'])
            #音素化中文用_phonemize_zh，其他用_phonemize。具体查看text.py docstring
            try:
                if item['language']=='zh':
                    char_text = to_pinyin(item['voice_text'],style='Style.TONE3')
                    phonemize_text = _phonemize_zh(item['voice_text'])
                elif item['language']=='en':
                    char_text = item['voice_text']
                    phonemize_text = _phonemize(item['voice_text'],'en-us')
                elif item['language']=='ko':
                    char_text = item['voice_text']
                    phonemize_text = _phonemize(item['voice_text'],'ko')
                elif item['language']=='de':
                    char_text = item['voice_text']
                    phonemize_text = _phonemize(item['voice_text'],'de')
                elif item['language']=='ja':
                    char_text = item['voice_text']
                    phonemize_text = _phonemize(item['voice_text'],'ja')

                #判断音频路径是否正确
                if os.path.exists(audio_path):
                    print(f"{item['voice_id']}|{item['speaker']}|{item['language']}|{audio_path}|||{char_text}|{phonemize_text}",file=f)
                else:
                    print("Audio file do not exist!!!")
            except IndexError:
                #一般由于中文句子包含了英文单词会抛出IndexError
                print(f"While handling {item['voice_id']}, occur an unexpected error:{sys.exc_info()[1]}")

def update(*dataframe):
    pbar.update(1)

if __name__ == '__main__':

    output_dir ="train_ShantouTV_4lang"
    file_name = 'data_0421.txt'
    file_path = os.path.join(output_dir,file_name) 

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    if os.path.exists(file_path):
        os.remove(file_path)
        
    #视数据量决定，30w数据128进程约1小时
    pool =Pool(60)
    pbar = tqdm(total=len(df))   
    
    try:
        for i in df.iterrows():
            pool.apply_async(gen_metadata, args=(i, file_path), callback=update)
    except:
        pool.terminate()
        pool.close()
        pool.join()   
    finally:
        pool.close()
        pool.join()  

  0%|          | 0/49678 [00:00<?, ?it/s]
















While handling ShantouTV_rs22k-ShantouTV_xiaodi-ShantouTV_ST-06_113_xiaodi, occur an unexpected error:list index out of range
While handling ShantouTV_rs22k-ShantouTV_xiaodi-ShantouTV_ST-06_114_xiaodi, occur an unexpected error:list index out of range
While handling ShantouTV_rs22k-ShantouTV_xiaodi-ShantouTV_ST-06_162_xiaodi, occur an unexpected error:list index out of range
While handling ShantouTV_rs22k-ShantouTV_xiaodi-ShantouTV_ST-06_23_xiaodi, occur an unexpected error:list index out of range
While handling ShantouTV_rs22k-ShantouTV_xiaodi-ShantouTV_ST-06_156_xiaodi, occur an unexpected error:list index out of range
While handling ShantouTV_rs22k-ShantouTV_xiaodi-ShantouTV_ST-06_80_xiaodi, occur an unexpected error:list index out of range


# 运行prepare_css_spectrograms.py

In [10]:
!CUDA_VISIBLE_DEVICES=0 python3 prepare_css_spectrograms.py \
    --data_directory  train_ShantouTV_4lang \
    --data_txt data_0421.txt  \
    --sample_rate 22050

Please wait, this may take a very long time.
Creating spectrograms for: data_0421.txt
Done.


# 修改metafile内容生成train和val

In [None]:
file_path = os.path.join(output_dir,file_name) 
file_path

In [12]:
from sklearn.model_selection import train_test_split

with open(file_path, encoding='utf-8') as f:
    textlist = f.readlines()
train_list , val_list  = train_test_split(textlist,test_size=0.03, random_state=0)
len(textlist),len(train_list),len(val_list)

(49672, 48181, 1491)

In [13]:
train_file = os.path.join(output_dir,'train.txt')
val_file = os.path.join(output_dir,'val.txt')
with open(train_file, 'w', encoding='utf-8') as f:
    for text in train_list:
        f.write(text)
with open(val_file, 'w', encoding='utf-8') as f:
    for text in val_list:
        f.write(text)

### 175上已有数据文件需要更改文件路径

In [18]:
with open('./train_ko_0318/val.txt','r',encoding='utf-8') as f:
    with open('./train_ko_0318/val2.txt','w',encoding='utf-8') as f2:
        data=f.readlines()
        for lines in data:
                lines=lines.replace('raid','data')
                f2.write(lines)

# 生成json config文件

## 核对数据集IPA字符

In [5]:
dict_tmp={}
with open('./train_ShantouTV_4lang/data_0421.txt','r',encoding='utf-8') as f:
    data_tmp = f.readlines()
    
for i in data_tmp:
    ipa_text = remove_punctuation(i.split('|')[-1].strip())
    ipa_text = ipa_text.replace(u'\u3000','')
    for ipa in list(ipa_text):       
        dict_tmp[ipa]=1
        

In [15]:
''.join(list(dict_tmp.keys()))

'kɯᵝtɕimo̞däeɽʲjnsɴhwzpɡʑbʔːũɸŋçʒaəlvfʃɪĩɑx4̪3yuɛ21ʐr5ɐʌɾqɫʊæɹɚðθɜɔᵻ̩'

In [18]:
import json

json_file ='20210401_4lang_ShantouTV_22k.json'
json_dict={
    "balanced_sampling": True,
	"batch_size": 120,
	"case_sensitive": False,
	"checkpoint_each_epochs": 20,
    # Change the dataset    
	"dataset": "train_ShantouTV_4lang",
	"encoder_dimension": 256,
	"encoder_type": "generated",
	"epochs": 1000,
    # According to the author, generator_dim should be the number of lang, and generator_bottleneck_dim should be smaller than it.
	"generator_bottleneck_dim": 2, 
	"generator_dim": 4,	
	"languages": ["zh","en","ja","ko"],
	"language_embedding_dimension": 0,
	"learning_rate": 0.001,
	"learning_rate_decay_each": 10000,
	"learning_rate_decay_start": 10000,
    #如果核对数据集中存在额外字符需要加进来
   "phonemes":"ᵝ̞äʲũĩ12345iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧ ɚ˞ɫ",
	"use_phonemes":True,
    "multi_language": False,
	"multi_speaker": True,
	"perfect_sampling": True,
	"predict_linear": False,
   # If only one speaker, reversal_classifier shoule be False
	"reversal_classifier": True,
    "reversal_classifier_dim": 256,
    "reversal_classifier_w": 0.125,
    "reversal_gradient_clipping": 0.25,
    # speaker_embedding_dim should no smaller than the number of speaker in each language
	"speaker_embedding_dimension": 65,
	"version": json_file[:-5]
}


In [19]:
for i in dict_tmp.keys():
    if i not in json_dict['phonemes']:
        print(i)

̪
̩


In [20]:
with open(os.path.join('../params/',json_file),'w', encoding='utf-8') as f:
    json.dump(json_dict, f, ensure_ascii=False, indent=2)

**初次开始训练**

CUDA_VISIBLE_DEVICES=3,4 PYTHONIOENCODING=utf-8 nohup python3 -u train.py --max_gpus 2 --hyper_parameters 20210125_2lang_22k_char >./20210125_2lang_22k_char_nohup.txt 2>&1 &

**从checkpoint继续训练**

CUDA_VISIBLE_DEVICES=3,4 PYTHONIOENCODING=utf-8 nohup python3 -u train.py --max_gpus 2 --checkpoint 20210125_2lang_22k_char_loss-19-0.088 --hyper_parameters 20210125_2lang_22k_char >./20210125_2lang_22k_char_nohup.txt 2>&1 &

In [None]:
CUDA_VISIBLE_DEVICES=4,5 PYTHONIOENCODING=utf-8 nohup python3 -u train.py --max_gpus 2 --hyper_parameters 20210401_4lang_ShantouTV_22k >./20210401_4lang_ShantouTV_22k_nohup.txt 2>&1 &