In [1]:
#下载
#https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -> train/LJSpeech-1.1

In [2]:
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.datasets import load_tts_samples
import os
import numpy as np


def init_config():
    dataset_config = BaseDatasetConfig(name='ljspeech',
                                       path='train/LJSpeech-1.1/',
                                       meta_file_train='metadata.csv')

    config = GlowTTSConfig(
        batch_size=32,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=100,
        text_cleaner='phoneme_cleaners',
        use_phonemes=True,
        phoneme_language='en-us',
        phoneme_cache_path='train/phoneme_cache',
        print_step=25,
        print_eval=False,
        mixed_precision=True,
        output_path='train',
        datasets=[dataset_config],
        save_step=1000,
        data_dep_init_steps=0,
    )

    processor = AudioProcessor.init_from_config(config)
    tokenizer, config = TTSTokenizer.init_from_config(config)

    datas, _ = load_tts_samples(
        dataset_config,
        eval_split=True,
        eval_split_size=0.001,
    )

    #排序
    lens = [os.path.getsize(i['audio_file']) for i in datas]
    ids = np.argsort(lens)
    datas = [datas[i] for i in ids]

    return config, processor, tokenizer, datas


config, processor, tokenizer, datas = init_config()

out = processor.load_wav('train/LJSpeech-1.1/wavs/LJ001-0108.wav')
print('processor.load_wav=', out, out.shape)

out = tokenizer.text_to_ids(
    'it is obvious that legibility is the first thing to be aimed at in the forms of the letters'
)
print('tokenizer.text_to_ids=', out, len(out))

out = processor.melspectrogram(
    processor.load_wav('train/LJSpeech-1.1/wavs/LJ001-0108.wav'))
print('processor.melspectrogram=', out.shape)

len(datas), datas[:2]

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 13100 files in /root/code/tts/train/LJSpeech-1.1
processor.load_wav= [-0.00167847 -0.00149536  0.00015259 ...  0.00091553  0.00183105
  0.0012207 ] (127205,)
tokenizer.text_to_ids= [64, 22, 130, 64, 28, 130, 110, 41, 112, 5, 24, 64, 49, 21, 130, 3

(13087,
 [{'text': 'Two young women,\n',
   'audio_file': 'train/LJSpeech-1.1/wavs/LJ037-0076.wav',
   'speaker_name': 'ljspeech',
   'root_path': 'train/LJSpeech-1.1/',
   'language': ''},
  {'text': 'On the contrary,\n',
   'audio_file': 'train/LJSpeech-1.1/wavs/LJ021-0165.wav',
   'speaker_name': 'ljspeech',
   'root_path': 'train/LJSpeech-1.1/',
   'language': ''}])

In [3]:
from TTS.tts.models.glow_tts import GlowTTS
from trainer import Trainer, TrainerArgs
from TTS.utils.radam import RAdam
from trainer.torch import NoamLR
from TTS.tts.layers.losses import GlowTTSLoss


def init_model(from_trainer):
    model = GlowTTS(config, processor, tokenizer, speaker_manager=None)
    model.run_data_dep_init = False

    if from_trainer:
        trainer = Trainer(args=TrainerArgs(),
                          config=config,
                          output_path='train',
                          model=model,
                          train_samples=datas,
                          eval_samples=None)
        optimizer = trainer.get_optimizer(model, config)
        scheduler = trainer.get_scheduler(model, config, optimizer)
        criterion = trainer.get_criterion(model)
        loader = trainer.get_train_dataloader({}, datas, verbose=True)
    else:
        optimizer = RAdam(model.parameters(),
                          lr=1e-3,
                          betas=[0.9, 0.998],
                          weight_decay=1e-6)
        scheduler = NoamLR(optimizer, warmup_steps=4000)
        criterion = GlowTTSLoss()
        loader = model.get_data_loader(config=config,
                                       assets={},
                                       is_eval=False,
                                       samples=datas,
                                       verbose=True,
                                       num_gpus=0)

    return model, optimizer, scheduler, criterion, loader


model, optimizer, scheduler, criterion, loader = init_model(from_trainer=False)

#统计参数量
print(sum(i.numel() for i in model.parameters()) / 10000)

optimizer, scheduler, criterion, loader



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
| > Number of instances : 13087
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.92282417666387
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 145010.87774126997
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
2861.0257


(RAdam (
 Parameter Group 0
     betas: [0.9, 0.998]
     buffer: [[None, None, None], [None, None, None], [None, None, None], [None, None, None], [None, None, None], [None, None, None], [None, None, None], [None, None, None], [None, None, None], [None, None, None]]
     eps: 1e-08
     initial_lr: 0.001
     lr: 2.5e-07
     weight_decay: 1e-06
 ),
 <trainer.torch.NoamLR at 0x7f39c4cf9490>,
 GlowTTSLoss(),
 <torch.utils.data.dataloader.DataLoader at 0x7f39c4c4b970>)

In [4]:
import torch


def train():
    global model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.train()
    model = model.to(device)

    for epoch in range(config.epochs):
        for i, data in enumerate(loader):
            data = model.format_batch(data)
            for k in data.keys():
                if isinstance(data[k], torch.Tensor):
                    data[k] = data[k].to(device)

            _, loss_dict = model.train_step(data, criterion)

            model.zero_grad(set_to_none=True)
            loss_dict['loss'].backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            if i % 50 == 0:
                lr = optimizer.state_dict()['param_groups'][0]['lr']
                print(epoch, i, loss_dict['loss'].item(), lr)

        scheduler.step()

    config.save_json('train/config.json')
    model = model.cpu()
    torch.save({
        'config': config.to_dict(),
        'model': model.state_dict()
    }, 'train/model.pth')


train()

0 0 4.965285301208496 2.5e-07
0 50 4.48514461517334 2.5e-07
0 100 4.408485412597656 2.5e-07
0 150 4.390215873718262 2.5e-07
0 200 4.34619140625 2.5e-07
0 250 4.2658915519714355 2.5e-07
0 300 4.232598304748535 2.5e-07
0 350 4.24269962310791 2.5e-07
0 400 4.206293106079102 2.5e-07
1 0 4.7883806228637695 2.5e-07
1 50 4.3644185066223145 2.5e-07
1 100 4.231471538543701 2.5e-07
1 150 4.140634536743164 2.5e-07
1 200 4.162438869476318 2.5e-07
1 250 3.991492986679077 2.5e-07
1 300 3.985748767852783 2.5e-07
1 350 3.925426721572876 2.5e-07
1 400 3.9146857261657715 2.5e-07
2 0 4.61428689956665 5e-07
2 50 4.081521511077881 5e-07
2 100 3.92360258102417 5e-07
2 150 3.776669502258301 5e-07
2 200 3.668335437774658 5e-07
2 250 3.533665180206299 5e-07
2 300 3.2952823638916016 5e-07
2 350 3.274540901184082 5e-07
2 400 3.272976875305176 5e-07
3 0 3.885641098022461 7.5e-07
3 50 3.3418655395507812 7.5e-07
3 100 3.1183154582977295 7.5e-07
3 150 3.115149974822998 7.5e-07
3 200 2.97013783454895 7.5e-07
3 250 2.

25 200 0.4414508640766144 6.2499999999999995e-06
25 250 0.45825737714767456 6.2499999999999995e-06
25 300 0.42916059494018555 6.2499999999999995e-06
25 350 0.41437339782714844 6.2499999999999995e-06
25 400 0.4265383780002594 6.2499999999999995e-06
26 0 0.4382443428039551 6.5e-06
26 50 0.4102570414543152 6.5e-06
26 100 0.43702322244644165 6.5e-06
26 150 0.4107848107814789 6.5e-06
26 200 0.4235208034515381 6.5e-06
26 250 0.41843685507774353 6.5e-06
26 300 0.3986942172050476 6.5e-06
26 350 0.3911230266094208 6.5e-06
26 400 0.38910168409347534 6.5e-06
27 0 0.40227577090263367 6.75e-06
27 50 0.3775468170642853 6.75e-06
27 100 0.39839881658554077 6.75e-06
27 150 0.38867729902267456 6.75e-06
27 200 0.3871133625507355 6.75e-06
27 250 0.38443538546562195 6.75e-06
27 300 0.37685149908065796 6.75e-06
27 350 0.3543701171875 6.75e-06
27 400 0.36628299951553345 6.75e-06
28 0 0.4038543701171875 7e-06
28 50 0.35466471314430237 7e-06
28 100 0.387215793132782 7e-06
28 150 0.37291383743286133 7e-06
28 20

49 50 0.10192348062992096 1.225e-05
49 100 0.10113847255706787 1.225e-05
49 150 0.08244265615940094 1.225e-05
49 200 0.10718122124671936 1.225e-05
49 250 0.07683245837688446 1.225e-05
49 300 0.0877557247877121 1.225e-05
49 350 0.07005485892295837 1.225e-05
49 400 0.06587940454483032 1.225e-05
50 0 0.0744716078042984 1.2499999999999999e-05
50 50 0.08619588613510132 1.2499999999999999e-05
50 100 0.08470742404460907 1.2499999999999999e-05
50 150 0.08289980888366699 1.2499999999999999e-05
50 200 0.09856045246124268 1.2499999999999999e-05
50 250 0.07797922194004059 1.2499999999999999e-05
50 300 0.08481322228908539 1.2499999999999999e-05
50 350 0.06416761875152588 1.2499999999999999e-05
50 400 0.062316253781318665 1.2499999999999999e-05
51 0 0.07378756999969482 1.275e-05
51 50 0.08146421611309052 1.275e-05
51 100 0.08855710923671722 1.275e-05
51 150 0.07912230491638184 1.275e-05
51 200 0.09281843900680542 1.275e-05
51 250 0.06541755795478821 1.275e-05
51 300 0.07916593551635742 1.275e-05
51 

72 400 -0.03126619756221771 1.8e-05
73 0 -0.020297512412071228 1.825e-05
73 50 -0.012664929032325745 1.825e-05
73 100 -0.007032647728919983 1.825e-05
73 150 -0.018247783184051514 1.825e-05
73 200 0.006371170282363892 1.825e-05
73 250 -0.022321000695228577 1.825e-05
73 300 -0.00747796893119812 1.825e-05
73 350 -0.0324552059173584 1.825e-05
73 400 -0.03358253836631775 1.825e-05
74 0 -0.029705941677093506 1.8500000000000002e-05
74 50 -0.012921571731567383 1.8500000000000002e-05
74 100 -0.009741067886352539 1.8500000000000002e-05
74 150 -0.01602776348590851 1.8500000000000002e-05
74 200 0.0010616183280944824 1.8500000000000002e-05
74 250 -0.0270373672246933 1.8500000000000002e-05
74 300 -0.017619669437408447 1.8500000000000002e-05
74 350 -0.03013424575328827 1.8500000000000002e-05
74 400 -0.03708323836326599 1.8500000000000002e-05
75 0 -0.03209860622882843 1.875e-05
75 50 -0.011638477444648743 1.875e-05
75 100 -0.004601866006851196 1.875e-05
75 150 -0.021368786692619324 1.875e-05
75 200 -0

95 100 -0.05777783691883087 2.375e-05
95 150 -0.06929260492324829 2.375e-05
95 200 -0.04631879925727844 2.375e-05
95 250 -0.07674089074134827 2.375e-05
95 300 -0.071486696600914 2.375e-05
95 350 -0.08336281776428223 2.375e-05
95 400 -0.0900769829750061 2.375e-05
96 0 -0.09226559102535248 2.4e-05
96 50 -0.06494851410388947 2.4e-05
96 100 -0.06169390678405762 2.4e-05
96 150 -0.07028934359550476 2.4e-05
96 200 -0.05467931926250458 2.4e-05
96 250 -0.08125145733356476 2.4e-05
96 300 -0.06442229449748993 2.4e-05
96 350 -0.08614224195480347 2.4e-05
96 400 -0.09134021401405334 2.4e-05
97 0 -0.09091292321681976 2.425e-05
97 50 -0.05828048288822174 2.425e-05
97 100 -0.0636373907327652 2.425e-05
97 150 -0.07175800204277039 2.425e-05
97 200 -0.06063464283943176 2.425e-05
97 250 -0.07955768704414368 2.425e-05
97 300 -0.07061606645584106 2.425e-05
97 350 -0.08217279613018036 2.425e-05
97 400 -0.09288357198238373 2.425e-05
98 0 -0.100023552775383 2.45e-05
98 50 -0.07319198548793793 2.45e-05
98 100 -0

In [5]:
from TTS.utils.synthesizer import Synthesizer
import IPython

synthesizer = Synthesizer(
    tts_checkpoint='train/model.pth',
    tts_config_path='train/config.json',
    use_cuda=False,
)

synthesizer.save_wav(synthesizer.tts('a quick brown fox jumps over the lazy dog'), 'train/out.wav')

IPython.display.Audio('train/out.wav')

 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Text splitted to sentences.
['a quick brown fox jumps over the lazy dog']
 > Processing time: 1.8999865055084229
 > Real-time factor: 0.5312272069190089
