In [None]:
# Download the LJSpeech dataset and unpack it so that it ends up at the path after the arrow:
# https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -> train/LJSpeech-1.1

In [1]:
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer

# Dataset description: the LJSpeech corpus unpacked under train/LJSpeech-1.1/,
# with its transcripts listed in metadata.csv.
dataset_config = BaseDatasetConfig(name='ljspeech',
                                   path='train/LJSpeech-1.1/',
                                   meta_file_train='metadata.csv')

# Glow-TTS training configuration. Text is cleaned with `phoneme_cleaners` and
# converted to en-us phonemes; the phoneme cache and all run outputs live
# under train/.
# NOTE(review): the training log recorded further down reports "EPOCH: 0/2",
# which does not match epochs=100 here -- the saved output presumably comes
# from a run with a different epoch setting; confirm before relying on it.
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,  # presumably enables test-sentence synthesis from the first epoch -- TODO confirm
    epochs=100,
    text_cleaner='phoneme_cleaners',
    use_phonemes=True,
    phoneme_language='en-us',
    phoneme_cache_path='train/phoneme_cache',
    print_step=25,  # the recorded log prints training stats every 25 steps
    print_eval=False,
    mixed_precision=True,
    output_path='train',
    datasets=[dataset_config],
    save_step=1000,
)

# Build the audio processor and the tokenizer from the config. The tokenizer
# factory also returns a config object, which replaces `config` from here on.
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Sanity checks on one utterance. The waveform is loaded once and reused for
# both the raw-sample printout and the mel-spectrogram shape check, instead of
# reading the same file from disk twice.
sample_wav_path = 'train/LJSpeech-1.1/wavs/LJ001-0108.wav'
wav = ap.load_wav(sample_wav_path)
print(wav)

# Token ids produced by the tokenizer for a sample sentence.
print(
    tokenizer.text_to_ids(
        'it is obvious that legibility is the first thing to be aimed at in the forms of the letters'
    ))

# Shape of the mel spectrogram computed from the loaded waveform.
print(ap.melspectrogram(wav).shape)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
[-0.00167847 -0.00149536  0.00015259 ...  0.00091553  0.00183105
  0.0012207 ]
[64, 22, 130, 64, 28, 130, 110, 41, 112, 5, 24, 64, 49, 21, 130, 31, 29, 22, 130, 14, 111, 51, 7, 98, 49, 5, 110, 64, 14, 64, 82, 11, 130, 64, 28, 130, 31, 49, 130, 9, 110, 52, 11

In [2]:
from TTS.tts.datasets import load_tts_samples

# Load the dataset described by `dataset_config` and split it into training
# and evaluation samples, using the split sizes stored on `config`.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
    eval_split=True,
)

# Display the size of each split together with its first two entries.
(
    len(train_samples),
    train_samples[:2],
    len(eval_samples),
    eval_samples[:2],
)

 | > Found 13100 files in /root/code/train/LJSpeech-1.1


(12969,
 [{'text': 'I take this means of saying "thank you."\n',
   'audio_file': 'train/LJSpeech-1.1/wavs/LJ023-0005.wav',
   'speaker_name': 'ljspeech',
   'language': ''},
  {'text': 'Felons who could pay the price were permitted, irrespective of their character or offenses,\n',
   'audio_file': 'train/LJSpeech-1.1/wavs/LJ003-0141.wav',
   'speaker_name': 'ljspeech',
   'language': ''}],
 131,
 [{'text': 'A molecular change takes place in the nerve of the tentacle,\n',
   'audio_file': 'train/LJSpeech-1.1/wavs/LJ025-0134.wav',
   'speaker_name': 'ljspeech',
   'language': ''},
  {'text': 'through advances made to various builders, and that it could only maintain its credit by wholesale discounting.\n',
   'audio_file': 'train/LJSpeech-1.1/wavs/LJ010-0304.wav',
   'speaker_name': 'ljspeech',
   'language': ''}])

In [3]:
from TTS.tts.models.glow_tts import GlowTTS
from trainer import Trainer, TrainerArgs

# Instantiate Glow-TTS from the config, audio processor and tokenizer;
# no speaker manager is passed.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
# NOTE(review): presumably switches off Glow-TTS data-dependent
# initialisation -- confirm against the GlowTTS implementation.
model.run_data_dep_init = False

# Report the number of model parameters, in units of 10,000.
print(sum(weight.numel() for weight in model.parameters()) / 10000)

# Wire the model and both sample splits into a Trainer that writes to `train`.
trainer = Trainer(
    args=TrainerArgs(),
    config=config,
    output_path='train',
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Show the trainer object as the cell output.
trainer

 > Training Environment:
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 4
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False


2861.0257



 > Model has 28610257 parameters


<trainer.trainer.Trainer at 0x7efde37920b8>

In [4]:
# Run training. In the recorded run the trainer logs per-step losses, evaluates
# after each epoch, and saves best-model checkpoints in a run directory under train/.
trainer.fit()


[4m[1m > EPOCH: 0/2[0m
 --> train/run-November-04-2022_10+17AM-0000000

[1m > TRAINING (2022-11-04 10:17:43) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


  y_lengths = (y_lengths // self.num_squeeze) * self.num_squeeze

[1m   --> STEP: 0/406 -- GLOBAL_STEP: 0[0m
     | > current_lr: 0.00000 
     | > step_time: 0.82310  (0.82310)
     | > loader_time: 0.92440  (0.92438)


[1m   --> STEP: 25/406 -- GLOBAL_STEP: 25[0m
     | > loss: 3.55607  (3.44821)
     | > log_mle: 0.72510  (0.72490)
     | > loss_dur: 2.83097  (2.72331)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 10.40194  (9.54358)
     | > current_lr: 0.00000 
     | > step_time: 0.48830  (0.39498)
     | > loader_time: 0.00220  (0.00328)


[1m   --> STEP: 50/406 -- GLOBAL_STEP: 50[0m
     | > loss: 3.42936  (3.42850)
     | > log_mle: 0.72323  (0.72633)
     | > loss_dur: 2.70613  (2.70217)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 10.12216  (9.93688)
     | > current_lr: 0.00000 
     | > step_time: 0.26310  (0.35725)
     | > loader_time: 0.00300  (0.00727)


[1m   --> STEP: 75/406 -- GLOBAL_STEP: 75[0m
     | > loss: 3.4



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.65421 [0m(+0.00000)
     | > avg_loss: 3.40161 [0m(+0.00000)
     | > avg_log_mle: 0.72215 [0m(+0.00000)
     | > avg_loss_dur: 2.67946 [0m(+0.00000)

 > BEST MODEL : train/run-November-04-2022_10+17AM-0000000/best_model_406.pth

[4m[1m > EPOCH: 1/2[0m
 --> train/run-November-04-2022_10+17AM-0000000

[1m > TRAINING (2022-11-04 10:20:57) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


  y_lengths = (y_lengths // self.num_squeeze) * self.num_squeeze

[1m   --> STEP: 19/406 -- GLOBAL_STEP: 425[0m
     | > loss: 3.34082  (3.37424)
     | > log_mle: 0.72437  (0.72090)
     | > loss_dur: 2.61645  (2.65334)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 9.61567  (9.58040)
     | > current_lr: 0.00000 
     | > step_time: 0.23220  (0.24672)
     | > loader_time: 0.00250  (0.00625)


[1m   --> STEP: 44/406 -- GLOBAL_STEP: 450[0m
     | > loss: 3.26766  (3.33739)
     | > log_mle: 0.72682  (0.72295)
     | > loss_dur: 2.54083  (2.61444)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_norm: 9.43543  (9.54299)
     | > current_lr: 0.00000 
     | > step_time: 0.31680  (0.26047)
     | > loader_time: 0.00250  (0.01035)


[1m   --> STEP: 69/406 -- GLOBAL_STEP: 475[0m
     | > loss: 3.36283  (3.33539)
     | > log_mle: 0.73072  (0.72336)
     | > loss_dur: 2.63211  (2.61203)
     | > amp_scaler: 16384.00000  (16384.00000)
     | > grad_nor



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
| > Number of instances : 131
 | > Preprocessing samples
 | > Max text length: 174
 | > Min text length: 20
 | > Avg text length: 100.76335877862596
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 34739.0
 | > Avg audio length: 144033.41221374046
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.01945 [0m(-0.63476)
     | > avg_loss:[92m 3.21212 [0m(-0.18949)
     | > avg_log_mle:[92m 0.71648 [0m(-0.00567)
     | > avg_loss_dur:[92m 2.49564 [0m(-0.18382)

 > BEST MODEL : train/run-November-04-2022_10+17AM-0000000/best_model_812.pth


In [7]:
import glob, os

# Collect every checkpoint and JSON file stored in the run directories under train/.
ckpts = sorted(glob.glob('train/*/*.pth'))
configs = sorted(glob.glob('train/*/*.json'))

# Fail with a clear message instead of a bare IndexError when nothing was trained yet.
if not ckpts:
    raise FileNotFoundError("no checkpoint matches 'train/*/*.pth'; run training first")

# First checkpoint in sorted order (best_model.pth in the recorded run).
test_ckpt = ckpts[0]

# Take the config from the SAME run directory as the checkpoint. Picking
# configs[-1] from the global list would pair the checkpoint with another
# run's config as soon as more than one run directory exists under train/.
run_dir = os.path.dirname(test_ckpt)
run_configs = [path for path in configs if os.path.dirname(path) == run_dir]
if not run_configs:
    raise FileNotFoundError(f"no *.json config found next to {test_ckpt}")
test_config = run_configs[-1]

# Display what was found and what was selected.
ckpts, test_ckpt, test_config

(['train/run-November-04-2022_10+17AM-0000000/best_model.pth',
  'train/run-November-04-2022_10+17AM-0000000/best_model_812.pth'],
 'train/run-November-04-2022_10+17AM-0000000/best_model.pth',
 'train/run-November-04-2022_10+17AM-0000000/config.json')

In [9]:
!tts --text 'It must be remembered, however, that most modern printing is done by machinery on soft paper' \
    --model_path $test_ckpt --config_path $test_config --out_path out.wav

import IPython

IPython.display.Audio('out.wav')

 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Using Griffin-Lim as no vocoder model defined
 > Text: It must be remembered, however, that most modern printing is done by machinery on soft paper
 > Text splitted to sentences.
['It must be remembered, however, that most modern 