<a href="https://colab.research.google.com/github/inokchoi/Speech/blob/main/tacotron_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

타코트론: 엔드 투 엔드

스펙트로그램 --> 타임 도메인 시그널을 얻는 방법: 그리핀-림(Griffin-Lim) 리콘스트럭션

타코트론2 --> 출력 스펙트럼: 뉴럴 보코더(웨이브넷)

In [None]:
!pip install librosa         # Process audio
!pip install matplotlib      # Plot spectrogram and attention
!pip install numpy           # Fundamental package for scientific computing
!pip install scipy           # Use signal.lfilter
!pip install yaml            # Config parser
!pip install soundfile       # Save audio
!pip install tensorboardX    # Tensorboard for pytorch
!pip install tqdm            # Verbosity
!pip install torch           # Deep learning platform
!pip install inflect
!pip install unidecode
!pip install inflect

[31mERROR: Could not find a version that satisfies the requirement yaml (from versions: none)[0m
[31mERROR: No matching distribution found for yaml[0m
Collecting soundfile
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Installing collected packages: soundfile
Successfully installed soundfile-0.10.3.post1
Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/35/f1/5843425495765c8c2dd0784a851a93ef204d314fc87bcc2bbb9f662a3ad1/tensorboardX-2.0-py2.py3-none-any.whl (195kB)
[K     |████████████████████████████████| 204kB 2.8MB/s 
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.0
Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |███████████████████████████████

In [None]:
import yaml
import torch
import random
#import argparse
import numpy as np
from src.dataset import MyDataset, collate_fn
from torch.utils.data import DataLoader
from src.module import Tacotron
import pdb
import os
import easydict # argparse is not available in jupyternotebook
from tqdm import tqdm

In [None]:
# Perform training.
def training(model, optim, dataloader, criterion, config, device, epoch):
    """Train ``model`` for one epoch and return the average losses.

    Args:
        model: Network called as ``model(txt, mel, text_lengths=...)`` that
            returns ``(mel_outputs, linear_outputs, attn)``; it must expose a
            ``linear_size`` attribute.
        optim: Optimizer that updates the model parameters.
        dataloader: Sized iterable yielding ``(txt, text_lengths, mel, spec)``
            batches.
        criterion: Loss callable used as ``criterion(prediction, target)``.
        config: Mapping providing ``config['audio']['sample_rate']`` and
            ``config['solver']['grad_clip']``.
        device: Device the batch tensors are moved to.
        epoch: Epoch index; used only in the progress-bar text.

    Returns:
        Tuple ``(mel_loss, linear_loss, total_loss)``, each being the sum of
        the per-batch loss divided by ``len(dataloader)``.
    """
    model.train()

    mel_loss_avg = 0
    linear_loss_avg = 0
    loss_avg = 0
    fs = config['audio']['sample_rate']
    linear_dim = model.linear_size
    # Number of leading linear-output bins that get extra weight in the loss:
    # the fraction 3000 / (fs / 2) of the linear output size.
    n_priority_freq = int(3000 / (fs * 0.5) * linear_dim)
    n_batches = len(dataloader)
    progress_bar = tqdm(enumerate(dataloader), total=n_batches)
    for b_idx, (txt, text_lengths, mel, spec) in progress_bar:
        # Sort the batch by text length, longest first, and reorder the
        # tensors to match. (Original author's note: seq2seq models train
        # poorly when input lengths differ a lot, so grouping inputs of
        # similar length is more effective.)
        sorted_lengths, indices = torch.sort(text_lengths.view(-1), dim=0,
                                             descending=True)
        sorted_lengths = sorted_lengths.long().numpy()
        txt, mel, spec = txt[indices], mel[indices], spec[indices]

        # Move data to relevant device.
        txt = txt.to(device=device)
        mel = mel.to(device=device)
        spec = spec.to(device=device)

        # Perform a forward pass.
        optim.zero_grad()
        mel_outputs, linear_outputs, attn = model(txt,
                                                  mel,
                                                  text_lengths=sorted_lengths)

        # Calculate total loss by adding mel and linear output losses.
        # The linear loss averages the full-band loss with the loss over the
        # first n_priority_freq bins only.
        mel_loss = criterion(mel_outputs, mel)
        linear_loss = (0.5 * criterion(linear_outputs, spec)
                       + 0.5 * criterion(linear_outputs[:, :, :n_priority_freq],
                                         spec[:, :, :n_priority_freq]))
        # Original author's note: Tacotron produces a mel output and then
        # converts it to a linear one, hence the two loss terms.
        loss = mel_loss + linear_loss

        # Perform backpropagation.
        loss.backward()

        mel_loss_avg += mel_loss.item()
        linear_loss_avg += linear_loss.item()
        loss_avg += loss.item()

        # Clip the gradient norm before the update (original author's note:
        # gradients of recurrent layers can grow large).
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       config['solver']['grad_clip'])

        # Apply the parameter update; without this call the gradients are
        # computed but the weights never change.
        optim.step()

        progress_bar.set_description(
            'Train Epoch:{}[{}/{}] Mel_loss:{:.3f} Linear_loss:{:.3f} Total_loss:{:.3f}'
            .format(epoch, b_idx, n_batches, mel_loss_avg/(b_idx+1),
                    linear_loss_avg/(b_idx+1), loss_avg/(b_idx+1)))

    # Parenthesised so the multi-line tuple is valid syntax.
    return (mel_loss_avg / n_batches,
            linear_loss_avg / n_batches,
            loss_avg / n_batches)

# Perform validation.
def validate(model, optim, dataloader, criterion, config, device, epoch):
    """Evaluate ``model`` on ``dataloader`` and return the average losses.

    Args:
        model: Network called as ``model(txt, mel, text_lengths=...)`` that
            returns ``(mel_outputs, linear_outputs, attn)``; it must expose
            ``encoder``, ``postnet`` and ``linear_size`` attributes.
        optim: Unused; kept so the signature matches existing callers.
        dataloader: Sized iterable yielding ``(txt, text_lengths, mel, spec)``
            batches.
        criterion: Loss callable used as ``criterion(prediction, target)``.
        config: Mapping providing ``config['audio']['sample_rate']``.
        device: Device the batch tensors are moved to.
        epoch: Epoch index; used only in the progress-bar text.

    Returns:
        Tuple ``(mel_loss, linear_loss, total_loss)``, each being the sum of
        the per-batch loss divided by ``len(dataloader)``.
    """
    # Only the encoder and postnet are switched to eval mode; every other
    # submodule keeps whatever mode it was in.
    # NOTE(review): presumably deliberate (e.g. to keep decoder dropout
    # active) -- confirm against src.module.Tacotron.
    model.encoder.eval()
    model.postnet.eval()

    mel_loss_avg = 0
    linear_loss_avg = 0
    loss_avg = 0
    fs = config['audio']['sample_rate']
    linear_dim = model.linear_size
    # Number of leading linear-output bins that get extra weight in the loss:
    # the fraction 3000 / (fs / 2) of the linear output size.
    n_priority_freq = int(3000 / (fs * 0.5) * linear_dim)
    n_batches = len(dataloader)
    progress_bar = tqdm(enumerate(dataloader), total=n_batches)
    # No gradients (and therefore no backpropagation) are needed here.
    with torch.no_grad():
        for b_idx, (txt, text_lengths, mel, spec) in progress_bar:
            # Sort data by length, longest first, and reorder the tensors.
            sorted_lengths, indices = torch.sort(text_lengths.view(-1), dim=0,
                                                 descending=True)
            sorted_lengths = sorted_lengths.long().numpy()
            txt, mel, spec = txt[indices], mel[indices], spec[indices]

            # Move data to relevant device.
            txt = txt.to(device=device)
            mel = mel.to(device=device)
            spec = spec.to(device=device)

            # Perform a forward pass. No optimizer call is made: nothing is
            # back-propagated inside torch.no_grad().
            mel_outputs, linear_outputs, attn = model(txt,
                                                      mel,
                                                      text_lengths=sorted_lengths)

            # Calculate total loss by adding mel and linear output losses.
            mel_loss = criterion(mel_outputs, mel)
            linear_loss = (0.5 * criterion(linear_outputs, spec)
                           + 0.5 * criterion(linear_outputs[:, :, :n_priority_freq],
                                             spec[:, :, :n_priority_freq]))
            loss = mel_loss + linear_loss

            mel_loss_avg += mel_loss.item()
            linear_loss_avg += linear_loss.item()
            loss_avg += loss.item()

            progress_bar.set_description(
                'Valid Epoch:{}[{}/{}] Mel_loss:{:.3f} Linear_loss:{:.3f} Total_loss:{:.3f}'
                .format(epoch, b_idx, n_batches, mel_loss_avg/(b_idx+1),
                        linear_loss_avg/(b_idx+1), loss_avg/(b_idx+1)))

    # Parenthesised so the multi-line tuple is valid syntax.
    return (mel_loss_avg / n_batches,
            linear_loss_avg / n_batches,
            loss_avg / n_batches)

In [None]:
if __name__ == '__main__':  # Entry point: this is what runs first.
    # Set environmental variables.
    # EasyDict stands in for argparse (unavailable in a notebook): a plain
    # dict whose keys can also be read as attributes.
    args = easydict.EasyDict({
      'config':'./config/config.yaml',
      'log-dir':'.log/',
      'checkpoint_dir':'./ckpt',
      'checkpoint_model':'',
      # Fixed seed so the random number generators produce the same
      # sequence on every run.
      'seed':0,
      'cpu':False,
    })
    # Use the GPU only when it was requested AND is actually present, so the
    # script still runs on a CPU-only machine.
    args.gpu = (not args.cpu) and torch.cuda.is_available()
    if not args.cpu and not args.gpu:
        print('CUDA is not available; falling back to CPU.')
    device = torch.device('cuda' if args.gpu else 'cpu')

    # Read the YAML configuration; the context manager closes the file.
    with open(args.config, 'r') as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    # Make a checkpoint directory.
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Set random seed.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    ########## Define dataset###########
    # Set training and validation datasets.
    train_data = MyDataset(config['solver']['meta_path']['train'],
                           config['solver']['data_dir'])
    train_loader = DataLoader(train_data,
                              batch_size=config['solver']['batch_size'], shuffle=True,
                              drop_last=False, num_workers=config['solver']['n_jobs'],
                              collate_fn=collate_fn, pin_memory=args.gpu)

    valid_data = MyDataset(config['solver']['meta_path']['test'],
                           config['solver']['data_dir'])
    valid_loader = DataLoader(valid_data, batch_size=config['solver']['batch_size'],
                              shuffle=False, drop_last=False,
                              num_workers=config['solver']['n_jobs'],
                              collate_fn=collate_fn, pin_memory=args.gpu)

    # Define a Tacotron model.
    model = Tacotron(**config['model']['tacotron']).to(device)

    # Set error criterion.
    criterion = torch.nn.L1Loss()

    # Set an optimizer.
    optim = torch.optim.Adam(model.parameters(),
                             lr=config['model']['optimizer']['lr'])
    # ExponentialLR multiplies the learning rate by gamma on each step().
    # (Original author's note: MultiplicativeLR with a constant 0.95 factor
    # was the alternative, but it needs PyTorch 1.5 and the installed
    # version is older.)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma = 0.95)

    # Load a pre-trained model or start training again.
    checkpoint_path = os.path.join(args.checkpoint_dir, args.checkpoint_model)
    if os.path.isfile(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['state_dict'])
        optim.load_state_dict(checkpoint['optimizer'])
        # Checkpoints below store the index of the epoch whose training has
        # already finished, so resume from the following epoch.
        start = checkpoint['epoch'] + 1
        print('----loadeing checkpoint----')
    else:
        print('can not find the resume model.')
        start=0

    prev_loss = 1e10  # Best validation loss seen so far.
    flag = 0          # Consecutive epochs without validation improvement.
    for i in range(start,config['solver']['epochs']):
        # Perform training and save the trained model.
        tr_mel_loss, tr_lin_loss, tr_loss = training(model, optim, train_loader,
                                                     criterion, config, device, i)
        torch.save({'epoch':i, 'state_dict':model.state_dict(),
                    'optimizer':optim.state_dict()},
                    '{}/tacotron_{}.pth'.format(args.checkpoint_dir, i))

        # Perform validatation, and save it to the optimal model if loss reduces.
        val_mel_loss, val_lin_loss, val_loss = validate(model, optim, valid_loader,
                                                        criterion, config, device, i)
        if val_loss < prev_loss:
            prev_loss = val_loss
            torch.save({'epoch':i, 'state_dict':model.state_dict(),
                        'optimizer':optim.state_dict()},
                        '{}/optimal_{}.pth'.format(args.checkpoint_dir, i))
            flag=0
        else:
            flag += 1

        # Adjust the learning rate if loss does not improve three consecutive epochs.
        # NOTE(review): flag is not reset after the step, so the rate decays
        # only once per stagnation streak (exactly when flag hits 3) --
        # confirm this is intended.
        if flag == 3:
            scheduler.step()
            lr = optim.param_groups[0]['lr']
            print(lr)