In [None]:
#connect google drive
# Change to /content first so the relative mount point 'drive' resolves to /content/drive,
# the prefix the later cells use for their Drive paths.
%cd /content/
from google.colab import drive
# force_remount=True is passed so re-running this cell remounts the drive.
drive.mount('drive', force_remount=True)


Ensure you are connected to a GPU runtime

In [None]:
#get tacotron2 from github
%cd /content
# Clone the fork into /content/tacotron2 and check out the josha00-patch-1 ref.
!git clone 'https://github.com/josha00/tacotron2.git' tacotron2
%cd /content/tacotron2
!git checkout josha00-patch-1
# Initialise and fetch the submodules declared by the repository.
!git submodule init
!git submodule update

#Install requirements
# NOTE(review): installs whatever versions requirements.txt resolves to; nothing is pinned in this cell.
!pip install -r requirements.txt
import os
import time
import argparse
import math
from numpy import finfo
import numpy as np
import random
random.seed(0)
import matplotlib.pyplot as plt
from google.colab import files
import shutil
import numpy as np

import torch
from distributed import apply_gradient_allreduce
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

from model import Tacotron2
from data_utils import TextMelLoader, TextMelCollate
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger
from hparams import create_hparams

In [None]:
#get pretrained tacotron model to use as starting point (note: this one is the LJ pretrained model)
import gdown
# The file is saved as /content/tacotron2/pretrained_model, which is the path train()
# passes to warm_start_model() when warm_start is enabled.
gdown.download('https://drive.google.com/uc?id=1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA', '/content/tacotron2/pretrained_model', quiet=False)


In [None]:
#upload audio files for training

os.chdir('/content/tacotron2')

from google.colab import files
import shutil

# Source folder on Drive and destination folder inside the cloned repo.
audio_path = "/content/drive/MyDrive/_/wavs/."
audio_to = "/content/tacotron2/wavs"

# Drop any copy left by a previous run; copytree below expects a fresh destination.
if os.path.exists(audio_to):
  shutil.rmtree(audio_to)

shutil.copytree(audio_path, audio_to)

In [None]:
  #upload ONLY SOME audio files for training
os.chdir('/content/tacotron2')

upload_some=True #set to true if want this

percent=.5     #percentage of the audio files you want to upload

if upload_some:
    !pip install tqdm
    from tqdm import tqdm

    audio_path = "/content/drive/MyDrive/_/wavs/"
    audio_to = "/content/tacotron2/wavs"
    audio_files = os.listdir(audio_path)
    audio_files.sort()

    # Select some files only
    selected_files=audio_files

    split_point=round(len(audio_files)*percent)  #amount of audio files you want to upload

    random.shuffle(selected_files)
    selected_files=selected_files[:split_point]
    selected_files.sort()

    os.makedirs(audio_to, exist_ok=True)

    for file_name in tqdm(selected_files, desc="Copying Files", unit="file"):
        source_file_path = os.path.join(audio_path, file_name)
        dest_file_path = os.path.join(audio_to, file_name)
        shutil.copy2(source_file_path, dest_file_path)

    print("Uploaded audio files to", audio_to)
    print('audio files:',len(os.listdir('wavs')))

In [None]:
#change audio bits/sample and sample rate if needed

# Gate for the conversion: process_audio_files() is only called at the bottom of this cell when this is True.
change=False     #set to true if needed

def convert_audio(input_file_path, output_file_path):
    # Shells out to sox with volume factor 0.95 (-v), 22050 Hz sample rate (-r) and 16 bits per sample (-b).
    # NOTE(review): sox is given separate input and output arguments; confirm callers do not pass the same path for both.
    !sox -v 0.95 "$input_file_path" -r 22050 -b 16 "$output_file_path" #reduce 0.95 further if get clipping issues

def process_audio_files(input_dir, output_dir):
    """Run convert_audio() on every file in input_dir, writing results to output_dir.

    Files are processed in alphabetical order and keep their file name.
    When a file's input and output paths are the same (the notebook calls this
    with identical input and output folders), the conversion is written to a
    temporary sibling file first and then moved over the original, because
    sox cannot safely read from and write to the same file.

    Args:
        input_dir: directory containing the audio files to convert.
        output_dir: directory that receives the converted files.
    """
    file_names = sorted(os.listdir(input_dir))
    total = len(file_names)

    # Count from 1 so the progress line reads "file 1 of N" ... "file N of N".
    for index, file_name in enumerate(file_names, start=1):
        input_file_path = os.path.join(input_dir, file_name)
        output_file_path = os.path.join(output_dir, file_name)

        if os.path.abspath(input_file_path) == os.path.abspath(output_file_path):
            # In-place conversion: keep the original extension on the temp name
            # so sox can still infer the output format from it.
            stem, extension = os.path.splitext(output_file_path)
            temp_file_path = stem + '.converting' + extension
            convert_audio(input_file_path, temp_file_path)
            # convert_audio() returns nothing, so the presence of a non-empty
            # temp file is used as the success signal before replacing the original.
            if os.path.exists(temp_file_path) and os.path.getsize(temp_file_path) > 0:
                os.replace(temp_file_path, output_file_path)
            else:
                print(f"conversion produced no output for {input_file_path}; original kept")
                continue
        else:
            convert_audio(input_file_path, output_file_path)

        print(f"file {index} of {total} format changed")

# Input and output are the same folder, so converted clips are written under the same names as the originals.
input_directory = '/content/tacotron2/wavs'
output_directory = '/content/tacotron2/wavs'

# Make sure the destination exists even when no audio has been copied yet.
if not os.path.exists(output_directory):
      os.makedirs(output_directory)

# Conversion only runs when the `change` flag at the top of this cell is True.
if change:
    process_audio_files(input_directory, output_directory)


In [None]:
#TRIM OUT SILENCE

!pip install pydub
from pydub import AudioSegment
from pydub.silence import detect_leading_silence, detect_leading_silence

def trim_start_and_end(audio_path, output_path, silence_threshold,silence_threshold2):
    """Strip leading and trailing silence from a wav file and write the result.

    Args:
        audio_path: path of the wav file to read.
        output_path: path the trimmed wav is exported to.
        silence_threshold: threshold passed to detect_leading_silence for the start of the clip.
        silence_threshold2: threshold passed to detect_leading_silence for the end of the clip
            (measured on the reversed audio).
    """
    clip = AudioSegment.from_file(audio_path, format="wav")

    # Leading silence of the clip, and of its reverse (i.e. the trailing silence).
    leading = detect_leading_silence(clip, silence_threshold)
    trailing = detect_leading_silence(clip.reverse(), silence_threshold2)

    # Keep only the span between the two silent edges and export it as wav.
    clip[leading:len(clip) - trailing].export(output_path, format="wav")

input_folder = 'wavs'
output_folder = 'wavs'

# Trim every .wav in the folder; output uses the same name, so files are overwritten in place.
wav_names = [name for name in os.listdir(input_folder) if name.endswith(".wav")]
for filename in wav_names:
    trim_start_and_end(
        os.path.join(input_folder, filename),
        os.path.join(output_folder, filename),
        -40, -50)    #increasing (closer to 0) cuts more audio

print("Trimming complete.")

In [None]:
#get audio duration
import librosa

folder_path = "wavs"

# Collect the duration of each .wav clip, then add them up.
clip_durations = []
for filename in os.listdir(folder_path):
    if not filename.endswith(".wav"):
        continue
    y, sr = librosa.load(os.path.join(folder_path, filename))
    clip_durations.append(librosa.get_duration(y=y, sr=sr))

total_duration = sum(clip_durations)

print(f"Total Duration of Audio Clips: {total_duration/60} minutes")


In [None]:
#Convert audio to mels (code modified from data_utils.py)

##################################################

# When True, mels are computed from the wav files found in wav_location;
# otherwise they are computed from the copies in /content/tacotron2/wavs.
meldirect=False   #set to true to get mels from files directly in colab, rather than copying files to colab first
wav_location = '/content/drive/MyDrive/wavs'

##################################################

import layers
from utils import load_wav_to_torch, load_filepaths_and_text
hparams = create_hparams()

# STFT/mel front end configured from the hyperparameters above; get_mel() below reads it as a global.
stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)

def get_mel(filename):
    """Compute the mel spectrogram of one wav file as a NumPy array.

    Uses the module-level `stft` and `hparams` objects defined in this cell.

    Args:
        filename: path of the wav file to load with load_wav_to_torch.

    Returns:
        The output of stft.mel_spectrogram with its leading dimension squeezed,
        converted to a NumPy array.

    Raises:
        ValueError: if the file's sample rate differs from stft.sampling_rate.
    """
    audio, sampling_rate = load_wav_to_torch(filename)
    if sampling_rate != stft.sampling_rate:
        # Three placeholders need three arguments; the original passed two,
        # which raised IndexError instead of this ValueError.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            filename, sampling_rate, stft.sampling_rate))
    # Scale by max_wav_value and add a leading dimension before the STFT.
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0).cpu().numpy()
    return melspec

# Mels are always written next to the local wavs, whatever the source folder is.
mel_output_dir = '/content/tacotron2/wavs'

wav_dir = wav_location if meldirect else mel_output_dir
if meldirect:
    print(wav_location)

if not os.path.exists(mel_output_dir):
    os.makedirs(mel_output_dir)

# List the wav files up front, then save one <name>.npy mel per wav.
wav_files = [f for f in os.listdir(wav_dir) if f.endswith('.wav')]
for wf in wav_files:
    mel_name = os.path.splitext(wf)[0] + '.npy'
    np.save(os.path.join(mel_output_dir, mel_name),
            get_mel(os.path.join(wav_dir, wf)))

os.chdir('/content/tacotron2')

In [None]:
#upload transcript (in format name.npy|transcription)

%cd /content/tacotron2/filelists/
# Remove any transcript left by an earlier run before copying the current one from Drive.
if os.path.exists('list.txt'):
  !rm list.txt

transcript_path = "/content/drive/MyDrive/_/list.txt"
copy_to = "/content/tacotron2/filelists"

shutil.copy(transcript_path, os.path.join(copy_to, 'list.txt'))
# Step back up to /content/tacotron2; later cells use paths relative to it (e.g. 'wavs', 'filelists').
%cd ..

In [None]:
#delete any file lines that have no matching audio

def delete_lines(transcript, base_dir='/content/tacotron2/'):
    """Rewrite a transcript in place, keeping only lines whose audio file exists.

    Each line is expected in the form ``<mel path>|<transcription>``. The mel
    path's extension is swapped for ``.wav`` and resolved against ``base_dir``;
    lines whose wav file is missing are dropped and the missing path is printed.

    Args:
        transcript: path of the transcript file to filter (overwritten in place).
        base_dir: directory the paths in the transcript are relative to.
    """
    with open(transcript, "r") as f:
        lines = f.readlines()

    adjusted_lines = []
    for line in lines:
        # First field is the mel path; splitext drops whatever extension it has
        # rather than blindly cutting the last four characters.
        mel_entry = line.split('|')[0].strip()
        path_to_audio = os.path.join(
            base_dir, os.path.splitext(mel_entry)[0] + '.wav')

        if os.path.exists(path_to_audio):
            adjusted_lines.append(line)
        else:
            print(path_to_audio)

    with open(transcript, "w") as f:
        f.writelines(adjusted_lines)

# Filter the transcript copied into filelists/ (the file is rewritten in place).
delete_lines("/content/tacotron2/filelists/list.txt")


In [None]:
#split files into train and validate

train_amount=.5      #set what percentage of the dataset you want for training

with open('/content/tacotron2/filelists/list.txt', 'r') as list_file:
    wav_files = [entry.strip() for entry in list_file]

# Shuffle once, then cut the shuffled list at the requested fraction.
random.shuffle(wav_files)
train_to = round(len(wav_files) * train_amount)

train_files = sorted(wav_files[:train_to])
val_files = sorted(wav_files[train_to:])

# Write each split to filelists/, one transcript line per row, in sorted order.
for split_name, split_lines in (('train_files.txt', train_files),
                                ('val_files.txt', val_files)):
    with open(os.path.join('filelists', split_name), 'w') as split_file:
        for entry in split_lines:
            split_file.write(entry + '\n')


In [None]:
#get number of training clips

# One transcript line per clip, so the line count is the number of training clips.
with open('filelists/train_files.txt', 'r') as file:
    num_train = len(file.readlines())
print(f"Number of training clips: {num_train}")


In [None]:
#get training audio duration
import librosa

total_duration = 0

with open('/content/tacotron2/filelists/train_files.txt', 'r') as file:
  train_files_content = file.read()

# Exact clip names listed for training: the first '|' field of each line, without
# directory or extension. Matching on this set avoids the substring false positives
# of testing against the whole file text (e.g. clip "1" matching a line for "11").
train_clip_names = {
    os.path.splitext(os.path.basename(line.split('|')[0].strip()))[0]
    for line in train_files_content.splitlines()
    if line.strip()
}

for filename in os.listdir("wavs"):
    if filename.endswith(".wav") and os.path.splitext(filename)[0] in train_clip_names:
        audio_file = os.path.join('/content/tacotron2/wavs', filename)
        y, sr = librosa.load(audio_file)
        duration = librosa.get_duration(y=y, sr=sr)
        total_duration += duration

print(f"Total Duration of Training Audio Clips: {total_duration/60} minutes")

In [None]:
#edited version of train.py code
import os
import time
import argparse
import math
from numpy import finfo
import random
import matplotlib.pyplot as plt

import torch
from distributed import apply_gradient_allreduce
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

from model import Tacotron2
from data_utils import TextMelLoader, TextMelCollate
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger
from hparams import create_hparams


def reduce_tensor(tensor, n_gpus):
    """Return the element-wise mean of `tensor` across the process group.

    The input is cloned first, so the caller's tensor is left untouched.

    Args:
        tensor: tensor to average across processes.
        n_gpus: number of participating processes; the all-reduced sum is divided by it.

    Returns:
        A new tensor holding the summed values divided by n_gpus.
    """
    rt = tensor.clone()
    # dist.ReduceOp replaces the deprecated dist.reduce_op alias.
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt


def init_distributed(hparams, n_gpus, rank, group_name):
    """Select this process's CUDA device and join the distributed process group.

    Args:
        hparams: object providing dist_backend and dist_url.
        n_gpus: world size passed to init_process_group.
        rank: rank of this process; also used to pick the CUDA device.
        group_name: forwarded to init_process_group.
    """
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Map the rank onto one of the visible GPUs.
    device_index = rank % torch.cuda.device_count()
    torch.cuda.set_device(device_index)

    # Join the process group with the backend/URL taken from hparams.
    process_group_options = dict(
        backend=hparams.dist_backend,
        init_method=hparams.dist_url,
        world_size=n_gpus,
        rank=rank,
        group_name=group_name,
    )
    dist.init_process_group(**process_group_options)

    print("Done initializing distributed")


def prepare_dataloaders(hparams):
    """Build the training DataLoader plus the validation dataset and collate function.

    Args:
        hparams: hyperparameters; reads training_files, validation_files,
            n_frames_per_step, distributed_run and batch_size.

    Returns:
        (train_loader, valset, collate_fn)
    """
    trainset = TextMelLoader(hparams.training_files, hparams)
    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # A DistributedSampler is used for distributed runs; otherwise the loader shuffles itself.
    train_sampler = DistributedSampler(trainset) if hparams.distributed_run else None
    shuffle = train_sampler is None

    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=shuffle,
        sampler=train_sampler,
        batch_size=hparams.batch_size,
        pin_memory=False,
        drop_last=True,
        collate_fn=collate_fn,
    )
    return train_loader, valset, collate_fn


def prepare_directories_and_logger(output_directory, log_directory, rank):
    """Create the output directory and a Tacotron2Logger on rank 0.

    Args:
        output_directory: directory created (mode 0o775) if it does not exist.
        log_directory: joined onto output_directory to form the logger's path.
        rank: process rank; only rank 0 touches the filesystem and gets a logger.

    Returns:
        A Tacotron2Logger for rank 0, otherwise None.
    """
    if rank != 0:
        return None

    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)

    return Tacotron2Logger(os.path.join(output_directory, log_directory))


def load_model(hparams):
    """Instantiate Tacotron2 on the GPU, applying fp16 and distributed tweaks.

    Args:
        hparams: hyperparameters; reads fp16_run and distributed_run.

    Returns:
        The model, wrapped by apply_gradient_allreduce when distributed_run is set.
    """
    model = Tacotron2(hparams).cuda()

    if hparams.fp16_run:
        # Use the smallest float16 value as the attention score mask value.
        attention = model.decoder.attention_layer
        attention.score_mask_value = finfo('float16').min

    return apply_gradient_allreduce(model) if hparams.distributed_run else model


def warm_start_model(checkpoint_path, model, ignore_layers):
    """Load weights from a checkpoint into `model`, optionally skipping some layers.

    Args:
        checkpoint_path: path to a checkpoint file containing a 'state_dict' entry.
        model: module to load the weights into.
        ignore_layers: state-dict keys to leave at the model's current values.

    Returns:
        The same model, after loading.
    """
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))

    weights = torch.load(checkpoint_path, map_location='cpu')['state_dict']

    if len(ignore_layers) > 0:
        # Start from the model's own weights and overlay every checkpoint
        # entry that is not in ignore_layers.
        merged = model.state_dict()
        merged.update({name: value for name, value in weights.items()
                       if name not in ignore_layers})
        weights = merged

    model.load_state_dict(weights)
    return model


def load_checkpoint(checkpoint_path, model, optimizer):
    """Restore model and optimizer state from a training checkpoint.

    Args:
        checkpoint_path: path to a file holding 'state_dict', 'optimizer',
            'learning_rate' and 'iteration' entries.
        model: module that receives the saved weights.
        optimizer: optimizer that receives the saved state.

    Returns:
        (model, optimizer, learning_rate, iteration)
    """
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))

    saved = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(saved['state_dict'])
    optimizer.load_state_dict(saved['optimizer'])
    learning_rate, iteration = saved['learning_rate'], saved['iteration']

    print("Loaded checkpoint '{}' from iteration {}" .format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write model/optimizer state plus training progress to `filepath`.

    Args:
        model: module whose state_dict is saved.
        optimizer: optimizer whose state_dict is saved.
        learning_rate: learning rate stored alongside the weights.
        iteration: iteration counter stored alongside the weights.
        filepath: destination file passed to torch.save.
    """
    print("Saving model at iteration {} to {}".format(
        iteration, filepath))
    snapshot = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    torch.save(snapshot, filepath)


def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank, epoch, start_eposh, learning_rate,train_loss):
    """Score the model on the validation set and, on rank 0, report the results.

    On rank 0 this prints a summary line, logs to the Tacotron2 logger, shows
    one randomly chosen alignment from the last validation batch, appends a
    row to ``val_loss.csv`` and saves the alignment image under
    ``alignments/``. Both outputs go to the module-level ``output_directory``
    defined in the parameters cell, which must be set before training starts.

    Args:
        model: model to evaluate; switched to eval mode and back to train mode.
        criterion: loss function called as criterion(y_pred, y).
        valset: validation dataset.
        iteration: current training iteration (used for logging).
        batch_size: validation batch size.
        n_gpus: number of processes, used when averaging the loss in distributed runs.
        collate_fn: collate function for the validation DataLoader.
        logger: object with log_validation(); only used on rank 0.
        distributed_run: whether to use a DistributedSampler and reduce the loss.
        rank: process rank; reporting only happens on rank 0.
        epoch: current epoch (used in the printout, CSV row and image file name).
        start_eposh: time.perf_counter() value taken at the start of training.
        learning_rate: learning rate to report.
        train_loss: mean training loss of the epoch, written to the CSV.

    Raises:
        ValueError: if the validation loader yields no batches.
    """
    import csv

    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        num_batches = 0
        for batch in val_loader:
            num_batches += 1
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss

    model.train()

    # Without at least one batch there is no loss to average and no prediction to plot.
    if num_batches == 0:
        raise ValueError(
            "Validation set produced no batches; check the validation file list.")
    val_loss = val_loss / num_batches

    if rank != 0:
        return

    elapsed_minutes = (time.perf_counter() - start_eposh) / 60
    print("Epoch: {} Validation loss {}: {:9f}  Time: {:.1f}m LR: {:.6f}".format(epoch, iteration, val_loss, elapsed_minutes, learning_rate))
    logger.log_validation(val_loss, model, y, y_pred, iteration)

    # Plot one randomly chosen alignment from the last validation batch.
    # (No %matplotlib magic here: line magics do not belong inside a function body.)
    _, mel_outputs, gate_outputs, alignments = y_pred
    idx = random.randint(0, alignments.size(0) - 1)
    alignment = alignments[idx].data.cpu().numpy().T
    fig, ax = plt.subplots(figsize=(5, 3))
    try:
        ax.imshow(alignment, cmap='inferno', aspect='auto', origin='lower', interpolation='none')
        plt.tight_layout()
        fig.canvas.draw()
        plt.show()

        # Append this epoch's metrics; the header is written only while the file is still empty.
        csv_file_path = os.path.join(output_directory, 'val_loss.csv')
        loss_entry = [epoch, val_loss, train_loss, iteration, elapsed_minutes, learning_rate]
        with open(csv_file_path, 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            if os.path.getsize(csv_file_path) == 0:
                csv_writer.writerow(["Epoch", "Validation Loss", "Training Loss","Iteration", "Time","Learning Rate"])
            csv_writer.writerow(loss_entry)
            csv_file.flush()

        # Keep a copy of the alignment image for this epoch.
        alignments_dir = os.path.join(output_directory, 'alignments')
        os.makedirs(alignments_dir, exist_ok=True)
        file_name = f'alignment_epoch_{epoch}.png'
        image_path = os.path.join(alignments_dir, file_name)
        fig.savefig(image_path)
    finally:
        # One figure is created per epoch; release it so figures do not pile up in memory.
        plt.close(fig)


def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams, save_interval):
    """Train the model, validating and logging once per epoch.

    Each epoch runs over the whole training loader, then calls validate().
    The learning rate for every batch comes from the module-level getLR(epoch)
    function, which must be defined before this is called.

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path (string): checkpoint to resume from if the file exists; also
        the prefix for saved checkpoints (checkpoint_path + 'epoch' + N)
    warm_start (bool): when no checkpoint file exists, load weights from
        /content/tacotron2/pretrained_model before training
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    group_name (string): passed to init_distributed for distributed runs
    hparams (object): hyperparameter object (see create_hparams)
    save_interval (int): save a checkpoint every this many epochs (and after
        the final epoch)
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    # NOTE(review): load_model() already applies apply_gradient_allreduce when
    # distributed_run is set, so the model looks wrapped twice here - confirm.
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None and os.path.isfile(checkpoint_path):

            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            # Resume at the epoch implied by the saved iteration count and batches per epoch.
            epoch_offset = max(0, int(iteration / len(train_loader)))
    else:
      # No checkpoint to resume from: optionally start from the pretrained weights.
      if warm_start:
        model = warm_start_model("/content/tacotron2/pretrained_model", model, hparams.ignore_layers)

    # Reference time for the elapsed-minutes figure reported by validate().
    start_eposh = time.perf_counter()
    model.train()
    is_overflow = False

    if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
    if not os.path.isdir(log_directory):
            os.makedirs(log_directory)


    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        # Running sum of batch losses and batch count, used for the epoch mean below.
        train_loss = 0
        icount=0
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            icount+=1
            start = time.perf_counter()
            # The schedule from getLR() overwrites learning_rate on every batch, so the
            # value from hparams or a loaded checkpoint is not what the optimizer uses.
            learning_rate=getLR(epoch)
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                # is_overflow is only updated on the fp16 path; otherwise it stays False.
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            # NOTE(review): the optimizer steps even when is_overflow is True; only logging is skipped.
            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                logger.log_training(
                    reduced_loss, grad_norm, learning_rate, duration, iteration)
                print("Batch {} loss {:.6f} Grad Norm {:.6f} Time {:.6f}".format(iteration, reduced_loss, grad_norm, duration), end='\r', flush=True)

            train_loss += reduced_loss

            iteration += 1

        # Mean training loss for the epoch.
        # NOTE(review): icount is 0 if the loader yields no batches, which would divide by zero here.
        train_loss = train_loss/(icount)

        validate(model, criterion, valset, iteration,
                 hparams.batch_size, n_gpus, collate_fn, logger,
                 hparams.distributed_run, rank, epoch, start_eposh, learning_rate,train_loss)


        # Checkpoints are named <checkpoint_path>epoch<N>; there is no rank check on this save.
        if (epoch+1) % save_interval == 0 or (epoch+1) == hparams.epochs:
            save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path + 'epoch' + str(epoch+1))

# Build default hyperparameters and apply their cuDNN flags to torch's global backend settings.
hparams = create_hparams()
torch.backends.cudnn.enabled = hparams.cudnn_enabled
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark


In [None]:
#set parameters

# Start from fresh default hyperparameters, then override the ones this notebook needs.
hparams = create_hparams()
# File lists written by the train/validation split cell.
hparams.training_files ="filelists/train_files.txt"
hparams.validation_files = "filelists/val_files.txt"
hparams.p_attention_dropout=0.1
hparams.p_decoder_dropout=0.1
# The transcript lists name.npy entries, i.e. mels precomputed by the mel conversion cell.
hparams.load_mel_from_disk = True
# Empty list: warm_start_model() loads every layer from the pretrained checkpoint.
hparams.ignore_layers = []
# Single-process setup: these are passed straight through to train().
n_gpus=1
rank=0
group_name=None

################################################################
def getLR(epoch):
    """Return the learning rate for the given epoch (stepwise schedule).

    Epochs up to and including 50 use 5e-5, epochs 51-100 use 2e-5, and
    later epochs use 1e-5.
    """
    if epoch <= 50:
        return 5e-5
    if epoch <= 100:
        return 2e-5
    return 1e-5

model_name='NAME'    #set name of model
warm_start=True   #if true, train from pretrained model starting point
save_interval = 50 #how often you want to save a copy of the model
hparams.epochs = 200
# num_train comes from the "get number of training clips" cell.
# NOTE(review): min(num_train,1) is 1 for any non-empty training set; raise the second
# argument if a larger batch size is intended.
hparams.batch_size = min(num_train,1)
################################################################


# All outputs go under a per-model folder on Drive.
output_directory = os.path.join('/content/drive/MyDrive/colab/', model_name)
log_directory = os.path.join(output_directory, 'Logs')
# train() resumes from this file if it exists and saves checkpoints as <checkpoint_path>epoch<N>.
checkpoint_path = output_directory+(r'/')+'model'

Note: to continue training from a saved checkpoint, e.g. from epoch 50:
* In your output directory (the folder named after `model_name`) there will be a checkpoint file called 'modelepoch50'.
* Rename this file to 'model'; the next training run will then resume from this checkpoint.


In [None]:
#RUN TRAINING

# All arguments come from the parameters cell above; run that cell first.
train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank, group_name, hparams,save_interval)