<a href="https://colab.research.google.com/github/korakoe/VALL-E-X/blob/main/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# @title Mount Drive
# Expose Google Drive inside the Colab VM; the dataset and experiment paths
# configured in later cells point below this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## Download Dependencies

In [None]:
# @title Clone Repo

!git clone https://github.com/korakoe/VALL-E-X-Trainer.git
%cd VALL-E-X-Trainer

In [None]:
# @title Install Requirements
!pip install -r requirements.txt
!pip install torchmetrics

# Create Dataset
Skip this section if you have already processed a dataset
<br>
<br>
The dataset directory should be laid out like this; creating the dataset will automatically transcribe the audio files.
```
data_dir
├── bpe_69.json
├── utt1.wav
├── utt2.wav
├── utt3.wav
......
└── utt{n}.wav
```

ALTERNATIVELY!
<br>
You can use a slightly modified LJSpeech dataset format (with language tags added). Each line must contain only the file path and the transcript, like so:
```
wavs/1.wav|[EN]Hello World.[EN]
```



In [None]:
# @title Create Dataset (SLOW! - ONLY NEEDS TO BE DONE ONCE - LOAD FROM DRIVE IF POSSIBLE)
data_path = "/content/drive/MyDrive/test" # @param {type:"string"}

%cd VALL-E-X-Trainer

import wget
from customs.make_custom_dataset import create_dataset

'''
How should the data_dir be created?
Place the necessary audio files in data_dir.
Transcription, tokenization, etc. of the audio files are done by the create_dataset function.

data_dir
├── bpe_69.json
├── utt1.wav
├── utt2.wav
├── utt3.wav
......
└── utt{n}.wav
'''

wget.download("https://raw.githubusercontent.com/0417keito/VALL-E-X-Trainer-by-CustomData/master/utils/g2p/bpe_69.json", data_path)

create_dataset(data_path, dataloader_process_only=True)

In [None]:
# @title Create Dataset (LJSPeech | Considerably Faster)
import os
import wget

txt_path = "/content/drive/MyDrive/voice_clones/flawful/wavs.txt" # @param {type:"string"}

%cd VALL-E-X-Trainer

wget.download("https://raw.githubusercontent.com/0417keito/VALL-E-X-Trainer-by-CustomData/master/utils/g2p/bpe_69.json", os.path.dirname(txt_path))

from customs.make_custom_dataset import create_dataset_ljspeech

create_dataset_ljspeech(txt_path, dataloader_process_only=True)

# Begin Training

In [None]:
# @title Train Model
%cd VALL-E-X-Trainer

data_path = "/content/drive/MyDrive/voice_clones/flawful/" # @param {type:"string"}
validation_path = "/content/drive/MyDrive/voice_clones/flawful/" # @param {type:"string"}
dtype = "float16" # @param ["float32", "float16", "bfloat16"]
exp_dir = "/content/drive/MyDrive/valle/flawful" # @param {type:"string"}
model_name = "flawful" # @param {type:"string"}
epochs = 30 # @param {type:"integer"}
start_from_pretrained_checkpoint = 1 # @param {type:"integer"}
base_lr = 1e-4 # @param {type:"number"}
save_every_n_steps = 500 # @param {type:"integer"}
valid_every_n_steps = 500 # @param {type:"integer"}
keep_last = 3 # @param {type:"integer"}
grad_accum = 10 # @param {type:"integer"}
train_type = "both" # @param ["both", "AR", "NAR"]
max_duration = 120 # @param {type:"integer"}
max_size = 5 # @param {type:"integer"}

if train_type == "both":
  stage = 0
elif train_type == "AR":
  stage = 1
elif train_type == "NAR":
  stage = 2

!python train.py \
--train_dir $data_path \
--valid_dir $validation_path \
--model-name $model_name \
--exp-dir $exp_dir \
--dtype $dtype \
--num-epochs $epochs \
--start-epoch $start_from_pretrained_checkpoint \
--base-lr $base_lr \
--save-every-n $save_every_n_steps \
--valid-interval $valid_every_n_steps \
--keep-last-k $keep_last \
--accumulate-grad-steps $grad_accum \
--train-stage $stage \
--max-duration $max_duration \
--max-size $max_size
