# notes: TESTING 
- Before running, first run the `quality.py` script in the digitalclone-backend repo to convert the file types and remove the white noise.
- The main folder containing the output-ready files should be:
    - `..../digitalclone-backend/aws_jobs_voiceclone/tts_tests/testdata2/clean_quality` 

## trying on dec 28 2022 
- with a few hundred audio samples from < 100 and > 200 
- 176 audio samples


In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/drive.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# `os` is needed later for os.path.join when the training config is built.
import os
# Clone the project repo into the current working directory. The dataset paths
# used later point at /content/digitalclone-backend, so this assumes the
# notebook's working directory is /content -- TODO confirm.
!git clone https://github.com/hantswilliams/digitalclone-backend.git
# Pull right after cloning; presumably this is here so that re-running the cell
# refreshes an already-existing checkout.
!cd digitalclone-backend/ && git pull

In [None]:
# Install the `tts` package from PyPI (no version pin); the `TTS` modules
# imported later in the notebook come from it.
!pip install tts
# Invoke the `tts` command-line tool with no arguments -- presumably a quick
# check that the install put the CLI on PATH.
!tts

In [None]:
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.models.vits import Vits
from TTS.tts.configs.vits_config import VitsConfig
from trainer import Trainer, TrainerArgs
from TTS.tts.datasets import load_tts_samples

In [None]:
# Training outputs go to the mounted Google Drive folder.
output_path = "/content/drive/MyDrive/VoiceCloning/output"

# Folder of cleaned, output-ready audio clips inside the cloned repo.
dataset_path = "/content/digitalclone-backend/aws_jobs_voiceclone/tts_tests/testdata2/clean_quality/"

# Metadata file listing the clips; passed to the dataset config as meta_file_train.
location_metaData = "/content/digitalclone-backend/aws_jobs_voiceclone/tts_tests/testdata2/metaData_list_all.txt"

In [None]:
# Audio-processing values forwarded to BaseAudioConfig below.
tpower = 1.3
tpreemphasis = 0.98
tdb = 20

# Dataset description, parsed with the "ljspeech" formatter.
# `dataset_path` is an absolute path, so it is passed as-is. The earlier
# os.path.join(output_path, dataset_path) produced the same value, because
# os.path.join discards every component that precedes an absolute one, but it
# read as if the dataset lived under the Drive output folder.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train=location_metaData,
    path=dataset_path,
)

# Audio settings for feature extraction.
audio_config = BaseAudioConfig(
    sample_rate=16000,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
    power=tpower,
    preemphasis=tpreemphasis,
    ref_level_db=tdb,
)

# Full VITS training configuration.
config = VitsConfig(
    audio=audio_config,
    run_name="vits_ljspeech",
    # Batching and data loading.
    batch_size=32,
    eval_batch_size=16,
    batch_group_size=5,
    num_loader_workers=8,
    num_eval_loader_workers=4,
    # Evaluation and schedule.
    run_eval=True,
    test_delay_epochs=-1,
    epochs=5000,
    # Text front end: English cleaners plus en-us phonemes; the phoneme cache
    # is kept next to the training outputs.
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    # Logging.
    print_step=25,
    print_eval=True,
    # Runtime and outputs.
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    cudnn_benchmark=False,
)

In [None]:
# Load the dataset's samples and split them into training and evaluation sets.
# eval_split_size is hard-coded to 0.1 here instead of using
# config.eval_split_size; the max-size cap still comes from the config.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=0.1,
    eval_split_max_size=config.eval_split_max_size,
)

# Build the audio processor first, then the tokenizer. The tokenizer factory
# also returns a config object, which replaces `config` from this point on.
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Assemble the VITS model; no speaker manager is supplied.
model = Vits(config, ap, tokenizer, speaker_manager=None)

In [None]:
# Hand the model and both sample sets to a Trainer created with default
# TrainerArgs, passing `output_path` as its output location.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start the training run.
trainer.fit()

In [None]:
## test it!

In [None]:
# !tts --text "this is my new voice, lets see how it works. Maybe it does not sound like me." \
#       --model_path '/content/drive/MyDrive/VoiceCloning/output/vits_ljspeech-December-17-2022_08+58PM-0000000/best_model_3144.pth' \
#       --config_path '/content/drive/MyDrive/VoiceCloning/output/vits_ljspeech-December-17-2022_08+58PM-0000000/config.json' \
#       --out_path 'out.wav'