Intunist's notebook for NNSVS rapid prototyping. **_(Kaggle/Google Cloud version)_**
<br/>Intended for team use only. **No external support** is provided if anything is non-functional.

Copying and modifying this notebook is **not permitted**.

TEAM MEMBERS: **commit your version; the model will be ready to download once execution finishes.**

In [None]:
#Install NNSVS
%mkdir /content
%cd -q /content
!git clone https://github.com/intunist/ETK.git ETK &> /dev/null
!bash /content/ETK/train/install.sh
!ln -sf /content/ETK/train/conf/train/acoustic/model/acoustic_conv1dresnet.yaml /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml &> /dev/null
!ln -sf /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml ACOUSTIC_SETTINGS.yaml &> /dev/null

In [None]:
#Download file from google drive
# Work from /content: the extraction cells below look for /content/dataset.* and /content/dump.*.
%cd /content
# NOTE: the URL ends at `id=`; append the Google Drive file ID before running this cell.
!gdown https://drive.google.com/uc?id=

In [None]:
#Decompress dataset
empty_dataset_folder = False 
compressed_dataset_path = "/content/dataset.*" 

if empty_dataset_folder == True:
  !rm -rf /content/ETK/singing_database

!7za  -bso0 -y x "$compressed_dataset_path" -o/content/ETK/singing_database/
print('\nDone!')

In [None]:
#Alternative: Extract dump.
# Instead of re-running feature extraction (stage 1), unpack a previously archived dump.
# ResF0 and non-ResF0 models need separate dumps.
compressed_dump_path = "/content/dump.*"
# Files are extracted to `/content/ETK/train/dump/`. The archived folder needs to match the singer name.

# `$compressed_dump_path` is substituted by IPython before the command runs.
!7za  -bso0 -y x "$compressed_dump_path" -o/content/ETK/train/dump/
print('\nDone!')

# Configure NNSVS
---

In [None]:
#@Change Language and model
# Changing these settings will require you to run feature generation again.

lang = "Japanese" # can be "Japanese", "English", "Romance", "Chinese", "Polish", or "Custom"
singer_name = "Unnamed_Singer"
# No spaces in singer_name. Use underscores `_`.

# Custom language settings (only read when lang == "Custom").
custom_hed = "custom.hed"
custom_table = "custom.table"

model = "Conv1dResnet" # "Conv1dResnet", "ResF0Conv1dResnet", "ResSkipF0FFConvLSTM", "ResF0VariancePredictor", "ResF0NonAttentiveTacotron"
use_mdn = False
# No MDN for tacotron. Don't use below 1hr of data.

# Only sine vibrato is supported.
vibrato = "none" # "none" or "sine". no "diff" support.
# might require ResF0

sample_rate = "44100" # "44100" or "48000"

d4c_threshold = 0.25 # 0.00 - 1.00

# Locate the installed nnsvs package so its bundled configs can be edited in place;
# resolving it at runtime makes the notebook more likely to work on plain Jupyter.
# `importlib.util` is a submodule and must be imported explicitly: a bare
# `import importlib` does not guarantee that `importlib.util` is available.
import importlib.util
nnsvs_spec = importlib.util.find_spec("nnsvs")
if nnsvs_spec is None or not nnsvs_spec.submodule_search_locations:
    # find_spec returns None when the package is missing; fail with a clear message.
    raise ImportError("The nnsvs package was not found. Run the install cell first.")
nnsvs_path = nnsvs_spec.submodule_search_locations[0]
print("NNSVS location: " + nnsvs_path)

def get_vowels():
  import subprocess
  vowels = subprocess.getoutput('sed -n -r \'s/^.*("C-Vowel"|"C-Vowels"|"C-Phone_Boin")\s+//p\' /content/ETK/train/hed/'+hed_file)
  junk = ['{', '}', '-', '+', '*']
  table = str.maketrans('', '', ''.join(junk))
  vowels = vowels.translate(table)
  vowels = vowels.replace(",","', '")
  print("Vowels: " + "'" + vowels + "'")
  !sed -i -r "s|(VOWELS = )(.+)|\1'{vowels}'|g" /content/ETK/train/stage0/compare_mono_align_and_mono_score.py
    
def get_in_dim():
    in_dim = 0
    with open("/content/ETK/train/hed/"+hed_file,'r') as site:
        for line in site.readlines():
            if line.startswith(('QS', 'CQS')):
                in_dim = in_dim + 1          
    acoustic_in_dim = in_dim + int(4)
    print("in_dim:", + in_dim)
    print("acoustic_in_dim:", + acoustic_in_dim)
    !sed -i -r 's|(in_dim:)(\s+)(.+)|\1\2{in_dim}|g' /content/ETK/train/conf/train/timelag/model/timelag_mdnv2.yaml
    !sed -i -r 's|(in_dim:)(\s+)(.+)|\1\2{in_dim}|g' /content/ETK/train/conf/train/duration/model/duration_mdnv2.yaml
    !sed -i -r 's|(in_dim:)(\s+)(.+)|\1\2{acoustic_in_dim}|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml

def get_idx():
    from nnmnkwii.io import hts
    binary_dict, continuous_dict = hts.load_question_set("/content/ETK/train/hed/" + hed_file)
    for n in range(len(binary_dict)):
        if binary_dict[n][0] in ("C-Silence", "C-Silences", "C-Phone_Muon"):
            in_rest_idx = n
            print("in_rest_idx:", in_rest_idx)
            !sed -i -r 's|(in_rest_idx:)(\s+)(.+)|\1\2{in_rest_idx}|g' /content/ETK/train/conf/train/acoustic/data/myconfig.yaml
    for n in range(len(continuous_dict)):
        if continuous_dict[n][0] in ("e1", "e1_absolute_pitch"): # the absolute pitch of the current note 
            in_lf0_idx = n+  len(binary_dict)
            print("in_lf0_idx:", in_lf0_idx)
            !sed -i -r 's|(in_lf0_idx:)(\s+)(.+)|\1\2{in_lf0_idx}|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
            !sed -i -r 's|(in_lf0_idx:)(\s+)(.+)|\1\2{in_lf0_idx}|g' /content/ETK/train/conf/train/acoustic/data/myconfig.yaml

!ln -sf /content/ETK/train/conf/train/acoustic/model/acoustic_{model.lower()}.yaml /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml &> /dev/null

#These will pull the "main" branch for each language to reduce maintanance.
if lang == "Japanese":
    print("Switching to Intunist Japanese")
    hed_file = "intunist_jp.hed"
    dic_file = "intunist_jp.table"
elif lang == "English":
    print("Switching to Intunist English")
    !git clone https://github.com/intunist/nnsvs-english-support lang-ext &> /dev/null
    hed_file = "intunist_en.hed"
    dic_file = "blank.table"
elif lang == "Romance":
    print("Switching to Legacy Romance Family Support")
    !git clone https://github.com/DYVAUX/nnsvs-romance-language-support lang-ext &> /dev/null
    hed_file = "DVX_lat.hed"
    dic_file = "blank.table"
elif lang == "CJK":
    print("Switching to Intunist CJK")
    hed_file = "intunist_cjk.hed"
    dic_file = "intunist_cjk.table"
    print("SELECTED LANGUAGE IS UNAVAILABLE")
elif lang == "Chinese":
    print("Switching to Archivoice Chinese")
    !git clone https://github.com/Archivoice/nnsvs-chinese-support lang-ext &> /dev/null
    hed_file = "chinese.hed"
    dic_file = "chinese.table"
elif lang == "Polish":
    print("Switching to SzTP Polish")
    !git clone https://github.com/SzopaTatyJarka/nnsvs-polish-support lang-ext &> /dev/null
    hed_file = "sztj_polish.hed"
    dic_file = "sztj_polish_phoneme.table"
elif lang == "Custom":
    print("Switching to Custom Language")
    hed_file = custom_hed
    dic_file = custom_table
!cp -r lang-ext/hed /content/ETK/train &> /dev/null
!cp -r lang-ext/dic /content/ETK/train &> /dev/null
!rm -rf lang-ext
    
get_in_dim()
if lang == 'Japanese':
  vowels = "'a', 'i', 'u', 'e', 'o', 'A', 'I', 'U', 'E', 'O', 'N'"
  print("Vowels: " + vowels)
  !sed -i -r "s|(VOWELS = )(.+)|\1{vowels}|g" /content/ETK/train/stage0/compare_mono_align_and_mono_score.py
#elif lang == 'Custom':
#  print("Vowels: " + vowels)
#  !sed -i -r "s|(VOWELS = )(.+)|\1{vowels}|g" /content/ETK/train/stage0/compare_mono_align_and_mono_score.py
else:
  get_vowels()

# Model-specific settings: models whose name starts with "Res" get the rest/lf0 feature
# indices and relative_f0 disabled; all other models get the indices reset to null.
print("Acoustic model: " + model)
if model.startswith('Res'):
    # Writes in_rest_idx / in_lf0_idx into the acoustic configs.
    get_idx()
    # Turn relative_f0 off in enuconfig.yaml and in the nnsvs static_*.yaml feature configs.
    !sed -i -r 's|(relative_f0:)(\s+)(.+)|\1\2false|g' /content/ETK/train/enuconfig.yaml
    !sed -i -r 's|(relative_f0:)(\s+)(.+)|\1\2false|g' {nnsvs_path}/bin/conf/prepare_features/acoustic/static_*.yaml
    # out_lf0_idx is fixed at 180 (presumably the offset right after the 180-dim first
    # output stream configured in the vibrato section - TODO confirm).
    !sed -i -r 's|(out_lf0_idx:)(\s+)(.+)|\1\2180|g' /content/ETK/train/conf/train/acoustic/data/myconfig.yaml
    # Training itself is started with run_resf0.sh in the "Main Training" cell.
else:
    # NOTE(review): the original note here asks for relative_f0 to be set back to true in
    # static_deltadelta_* and enuconfig, but no command below does that. After a "Res" run
    # in the same session relative_f0 stays false - confirm whether that is intended.
    # Training itself is started with run.sh in the "Main Training" cell.
    # Reset all feature indices to null.
    !sed -i -r 's|(in_rest_idx:)(\s+)(.+)|\1\2null|g' /content/ETK/train/conf/train/acoustic/data/myconfig.yaml
    !sed -i -r 's|(in_lf0_idx:)(\s+)(.+)|\1\2null|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
    !sed -i -r 's|(in_lf0_idx:)(\s+)(.+)|\1\2null|g' /content/ETK/train/conf/train/acoustic/data/myconfig.yaml
    !sed -i -r 's|(out_lf0_idx:)(\s+)(.+)|\1\2null|g' /content/ETK/train/conf/train/acoustic/data/myconfig.yaml

#set vibrato
print("Vibrato Mode: "+ str(vibrato))
if vibrato == "none":
  !sed -i -r 's|(stream_sizes:)(\s+)(.+)|\1\2\[180, 3, 1, 15\]|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(has_dynamic_features:)(\s+)(.+)|\1\2\[true, true, false, true\]|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(out_dim:)(\s+)(.+)|\1\2199|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(vibrato_mode:)(\s+)(.+)|\1\2none|g' {nnsvs_path}/bin/conf/prepare_features/acoustic/static_*.yaml
  !sed -i -r 's|(acoustic_features:)(\s+)(.+)|\1\2static_deltadelta|g' /content/ETK/train/config.yaml
elif vibrato == "diff":
  !sed -i -r 's|(stream_sizes:)(\s+)(.+)|\1\2\[180, 3, 1, 15, 3\]|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(has_dynamic_features:)(\s+)(.+)|\1\2\[true, true, false, true, true\]|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(out_dim:)(\s+)(.+)|\1\2202|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(vibrato_mode:)(\s+)(.+)|\1\2diff|g' {nnsvs_path}/bin/conf/prepare_features/acoustic/static_*.yaml
  !sed -i -r 's|(acoustic_features:)(\s+)(.+)|\1\2static_deltadelta_diffvib|g' /content/ETK/train/config.yaml
elif vibrato == "sine":
  !sed -i -r 's|(stream_sizes:)(\s+)(.+)|\1\2\[180, 3, 1, 15, 6, 1\]|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(has_dynamic_features:)(\s+)(.+)|\1\2\[true, true, false, true, true, false\]|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(out_dim:)(\s+)(.+)|\1\2206|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
  !sed -i -r 's|(vibrato_mode:)(\s+)(.+)|\1\2sine|g' {nnsvs_path}/bin/conf/prepare_features/acoustic/static_*.yaml
  !sed -i -r 's|(acoustic_features:)(\s+)(.+)|\1\2static_deltadelta_sinevib|g' /content/ETK/train/config.yaml

print("MDN: "+ str(use_mdn))
if use_mdn:
  !sed -i -r 's|(use_mdn:)(\s+)(.+)|\1\2true|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml
else:
  !sed -i -r 's|(use_mdn:)(\s+)(.+)|\1\2false|g' /content/ETK/train/conf/train/acoustic/model/acoustic_current.yaml

#set samplerate
print("Sample Rate: "+ str(sample_rate))
!sed -i -r 's|(sample_rate:)(\s+)(.+)|\1\2{sample_rate}|g' /content/ETK/train/config.yaml
!sed -i -r 's|(sample_rate:)(\s+)(.+)|\1\2{sample_rate}|g' /content/ETK/train/enuconfig.yaml
!sed -i -r 's|(sample_rate:)(\s+)(.+)|\1\2{sample_rate}|g' /content/ETK/train/conf/train/acoustic/data/myconfig.yaml
!sed -i -r 's|(sample_rate:)(\s+)(.+)|\1\2{sample_rate}|g' /content/ETK/train/conf/train_postfilter/data/myconfig.yaml
!sed -i -r 's|(sample_rate:)(\s+)(.+)|\1\2{sample_rate}|g' /content/ETK/train/enuconfig.yaml
!sed -i -r 's|(sample_rate:)(\s+)(.+)|\1\2{sample_rate}|g' {nnsvs_path}/bin/conf/prepare_features/acoustic/static_*.yaml

#set d4c
# Write the D4C threshold into the nnsvs static_*.yaml feature-extraction configs.
print("d4c threshold: "+ str(d4c_threshold))
!sed -i -r 's|(d4c_threshold:)(\s+)(.+)|\1\2{d4c_threshold}|g' {nnsvs_path}/bin/conf/prepare_features/acoustic/static_*.yaml


# Register the selected question file (hed/) and phoneme table (dic/) in both
# config.yaml and enuconfig.yaml.
!sed -i -r 's|(question_path:)(\s+)(.+)|\1\2hed/{hed_file}|g' /content/ETK/train/enuconfig.yaml
!sed -i -r 's|(table_path:)(\s+)(.+)|\1\2dic/{dic_file}|g' /content/ETK/train/config.yaml
!sed -i -r 's|(question_path:)(\s+)(.+)|\1\2hed/{hed_file}|g' /content/ETK/train/config.yaml
!sed -i -r 's|(table_path:)(\s+)(.+)|\1\2dic/{dic_file}|g' /content/ETK/train/enuconfig.yaml


In [None]:
# Set parameters
# Commiting to kaggles doesn't allow you to edit settings. Set in this cell.
# These Settings here can be changed without re-running feature generation.

pretrained_expdir = ""
#Ex: `exp/singer_name/`.

disable_checkpoints = True

acoustic_epochs = 80
acoustic_loss = "mae" # "mse" or "mae"

# The notebook uses MDNv2 by default with optimal settings. Choose desired number of epochs.
duration_epochs = 200
timelag_epochs = 200 

pitch_reg_weight = 0.1

!sed -i -r '/#/!s|(pretrained_expdir:)(.*)|\1 "{pretrained_expdir}"|g' /content/ETK/train/config.yaml

#set the epoch count
import math
!sed -i -r 's|(nepochs:)(\s+)(.+)|\1\2{acoustic_epochs}|g' /content/ETK/train/conf/train/acoustic/train/myconfig.yaml
acoustic_checkpoint_interval = acoustic_epochs * 0.15
acoustic_checkpoint_interval_trunc = math.trunc(acoustic_checkpoint_interval)
acoustic_checkpoint_interval_trunc_min = min(acoustic_checkpoint_interval_trunc, 80)
if disable_checkpoints:
  acoustic_checkpoint_interval = 999999
!sed -i -r 's|(checkpoint_epoch_interval:)(\s+)(.+)|\1\2{acoustic_checkpoint_interval_trunc_min}|g' /content/ETK/train/conf/train/acoustic/train/myconfig.yaml
#
!sed -i -r 's|(nepochs:)(\s+)(.+)|\1\2{duration_epochs}|g' /content/ETK/train/conf/train/duration/train/myconfig.yaml
duration_checkpoint_interval = duration_epochs * 0.15
duration_checkpoint_interval_trunc = math.trunc(duration_checkpoint_interval)
duration_checkpoint_interval_trunc_min = min(duration_checkpoint_interval_trunc, 80)
!sed -i -r 's|(checkpoint_epoch_interval:)(\s+)(.+)|\1\2{duration_checkpoint_interval_trunc_min}|g' /content/ETK/train/conf/train/duration/train/myconfig.yaml
#
!sed -i -r 's|(nepochs:)(\s+)(.+)|\1\2{timelag_epochs}|g' /content/ETK/train/conf/train/timelag/train/myconfig.yaml
timelag_checkpoint_interval = timelag_epochs * 0.15
timelag_checkpoint_interval_trunc = math.trunc(timelag_checkpoint_interval)
timelag_checkpoint_interval_trunc_min = min(timelag_checkpoint_interval_trunc, 80)
!sed -i -r 's|(checkpoint_epoch_interval:)(\s+)(.+)|\1\2{timelag_checkpoint_interval_trunc_min}|g' /content/ETK/train/conf/train/timelag/train/myconfig.yaml

!sed -i -r 's|(feats_criterion:)(\s+)(.+)|\1\2{acoustic_loss}|g' /content/ETK/train/conf/train/acoustic/train/myconfig.yaml
!sed -i -r 's|(pitch_reg_weight:)(\s+)(.+)|\1\2{pitch_reg_weight}|g' /content/ETK/train/conf/train/acoustic/train/myconfig.yaml

print("Values set")

# Training Steps
---

In [None]:
#@title Main Training

starting_stage = 0
stopping_stage = 5

!sed -i -r 's|(spk:)(\s+)(.+)|\1\2{singer_name}|g' /content/ETK/train/config.yaml
!sed -i -r 's|(model_dir:)(\s+)(.+)|\1\2exp/{singer_name}\_intunist_prototyping_notebook|g' /content/ETK/train/enuconfig.yaml
!sed -i -r 's|(stats_dir:)(\s+)(.+)|\1\2dump/{singer_name}/norm|g' /content/ETK/train/enuconfig.yaml

%cd "/content/ETK/train"
if model.startswith('Res'):
    !bash run_resf0.sh --stage $starting_stage --stop_stage $stopping_stage
else:
    !bash run.sh --stage $starting_stage --stop_stage $stopping_stage

# Advanced Training
---

In [None]:
# Train mgc and bap postfilters
# The postfilters need to be trained separately. Stage 6 only needs to be run once.
starting_stage = 0 # set to 6
stopping_stage = 0 # set to 8

%cd -q "/content/ETK/train"
if starting_stage <= 6 and stopping_stage >= 6:
  !bash run.sh --stage 6 --stop_stage 6 --acoustic-model acoustic
if starting_stage <= 7 and stopping_stage >= 7:
  print("training MGC postfilter for " + singer_name + "\n")
  !bash run.sh --stage 7 --stop_stage 7 --acoustic-model acoustic --postfilter-model postfilter_mgc --postfilter-train mgc
if starting_stage <= 8 and stopping_stage >= 8:
  print("training BAP postfilter for " + singer_name + "\n")
  !bash run.sh --stage 7 --stop_stage 7 --acoustic-model acoustic --postfilter-model postfilter_bap --postfilter-train bap

# LF0 post filter is not in NNSVS at the time of writing.

#if starting_stage <= 9 and stopping_stage >= 9:
#  print("training lF0 postfilter for " + singer_name + "\n")
#  !bash run.sh --stage 7 --stop_stage 7 --acoustic-model acoustic --postfilter-model postfilter_lf0 --postfilter-train lf0

In [None]:
# Merge the postfilters
# Postfilter training may not work for non-Japanese models.
# Arguments of merge_postfilters.py: the best MGC checkpoint, the best BAP checkpoint,
# and the output location `postfilter_merged` inside the singer's experiment directory.

%cd -q "/content/ETK/train"
!python scripts/merge_postfilters.py exp/{singer_name}_intunist_prototyping_notebook/postfilter_mgc/best_loss.pth \
    exp/{singer_name}_intunist_prototyping_notebook/postfilter_bap/best_loss.pth \
    exp/{singer_name}_intunist_prototyping_notebook/postfilter_merged

In [None]:
#@title Train vocoder (not set up yet!)
# Training NSF may take several days.
# Placeholder cell: it only prints a notice. The intended commands are kept below, disabled.
print("this doesn't do anything, sorry!")

#print("training vocoder for " + singer_name)
#starting_stage = 8 
#stopping_stage = 10 

#%cd "/content/ETK/train"
#!bash run.sh --stage $starting_stage --stop_stage $stopping_stage

# Release
---

In [None]:
# Run release step and package voice

delete_checkpoint = True 
#`delete_checkpoint` deletes extra checkpoints to reduce singer size. Does not affect quality.

print("Packaging " + singer_name)
%cd "/content/ETK/train"
!bash run.sh --stage 12 --stop_stage 12

import subprocess
time = subprocess.getoutput('date +%y%m%d-%H%M-UTC')

if delete_checkpoint:
    %rm -f /content/ETK/train/release/{singer_name}_---/exp/{singer_name}_intunist_prototyping_notebook/*/checkpoint*
    %rm -f /content/ETK/train/release/{singer_name}_---/exp/{singer_name}_intunist_prototyping_notebook/*/epoch*

%cd -q "/content/ETK/train/release/"
print('\033[97;100m' + 'Compressing Model' + '\033[0m')
!7za -bso0 a "/kaggle/working/{singer_name}_NNSVS_model_{time}.7z" "{singer_name}_---"
    print('Done!')