# _**[DiffSinger](https://github.com/openvpi/DiffSinger)**_
_Singing Voice Synthesis via Shallow Diffusion Mechanism (SVS & TTS)_

\
____

Note:
- This notebook is updated semi-frequently based on feedback and responses from users
- Make sure to compare your file structure to the [data example](https://github.com/usamireko/DiffSinger_colab_notebook_MLo7/blob/main/data_example.md)
- 這是繁體中文翻譯版本，但無法保證所有內容已經中文化，加上原作者隨時會對程式更新，可能無法提供最新版本程式
- 請在你只要輸出 ONNX 檔案的時候使用 export_mode
- 模型會分成兩個部份，分別為 acoustic（基本音色）和 variance（變化音色）模型

```We refer to "variance" as "parameters" to avoid confusion```


\
____
\
#### **This notebook is an edited copy of Kei's Diffsinger [colab notebook](https://colab.research.google.com/drive/1kUg9dz8PPH92NfnLZwgq0_9B9an39t1J?usp=sharing)**
#### **This notebook is maintained by MLo7**

___

# **前置程式安裝**

In [None]:
from IPython.display import clear_output, Audio, display, HTML
import os
from google.colab import drive

def setup_onnx_export():
    """Install only what is needed to export trained models to ONNX.

    Clones DiffSinger into /content/DiffSinger, installs the Python 3.10
    Miniconda build into /usr/local, registers a "py310" Jupyter kernel and
    installs the packages listed in DiffSinger's requirements-onnx.txt.
    """
    print("ONNX 輸出模式（ONNX Export Mode）已啟動，正在安裝相關套件")
    !git clone https://github.com/openvpi/DiffSinger.git /content/DiffSinger
    # Download the Miniconda installer and run it against /usr/local.
    !wget -O /content/mini.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_25.1.1-2-Linux-x86_64.sh
    !chmod +x /content/mini.sh
    !bash /content/mini.sh -b -f -p /usr/local
    !conda install -q -y jupyter
    !conda install -q -y google-colab -c conda-forge
    !python -m ipykernel install --name "py310" --user
    print("正在安裝 ONNX 轉換依賴套件 ...")
    # NOTE: "-q -q -q" plus "2>/dev/null" hides all pip output, including any
    # errors, so the success message below is printed even if this step fails.
    !pip install -r /content/DiffSinger/requirements-onnx.txt -q -q -q 2>/dev/null
    print("安裝完成，可以輸出 ONNX 檔案了！")

def setup_standard():
    if not os.path.exists("/content/pretrain_models"):
        os.makedirs("/content/pretrain_models")

    !wget https://github.com/MLo7Ghinsan/DiffSinger_colab_notebook_MLo7/releases/download/OU_files/jpn_dict.txt -O /content/jpn_dict.txt
    !rm -rf /content/sample_data
    !apt-get install aria2
    clear_output()
    !git clone https://github.com/UtaUtaUtau/nnsvs-db-converter /content/nnsvs-db-converter
    !git clone https://github.com/openvpi/DiffSinger.git /content/DiffSinger
    !git clone https://github.com/openvpi/MakeDiffSinger /content/MakeDiffSinger
    !git clone https://github.com/MLo7Ghinsan/ghin_shenanigans /content/ghin_shenanigans
    !git clone https://github.com/openvpi/SOME /content/SOME
    clear_output()
    !pip install torch torchvision torchaudio
    clear_output()
    !pip install -r /content/DiffSinger/requirements.txt
    !pip install -r /content/SOME/requirements.txt
    !pip install mido einops
    clear_output()
    !wget https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-44.1k-hop512-128bin-2024.02/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -O /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip
    !wget https://github.com/openvpi/vocoders/releases/download/pc-nsf-hifigan-44.1k-hop512-128bin-2025.02/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip -O /content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip
    !wget https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip -O /content/rmvpe.zip
    !wget https://github.com/openvpi/SOME/releases/download/v1.0.0-baseline/0119_continuous128_5spk.zip -O /content/0119_continuous128_5spk.zip
    !wget https://github.com/yxlllc/vocal-remover/releases/download/hnsep_240512/hnsep_240512.zip -O /content/DiffSinger/checkpoints/hnsep_240512.zip
    !unzip -q /content/DiffSinger/checkpoints/hnsep_240512.zip -d /content/DiffSinger/checkpoints
    !unzip -q /content/0119_continuous128_5spk.zip -d /content/DiffSinger/checkpoints/SOME
    !unzip -q /content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip -d /content/DiffSinger/checkpoints
    !unzip -q /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -d /content/DiffSinger/checkpoints
    !unzip -q /content/rmvpe.zip -d /content/DiffSinger/checkpoints
    !unzip -q /content/rmvpe.zip -d /content/MakeDiffSinger/variance-temp-solution/assets
    !rm /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip
    !rm /content/rmvpe.zip
    !rm /content/0119_continuous128_5spk.zip
    !aria2c -d /content/pretrain_models -o acoustic_pretrain.ckpt https://github.com/haru0l/diffsinger_models/releases/download/acoustic/model_ckpt_steps_49000.ckpt
    !aria2c -d /content/pretrain_models -o variance_pretrain.ckpt https://github.com/haru0l/diffsinger_models/releases/download/variance/model_ckpt_steps_51000.ckpt
    clear_output()
    !pip install --upgrade tensorboard
    clear_output()
    !pip install protobuf
    clear_output()
    !pip install onnxruntime
    clear_output()
    !pip install pydub
    clear_output()

#@title # 掛載 Google 雲端硬碟 & 安裝開始！
export_mode = False # @param {"type":"boolean"}
drive.mount("/content/drive")

if export_mode:
    setup_onnx_export()
else:
    setup_standard()

clear_output()
print("安裝完成啦！")
print("|")
print("|")
print("|")
!git clone https://github.com/MLo7Ghinsan/ghin_shenanigans /content/ghin_shenanigans 2>/dev/null
chika_dance = '<img src="https://raw.githack.com/MLo7Ghinsan/ghin_shenanigans/main/image_and_gif/chika_dance.gif"/>'
display(HTML(chika_dance))
with open("/content/ghin_shenanigans/audio/setup_complete.wav", "rb") as f:
    setup_complete_sound = f.read()
Audio(data=setup_complete_sound, autoplay=True)

# **預處理訓練資料時間！**

In [None]:
#@title #解壓縮資料
#@markdown ___
# Work from the Colab root for the rest of this cell.
%cd /content
#@markdown 這裡會讓你在 Colab 的根目錄（/content）建一個叫做 raw_data 的資料夾 ＆解壓縮你的訓練資料到 raw_data 裡

# Layout of the uploaded dataset; later steps branch on this exact label.
data_type = "lab + wav (NNSVS format)" # @param ["lab + wav (NNSVS format)", "csv + wav (DiffSinger format)", "ds (DiffSinger format)"]

#@markdown <font size="-1.5"> 你的壓縮檔儲存位置：

# Path of the archive that gets extracted with 7z further down.
data_zip_path = "" #@param {type:"string"}

#@markdown ___

#@markdown nnsvs-db-converter 設定（注意啊！只能有 lab 和 wav 檔案！）

#@markdown <font size="-1.5"> _你可以把數值調到超過訓練資料的大小，用來把段落長度最大化或保留訓練資料原樣_<br>

#@markdown <font size="-1.5"> 這個選項需要在 variance 音調訓練的時候使用
estimate_midi_option = "False" # @param ["False", "True | parselmouth", "True | harvest", "True | SOME"]
# Only these two labels turn on the converter's own MIDI estimation; every
# other label (including "True | SOME") leaves it off with no extractor set.
_midi_extractor_by_label = {
    "True | parselmouth": "parselmouth",
    "True | harvest": "harvest",
}
midi_pitch_ext = _midi_extractor_by_label.get(estimate_midi_option)
estimate_midi = midi_pitch_ext is not None
#@markdown <font size="-1.5"> 根據靜音音素位置確定分割數據所需的時間（秒）
segment_length = 15 #@param {type:"slider", min:5, max:35, step:1}

#@markdown <font size="-1.5"> 決定每段聲音能出現多少靜音部份（解釋：有多少段沒有聲音）
max_silence_phoneme_amount = 2 #@param {type:"slider", min:0, max:50, step:1}

# Original author's note: the converter's -S option was left at 60 so the
# maximum silence can reach 60 seconds, which exceeds the segment length cap
# (reason not recorded).
# The segment length slider is capped at 35 seconds because longer segments
# make training very slow.

# Two path constants are kept for historical reasons: the raw_data root and
# the diffsinger_db folder inside it.
all_shits = "/content/raw_data"
all_shits_not_wav_n_lab = "/content/raw_data/diffsinger_db"

import os
import csv
import json
import shutil
from pydub import AudioSegment
import yaml

if os.path.exists("/content/raw_data"):
    shutil.rmtree("/content/raw_data")

if not os.path.exists(all_shits_not_wav_n_lab):
  os.makedirs(all_shits_not_wav_n_lab)

# using 'if not' bc i edited the wrong section which im also too lazy to fix it <3
if not data_type == "lab + wav (NNSVS format)":
    #changed to 7zip to support more compression types
    !7z x "$data_zip_path" -o{all_shits_not_wav_n_lab}
    for root, dirs, files in os.walk(all_shits):
        for filename in files:
            if filename.endswith(".lab"):
                file_path = os.path.join(root, filename)
                with open(file_path, "r") as file:
                    file_data = file.read()
                file_data = file_data.replace("SP", "pau")
                file_data = file_data.replace("br", "AP")
                with open(file_path, "w") as file:
                    file.write(file_data)

else:
    !7z x "$data_zip_path" -o{all_shits_not_wav_n_lab}


# Output folder and bookkeeping lists for the auto-generated dictionaries.
out = "/content/DiffSinger/dictionaries"
dictionary_files = []
dictionary_conf_lines = []

# Silence / breath markers that never belong in a phoneme dictionary.
_NON_PHONEME_MARKS = ("pau", "AP", "SP", "sil")

def is_excluded(phoneme):
    """Return True when `phoneme` is one of the silence/breath markers."""
    return phoneme in _NON_PHONEME_MARKS

lang_config_path = all_shits_not_wav_n_lab +"/lang_config.yaml"


def _iter_phoneme_tokens(fpath):
    """Yield the raw phoneme tokens stored in one label file.

    ``.lab`` files contribute the third whitespace-separated column of every
    line that has at least three columns; ``.csv`` and ``.ds`` files
    contribute the whitespace-separated items of each ``ph_seq`` field.
    Any other file yields nothing. Tokens are returned untouched, so a
    ``lang/phoneme`` prefix is still attached.
    """
    if fpath.endswith(".lab"):
        with open(fpath, "r") as lab_file:
            for line in lab_file:
                parts = line.strip().split()
                if len(parts) >= 3:
                    yield parts[2]
    elif fpath.endswith(".csv"):
        with open(fpath, "r", newline="") as csv_file:
            for row in csv.DictReader(csv_file):
                # Rows shorter than the header carry None for missing columns.
                yield from (row.get("ph_seq") or "").split()
    elif fpath.endswith(".ds"):
        with open(fpath, "r") as json_file:
            entries = json.load(json_file)
        # Tolerate a file holding a single segment object instead of a list.
        if isinstance(entries, dict):
            entries = [entries]
        for entry in entries:
            yield from (entry.get("ph_seq") or "").split()


def _write_dictionary(key, phonemes):
    """Write ``dictionary-<key>.txt`` and register it in the bookkeeping lists.

    Each phoneme becomes one ``phoneme<TAB>phoneme`` line, sorted. Returns the
    path of the written file.
    """
    os.makedirs(out, exist_ok=True)
    path = os.path.join(out, f"dictionary-{key}.txt")
    dictionary_files.append(path)
    dictionary_conf_lines.append(f"{key}: '{path}'")
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{ph}\t{ph}\n" for ph in sorted(phonemes))
    return path


if not os.path.exists(lang_config_path):
    # Single-dictionary mode: every phoneme found anywhere goes into "custom".
    extra_phonemes = []
    merged_phoneme_groups = []
    all_phonemes = set()

    for root, dirs, files in os.walk(all_shits_not_wav_n_lab):
        for file in files:
            for phoneme in _iter_phoneme_tokens(os.path.join(root, file)):
                # A "lang/phoneme" token only contributes its phoneme part here.
                phoneme = phoneme.split("/", 1)[-1]
                # Empty names (e.g. from "ja/") would produce an unparsable dictionary line.
                if phoneme and not is_excluded(phoneme):
                    all_phonemes.add(phoneme)

    custom_dict_path = _write_dictionary("custom", all_phonemes)
    lang_dict = None

else:
    # Multi-dictionary mode: one dictionary per language listed in lang_config.yaml.
    with open(lang_config_path, "r") as yaml_file:
        # An empty YAML file loads as None; treat it as an empty config.
        lang_config = yaml.safe_load(yaml_file) or {}

    languages = lang_config.get("languages") or []
    extra_phonemes = lang_config.get("extra_phonemes") or []
    merged_phoneme_groups = lang_config.get("merged_phoneme_groups") or []

    lang_dict = {lang: set() for lang in languages}

    # Only the label files matching the selected data type are scanned here.
    if data_type == "lab + wav (NNSVS format)":
        label_ext = ".lab"
    elif data_type == "csv + wav (DiffSinger format)":
        label_ext = ".csv"
    else:
        label_ext = ".ds"

    for folder in os.listdir(all_shits_not_wav_n_lab):
        # Speaker folders are named "<speaker>.<language>".
        if "." not in folder:
            continue
        _, lang_code = folder.rsplit(".", 1)
        if lang_code not in languages:
            continue

        phoneme_folder_path = os.path.join(all_shits_not_wav_n_lab, folder)
        for root, dirs, files in os.walk(phoneme_folder_path):
            for file in files:
                if not file.endswith(label_ext):
                    continue
                for phoneme in _iter_phoneme_tokens(os.path.join(root, file)):
                    if "/" in phoneme:
                        # An explicit "lang/phoneme" prefix overrides the folder's
                        # language; unknown prefixes are dropped.
                        lang_hint, phoneme = phoneme.split("/", 1)
                        if lang_hint not in languages:
                            continue
                    else:
                        lang_hint = lang_code
                    if phoneme and not is_excluded(phoneme):
                        lang_dict[lang_hint].add(phoneme)

    for lang, ph_set in lang_dict.items():
        _write_dictionary(lang, ph_set)

# Classify every dictionary phoneme as vowel / liquid / consonant and write the
# results as vowels.txt, liquids.txt and consonants.txt plus the language JSON
# passed to nnsvs-db-converter via -L.
dict_path = out
# A phoneme is classified by its FIRST character only, so the two-character
# "NG" entry can never match; it is kept as it was.
vowel_types = {"a", "i", "u", "e", "o", "N", "M", "NG"}
liquid_types = {"y", "w", "l", "r"} # r for english labels, it should be fine with jp too
vowel_data = []
consonant_data = []
liquid_data = []

for dict_path in dictionary_files:
    # The dictionaries are written as UTF-8, so read them back the same way.
    with open(dict_path, "r", encoding="utf-8") as f:
        for line in f:
            phoneme = line.strip().split("\t")[0]
            if not phoneme:
                continue
            if phoneme[0] in vowel_types:
                vowel_data.append(phoneme)
            elif phoneme[0] in liquid_types:
                liquid_data.append(phoneme)
            else:
                consonant_data.append(phoneme)

vowel_data.sort()
liquid_data.sort()
consonant_data.sort()

# Always write next to the dictionaries. Deriving this from the loop variable
# pointed one level too high whenever no dictionary had been generated.
directory = out
os.makedirs(directory, exist_ok=True)

# make txt for language json file
vowel_txt_path = os.path.join(directory, "vowels.txt")
liquid_txt_path = os.path.join(directory, "liquids.txt")
consonant_txt_path = os.path.join(directory, "consonants.txt")
for txt_path, phonemes in (
    (vowel_txt_path, vowel_data),
    (liquid_txt_path, liquid_data),
    (consonant_txt_path, consonant_data),
):
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(" ".join(phonemes))

# The lists are used directly; re-reading the files just written would give
# the same values.
liquid_list = {liquid: True for liquid in liquid_data} # temp fix, might need more research about the push in timing
phones4json = {"vowels": vowel_data, "liquids": liquid_list}
with open("/content/nnsvs-db-converter/lang.sample.json", "w") as rawr:
    json.dump(phones4json, rawr, indent=4)


if data_type == "lab + wav (NNSVS format)":
    db_converter_script = "/content/nnsvs-db-converter/db_converter.py"
    for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):
        raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)
        if os.path.isdir(raw_folder_path):
            if estimate_midi:
                !python {db_converter_script} -s {max_silence_phoneme_amount} -l {segment_length} -m -c -L "/content/nnsvs-db-converter/lang.sample.json" {raw_folder_path}
            else:
                !python {db_converter_script} -s {max_silence_phoneme_amount} -l {segment_length} -L "/content/nnsvs-db-converter/lang.sample.json" {raw_folder_path}
            !rm -rf {raw_folder_path}/*.wav {raw_folder_path}/*.lab
            !mv {raw_folder_path}/diffsinger_db/* {raw_folder_path} 2> /dev/null
            !rm -rf {raw_folder_path}/diffsinger_db
            if estimate_midi_option == "True | SOME":
                !python /content/SOME/batch_infer.py --model "/content/DiffSinger/checkpoints/SOME/0119_continuous256_5spk/model_ckpt_steps_100000_simplified.ckpt" --dataset {raw_folder_path} --overwrite

elif data_type == "ds (DiffSinger format)":
    ds_segment_script = "/content/ghin_shenanigans/scripts/ds_segmentor.py"
    ds2csv_script = "/content/MakeDiffSinger/variance-temp-solution/convert_ds.py"
    for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):
        raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)
        if os.path.isdir(raw_folder_path):
            ds_exp_path = os.path.join(raw_@tag:notebookLayoutfolder_path, "ds")
            csv_exp_path = os.path.join(raw_folder_path, "transcriptions.csv")
            !python {ds_segment_script} {raw_folder_path} --export_path {ds_exp_path}
            !rm -rf {raw_folder_path}/*.ds #clean it cus why not
            !python {ds2csv_script} ds2csv {ds_exp_path} {csv_exp_path}
else:
    pass

# Replace the first "SP" in the first data row's ph_seq with "AP", because
# datasets commonly forget to start with a breath marker.
for root, _, files in os.walk(all_shits_not_wav_n_lab):
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        with open(file_path, "r", newline="") as input_file:
            data = list(csv.reader(input_file))
        # An empty file has no header row; indexing it used to raise IndexError.
        if not data or "ph_seq" not in data[0]:
            continue
        ph_seq_index = data[0].index("ph_seq")
        # Header-only files and short first rows have nothing to patch.
        if len(data) < 2 or len(data[1]) <= ph_seq_index:
            continue
        data[1][ph_seq_index] = data[1][ph_seq_index].replace("SP", "AP", 1)
        # Files skipped above are left exactly as they were on disk.
        with open(file_path, "w", newline="") as output_file:
            csv.writer(output_file).writerows(data)

print("解壓縮完成了!")
print("|")
print("|")
print("|")
print("轉換完了！！ :)")

In [None]:
#@title #修改設定檔
#@markdown ___

import re
import os
import yaml
import random #for the random test files lmaoz

%cd /content
clear_output()
#@markdown 你想訓練的模型類型
model_type = "acoustic（基本音色）" # @param ["acoustic（基本音色）", "variance（變化音色）"]
config_cap = model_type.upper()
diffusion_type = "reflow" # @param ["ddpm", "reflow"]
diff_accelerator = "unipc" # @param ["ddim", "pndm", "dpm-solver", "unipc"]
loss_type = "l2" # @param ["l1", "l2"]

spk_name = [folder_name for folder_name in os.listdir(all_shits_not_wav_n_lab) if os.path.isdir(os.path.join(all_shits_not_wav_n_lab, folder_name))]
# i used spk_name for something else cus i forgor now imma just copy and paste it
spk_names = [folder_name for folder_name in os.listdir(all_shits_not_wav_n_lab) if os.path.isdir(os.path.join(all_shits_not_wav_n_lab, folder_name))]
num_spk = len(spk_name)
num_lang = len(dictionary_files)
raw_dir = []
datasets = []
for folder_name in spk_name:
    folder_path = os.path.join(all_shits_not_wav_n_lab, folder_name)
    raw_dir.append(folder_path)
folder_to_id = {folder_name: i for i, folder_name in enumerate(spk_name)}

# Exactly one speaker folder means a single-speaker model; any other count
# (including zero) uses the multi-speaker settings.
single_speaker = num_spk == 1
singer_type = "SINGLE-SPEAKER" if single_speaker else "MULTI-SPEAKER"
use_spk_id = not single_speaker

for spk_id, (folder_path, speaker_name) in enumerate(zip(raw_dir, spk_name)):
    if data_type == "ds (DiffSinger format)":
        listing_dir, item_ext = folder_path, ".ds"
    else:
        listing_dir, item_ext = folder_path + "/wavs", ".wav"
    # Strip exactly the extension. The old fixed [:-4] slice also removed the
    # last character of every ".ds" item name.
    audio_files = [f[:-len(item_ext)] for f in os.listdir(listing_dir) if f.endswith(item_ext)]

    # Despite the name these are simply the first three items found.
    random_ass_test_files = audio_files[:3]

    # Folder names look like "<speaker>.<language>"; the suffix is the language id.
    speaker_name, lang_id = os.path.splitext(speaker_name)

    # NOTE(review): single-speaker datasets are always tagged "custom", even if
    # the folder has a language suffix — confirm this is intended when a
    # lang_config.yaml is used with one speaker.
    if single_speaker:
        language = "custom"
    else:
        language = lang_id.lstrip(".") or "custom"

    datasets.append({
        "raw_data_dir": folder_path,
        "speaker": speaker_name,
        "spk_id": spk_id,  # always 0 in the single-speaker case
        "language": language,
        "test_prefixes": random_ass_test_files,
    })

# Turn the "<key>: '<path>'" lines collected during dictionary generation into
# a {key: path} mapping.
dictionaries = {}
for line in dictionary_conf_lines:
    key, value = line.split(": ", 1)
    dictionaries[key] = value.strip("'")

#@markdown Shallow Diffusion 訓練
use_shallow_diffusion = "true | gt_val" # @param ["false", "true | aux_val", "true | gt_val"]
# "false" switches shallow diffusion off; "true | aux_val" switches it on
# without the ground-truth flag; any other label switches both on.
shallow = use_shallow_diffusion != "false"
gt_shallow = use_shallow_diffusion not in ("false", "true | aux_val")

#@markdown 半精度（Half precision）或混合精度（mixed precision）可以加速訓練過程（[來源](https://lightning.ai/docs/pytorch/stable/common/trainer.html#precision)）
# 64-bit precision is deliberately not offered: Colab is already very slow at 32-bit.
precision = "16-mixed" # @param ["32-true", "bf16-mixed", "16-mixed", "bf16", "16"]

#@markdown 你的模型儲存位置
save_dir = "" #@param {type:"string"}

# The binary data folder and the config folder both derive from save_dir.
binary_save_dir = f"{save_dir}/binary"
conf_dir = save_dir

#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................

#@markdown 用base model 進行微調

enable_finetuning = False # @param {type:"boolean"}


#@markdown 自訂 base model 的位置（如果你想要用自己的 base model，不需要請留空。預設模型來源按[我](https://github.com/haru0l/diffsinger_models)）

base_model_path = "" # @param {type:"string"}


def default_pretrain_ckpt(model_label):
    """Return the bundled pretrain checkpoint path for a model-type label.

    The model-type dropdown uses labels such as "acoustic（基本音色）", while the
    checkpoints downloaded during setup are named ``acoustic_pretrain.ckpt``
    and ``variance_pretrain.ckpt``. Only the part before the full-width
    parenthesis is therefore used in the file name; a bare "acoustic" or
    "variance" is accepted unchanged.
    """
    model_key = model_label.split("（", 1)[0].strip()
    return f"/content/pretrain_models/{model_key}_pretrain.ckpt"


if enable_finetuning:
    pretrain = True
    # A custom base model wins; otherwise fall back to the bundled checkpoint.
    pretrain_ckpt = base_model_path or default_pretrain_ckpt(model_type)
    finetune_strict_shapes = False
    finetune_ckpt_path = pretrain_ckpt
else:
    pretrain = False
    finetune_strict_shapes = True #default value
    finetune_ckpt_path = None #default value

#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................

#@markdown 模型嵌入檢查：張力、能量、氣息、發聲特性（適用於 <font color = "yellow">acoustic & variance 模型

#@markdown <font size="-1.5"> 我們限制了配對選項，因為要避免出現品質和使用問題（注意：如果你想要使用這些選項以外的設定，除了 energy + breathiness 選項以外，其他可能沒辦法正常運作）

selected_param = "能量＋氣息（energy + breathiness）" # @param ["能量（energy）", "氣息（breathiness）", "能量＋氣息（energy + breathiness）", "張力（tension）", "發聲特性（voicing）", "張力＋發聲特性（tension + voicing）", "無（none）"]
# Which variance parameter embeds each dropdown label switches on.
param_flags = {
    "能量（energy）": {"tension": False, "energy": True, "breathiness": False, "voicing": False},
    "氣息（breathiness）": {"tension": False, "energy": False, "breathiness": True, "voicing": False},
    "能量＋氣息（energy + breathiness）": {"tension": False, "energy": True, "breathiness": True, "voicing": False},
    "張力（tension）": {"tension": True, "energy": False, "breathiness": False, "voicing": False},
    "發聲特性（voicing）": {"tension": False, "energy": False, "breathiness": False, "voicing": True},
    "張力＋發聲特性（tension + voicing）": {"tension": True, "energy": False, "breathiness": False, "voicing": True},
    "無（none）": {"tension": False, "energy": False, "breathiness": False, "voicing": False},
}

# The fallback must use the table's real key. The default argument of .get()
# is evaluated on every call, so the old param_flags["none"] lookup raised
# KeyError even for valid selections.
flags = param_flags.get(selected_param, param_flags["無（none）"])

tension_training = flags["tension"]
energy_training = flags["energy"]
breathiness_training = flags["breathiness"]
voicing_training = flags["voicing"]

parameter_extraction_method = "vr" # @param ["vr", "world"]

# Data augmentation is forced on; it is intentionally not exposed as a form field.
data_aug = True #param {type:"boolean"}

#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................

#@markdown Model 訓練檢查 | <font color = "yellow">variance </font>模型限定



#@markdown <font size="-1.5"> 如果因為技術問題，你想用 glide embed 訓練的話，請手動打開 :)
pitch_training = "False" # @param ["False", "True | Standard", "True | MelodyEncoder"]
if pitch_training == "False":
    pitch_training = False
    use_melody_encoder = False
    use_glide_embed = False
elif pitch_training == "True | Standard":
    pitch_training = True
    use_melody_encoder = False
    use_glide_embed = False
else:
    pitch_training = True
    use_melody_encoder = True
    use_glide_embed = False

duration_training = True #@param {type: "boolean"}

#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................

#@markdown 音高提取演算法

f0_ext = "parselmouth" # @param ["parselmouth", "rmvpe", "harvest"]
if f0_ext == "rmvpe":
    pe_ckpt_pth = "checkpoints/rmvpe/model.pt"
else:
    pe_ckpt_pth = None

#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................

#@markdown 接下來的參數設定會影響到模型的品質和大小，到[這裡](https://github.com/openvpi/DiffSinger/blob/main/docs/ConfigurationSchemas.md)閱讀說明

#@markdown 如果你不知道這些設定的功能，<font color = "red">請你保留預設值</font>，否則會對模型造成不良影響

#@markdown <font size="-2.5">歡迎你來測試～

#@markdown <font size="-1.5">model_hidden_size: FS2 的隱藏 layers＆token 嵌入大小

#@markdown <font size="-1.5">model_residual_layers | model_residual_channels: 模型的主要 layers 跟 channels

#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................
#@markdown Model's network/layer size for <font color = "yellow">acoustic 模型

#@markdown <font size="-1.5"> sampling_algorithm 的品質有一定的順序，從精確度最低（Euler）到最高（RK5）。雖然大多數情況下，Euler 也能正常運作就是了
# Sampler and network sizes for the acoustic model (form fields).
sampling_algorithm = "euler" # @param ["euler", "rk2", "rk4", "rk5"]

acoustic_hidden_size = 256 # @param {type:"slider", min:2, max:1024, step:2}

acoustic_num_layers = 6 # @param {type:"slider", min:2, max:42, step:2}
acoustic_num_channels = 1024 # @param {type:"slider", min:2, max:2048, step:2}

#@markdown <font color = "yellow">variance 模型</font> 的 network / layer 大小
# Hidden sizes for the variance model, its duration predictor and melody encoder.
variance_hidden_size = 256 # @param {type:"slider", min:2, max:1024, step:2}
duration_hidden_size = 512 # @param {type:"slider", min:2, max:1024, step:2}
melody_encoder_hidden_size = 128 # @param {type:"slider", min:2, max:1024, step:2}

# Layer / channel counts for the pitch and variance predictors.
pitch_num_layers = 6 # @param {type:"slider", min:2, max:100, step:2}
pitch_num_channels = 512 # @param {type:"slider", min:2, max:1024, step:2}
variance_num_layers = 6 # @param {type:"slider", min:2, max:100, step:2}
variance_num_channels = 384 # @param {type:"slider", min:2, max:1024, step:2}



# Write the selected trainer precision into DiffSinger's base config in place.
base_config_path = "/content/DiffSinger/configs/base.yaml"
with open(base_config_path, "r") as config:
    mother = yaml.safe_load(config)
mother["pl_trainer_precision"] = precision
with open(base_config_path, "w") as config:
    yaml.dump(mother, config)

# True only when the dataset was supplied in .ds form.
prefer_ds = data_type == "ds (DiffSinger format)"

# Settings that are written identically to the acoustic and the variance config.
# They are collected once so the two branches cannot drift apart.
# yaml.dump sorts keys by default, so the order in which keys are set here does
# not change the written file.
shared_settings = {
    "datasets": datasets,
    "num_spk": num_spk,
    "use_spk_id": use_spk_id,
    "extra_phonemes": extra_phonemes,
    "merged_phoneme_groups": merged_phoneme_groups,
    # language ids are enabled exactly when phoneme groups were merged
    "use_lang_id": bool(merged_phoneme_groups),
    "num_lang": num_lang,
    "diffusion_type": diffusion_type,
    "diff_accelerator": diff_accelerator,
    "main_loss_type": loss_type,
    "binary_data_dir": binary_save_dir,
    "dictionaries": dictionaries,
    # pitch extractor and its checkpoint
    "pe": f0_ext,
    "pe_ckpt": pe_ckpt_pth,
    "tension_smooth_width": 0.06,  # 0.12 was used previously
    # fine-tuning
    "finetune_enabled": enable_finetuning,
    "finetune_ckpt_path": finetune_ckpt_path,
    "finetune_strict_shapes": finetune_strict_shapes,
    # harmonic-noise separation method
    "hnsep": parameter_extraction_method,
}

if model_type == "acoustic":
    model_yaml_path = "/content/DiffSinger/configs/acoustic.yaml"
    with open(model_yaml_path, "r") as stream:
        model_cfg = yaml.safe_load(stream)
    model_cfg.update(shared_settings)
    model_cfg.update({
        "pretrain": pretrain,
        # the data augmentation switch also drives the key-shift and speed embeds
        "use_key_shift_embed": data_aug,
        "use_speed_embed": data_aug,
        "use_energy_embed": energy_training,
        "use_breathiness_embed": breathiness_training,
        "use_tension_embed": tension_training,
        "use_voicing_embed": voicing_training,
        # shallow diffusion
        "use_shallow_diffusion": shallow,
        # network layout
        "sampling_algorithm": sampling_algorithm,
        "hidden_size": acoustic_hidden_size,
        "backbone_type": "lynxnet",
    })
    # Nested sections already exist in the loaded config and are edited in place.
    augmentation = model_cfg["augmentation_args"]
    augmentation["random_pitch_shifting"]["enabled"] = data_aug
    augmentation["random_time_stretching"]["enabled"] = data_aug
    model_cfg["shallow_diffusion_args"]["val_gt_start"] = gt_shallow
    model_cfg["backbone_args"]["num_layers"] = acoustic_num_layers
    model_cfg["backbone_args"]["num_channels"] = acoustic_num_channels
else:
    model_yaml_path = "/content/DiffSinger/configs/variance.yaml"
    with open(model_yaml_path, "r") as stream:
        model_cfg = yaml.safe_load(stream)
    model_cfg.update(shared_settings)
    model_cfg.update({
        "predict_energy": energy_training,
        "predict_breathiness": breathiness_training,
        "predict_tension": tension_training,
        "predict_pitch": pitch_training,
        "predict_voicing": voicing_training,
        "use_melody_encoder": use_melody_encoder,
        "use_glide_embed": use_glide_embed,
        "predict_dur": duration_training,
        "hnsep_ckpt": "checkpoints/vr/model.pt",
        "hidden_size": variance_hidden_size,
    })
    model_cfg["binarization_args"]["prefer_ds"] = prefer_ds
    model_cfg["dur_prediction_args"]["hidden_size"] = duration_hidden_size
    model_cfg["melody_encoder_args"]["hidden_size"] = melody_encoder_hidden_size
    # Both predictors get the same backbone type with their own layer/channel counts.
    for section, num_layers, num_channels in (
        ("variances_prediction_args", variance_num_layers, variance_num_channels),
        ("pitch_prediction_args", pitch_num_layers, pitch_num_channels),
    ):
        predictor = model_cfg[section]
        predictor["backbone_type"] = "lynxnet"
        predictor["backbone_args"]["num_layers"] = num_layers
        predictor["backbone_args"]["num_channels"] = num_channels

with open(model_yaml_path, "w") as stream:
    yaml.dump(model_cfg, stream)

os.makedirs(save_dir, exist_ok=True)


# Hard-code every `args_work_dir = ...` assignment in hparams.py to the save folder.
# The replacement is a callable so that re.sub does not interpret backslashes or
# group references that might appear in the path.
with open("/content/DiffSinger/utils/hparams.py", "r") as f:
    hparams_py_read = f.read()
hparams_py_read = re.sub(
    r"args_work_dir\s*=\s*.*",
    lambda _match: f"args_work_dir = '{save_dir}'",
    hparams_py_read,
)
with open("/content/DiffSinger/utils/hparams.py", "w") as f:
    f.write(hparams_py_read)

# Make training_utils.py compute relative_path against /content.
# The pattern is a raw string: "\s" in a plain string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
with open("/content/DiffSinger/utils/training_utils.py", "r") as f:
    training_utils_stuff = f.read()
training_utils_stuff = re.sub(
    r"relative_path\s*=\s*.*",
    "relative_path = filepath.relative_to(Path('/content').resolve())",
    training_utils_stuff,
)
with open("/content/DiffSinger/utils/training_utils.py", "w") as f:
    f.write(training_utils_stuff)

# Drop the file extension from every speaker entry and locate the dictionary's folder.
spk_names = [stem for stem, _ext in map(os.path.splitext, spk_names)]
dict_dir = os.path.dirname(dict_path)

# Recap of the chosen settings, printed one entry per print() call.
# A "\n" entry is printed as-is and therefore shows up as blank spacing.
summary_lines = [
    "config 更新完啦！以下是你的設定：",
    "|",
    "|",
    "|",
    f"+++---{config_cap} {singer_type} 訓練中---+++",
    "|",
    "|",
    "|",
    "+++---使用者的設定---+++",
    "\n",
    f"speaker name: {spk_names}",
    "\n",
    f"data augmentation: {data_aug}",
    "\n",
    f"pitch extractor: {f0_ext}",
    "\n",
    f"binary data 的儲存位置: {binary_save_dir}",
    "\n",
    f"你的模型被存到了: {save_dir}",
    "\n",
    "==========================================================================================",
    "\n",
    "+++---其他被自動設定的東東---+++",
    "\n",
    f"dictionary (auto generated): {dict_dir} (去看看吧！)",
    "\n",
    "==========================================================================================",
    "\n",
    "如果你對其中的任何設定不滿意，",
    f"可以到 [/content/DiffSinger/configs/{model_type}.yaml] 修改設定喔！",
]
for summary_line in summary_lines:
    print(summary_line)


In [None]:
#@markdown # 預處理資料
import os
#we dont need that old f0 limit change anymore <3
# Config of the selected model type (acoustic.yaml or variance.yaml, written by the config cell).
training_config = f"/content/DiffSinger/configs/{model_type}.yaml"
# Work from the repository root with PYTHONPATH set to it
# (presumably so binarize.py can import the repo's own packages — confirm).
%cd /content/DiffSinger
os.environ['PYTHONPATH']='.'
# Run DiffSinger's binarize.py with that config; only CUDA device 0 is made visible to it.
!CUDA_VISIBLE_DEVICES=0 python /content/DiffSinger/scripts/binarize.py --config {training_config} --reset

# **訓練時間到！**

In [None]:
#@markdown #訓練你的模型吧！
%cd /content/DiffSinger
import re
import os
import yaml
#@markdown ___

#@markdown <font size="-1.5"> 每幾步驗證＆儲存模型一次？
save_interval = 2000 #@param {type:"slider", min:100, max:10000, step:100}

#@markdown <font size="-1.5"> batch 大小設定：太小會導致出現瓶頸，太大可能會讓記憶體爆掉（OOM）
batch_size = 9 # @param {type:"slider", min:1, max:100, step:1}

#@markdown <font size="-1.5">  每幾步檢查一次要不要停止訓練？
max_updates = 160000 # @param {type:"slider", min:100, max:2000000, step:100}

#@markdown ___

#@markdown ###**請只在你要繼續訓練的時候修改這裡！**
resume_training = False #@param {type:"boolean"}

#@markdown <font size="-1.5"> 如果你已經把資料轉成二進位格式，請修改這個選項。這個選項會在 config 裡面只放入二進位資料的格式，而且 binary 資料夾必須跟 config.yaml 放一起
local_data = False #@param {type:"boolean"}

#@markdown <font size="-1.5"> 你想把訓練完，生成的 config 檔案放哪裡？
re_config_path = "" #@param {type:"string"}
model_dir = os.path.dirname(re_config_path)
save_dir = model_dir
if resume_training:
    with open("/content/DiffSinger/utils/hparams.py", "r") as f:
        hparams_py_read = f.read()
    hparams_py_read = re.sub(r"args_work_dir\s*=\s*.*", f"args_work_dir = '{save_dir}'", hparams_py_read)
    with open("/content/DiffSinger/utils/hparams.py", "w") as f:
        f.write(hparams_py_read)
    with open("/content/DiffSinger/utils/training_utils.py", "r") as f:
        training_utils_stuff = f.read()
    training_utils_stuff = re.sub("relative_path\s*=\s*.*", "relative_path = filepath.relative_to(Path('/content').resolve())", training_utils_stuff)
    with open("/content/DiffSinger/utils/training_utils.py", "w") as f:
        f.write(training_utils_stuff)

    config_path = re_config_path
    log_dir = save_dir

    !cp {model_dir}/*.txt /content/DiffSinger/dictionaries

else:
    config_path = training_config
    log_dir = conf_dir

with open(config_path, "r") as config:
    ehe = yaml.safe_load(config)
config_dir = os.path.dirname(config_path)
yuh = os.path.join(config_dir, "binary")

ehe["val_check_interval"] = save_interval
ehe["max_batch_size"] = batch_size
ehe["max_updates"] = max_updates
if local_data:
    ehe["binary_data_dir"] = yuh
with open(config_path, "w") as config:
    yaml.dump(ehe, config)

logs = log_dir
%reload_ext tensorboard
%tensorboard --logdir {logs}/lightning_logs

!python /content/DiffSinger/scripts/train.py --config {config_path} --exp_name ${save_dir} --reset

# **把模型轉成 ONNX 格式吧！**

In [None]:
#@markdown # 把真實歌手的資訊從模型移除（非強制）
#@markdown ___
#@markdown <font size="-1.5"> 這個功能可以幫你把真實歌手的資訊從模型移除再公開到網路上。這需要同時對 acoustic 和 variance 模型進行動作

drop_model_path = '' #@param {type: "string"}
#@markdown <font size="-1.5"> 輸入你想保留的真實歌手 ID，請用英文逗號分開。例如："0,3,4" <br>
#@markdown <font size="-1.5"> Note: 你可以在 model 資料夾中的 ```spk_map.json``` 找到真實歌手在模型中的 ID<br>
#@markdown <font size="-1.5"> 如果你看到 ```{"natural": 0, "power": 1, "silly": 2}``` 但只想保留 "natural" 和 "power" 的話，在底下輸入 ```0,1```
retain_speakers = '' #@param {type: "string"}
#@markdown <font size="-1.5"> 如果你不知道這是什麼，請不要動它
fill_embed = 'zeros' #@param ['zeros', 'random', 'mean', 'cyclic']

drop_out_path = drop_model_path[:-5] + '_spk-dropped.ckpt'

!python /content/DiffSinger/scripts/drop_spk.py {drop_model_path} {drop_out_path} --retain {retain_speakers} --fill {fill_embed}




In [None]:
#@markdown # 輸出 ONNX 資料
#@markdown ___
%cd /content
from IPython.display import clear_output
clear_output()
import os
import zipfile
import shutil

if export_mode:
  pass
else:
    print("正在安裝 ONNX 的相關程式 ...（不然動不了啊啊啊！！！）")
    !wget -O /content/mini.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_25.1.1-2-Linux-x86_64.sh
    !chmod +x /content/mini.sh
    !bash /content/mini.sh -b -f -p /usr/local
    !conda install -q -y jupyter
    !conda install -q -y google-colab -c conda-forge
    !python -m ipykernel install --name "py310" --user
    print("等一下，還有依賴套件要裝 ...")
    !pip install -r /content/DiffSinger/requirements-onnx.txt -q -q -q 2>/dev/null
    print("終於裝好了，是時候把 ONNX 輸出了！")
# to counter IF the user is to re-run this cell <3
if os.path.exists("/content/OU_compatible_files"):
    shutil.rmtree("/content/OU_compatible_files")
    os.remove("/content/jpn_dict.txt")
else:
    pass

#@markdown  如果你需要看 ONNX 的輸出過程，請把 no_output 選項取消（預設為打勾）
no_output = True # @param {type:"boolean"}

# The checkpoint's parent folder is what the exporter works on; the experiment name
# is that folder's name plus a model-type suffix.
#@markdown  在這裡貼上 **ACOUSTIC CHECKPOINT** 的存檔路徑，如果沒有就留空（我會自己在資料夾裡找到最新的 checkpoint）
acoustic_checkpoint_path = "" #@param{type:"string"}
acoustic_folder_path = os.path.dirname(acoustic_checkpoint_path)
acoustic_folder_name = os.path.basename(acoustic_folder_path) + "_acoustic"

#@markdown  在這裡貼上 **VARIANCE CHECKPOINT** 的存檔路徑，如果沒有就留空（我會自己在資料夾裡找到最新的 checkpoint）
variance_checkpoint_path = "" #@param{type:"string"}
variance_folder_path = os.path.dirname(variance_checkpoint_path)
variance_folder_name = os.path.basename(variance_folder_path) + "_variance"

#@markdown  在這裡輸入你想要存 ONNX 的位置（接下來會在這裡建一個叫 "onnx" 的資料夾）
exp_folder = "" #@param{type:"string"}

# Output folders of the two exports.
acoustic_onnx_exp = exp_folder + "/onnx/acoustic"
variance_onnx_exp = exp_folder + "/onnx/variance"

if not acoustic_checkpoint_path:
    print("\n")
    print("你沒有輸入 acoustic ckeckpoint 的存檔位置，這樣不會輸出 acoustic ONNX 喔！")
else:
    print("\n")
    print("開始把 acoustic 轉換成 ONNX 中...")
    #cp stuff cus apparently exporter doesnt work without it
    !cp {acoustic_folder_path}/config.yaml -r /content/DiffSinger/checkpoints/{acoustic_folder_name}
    search_text = "        args_work_dir = os.path.join("
    replacement = f"        args_work_dir = '{acoustic_folder_path}'"
    with open("/content/DiffSinger/utils/hparams.py", "r") as file:
        lines = file.readlines()
    for i, line in enumerate(lines):
        if search_text in line:
            lines[i] = replacement + "\n"
            break
    with open("/content/DiffSinger/utils/hparams.py", "w") as file:
            file.writelines(lines)
    #incase if anyone wanna change it lmao
    search_text_alt = "        args_work_dir = '"
    replacement_alt = f"        args_work_dir = '{acoustic_folder_path}'"
    with open("/content/DiffSinger/utils/hparams.py", "r") as file:
        lines = file.readlines()
    for i, line in enumerate(lines):
        if search_text_alt in line:
            lines[i] = replacement_alt + "\n"
            break
    with open("/content/DiffSinger/utils/hparams.py", "w") as file:
            file.writelines(lines)

    if no_output:
        !python /content/DiffSinger/scripts/export.py acoustic --exp {acoustic_folder_name} --out {exp_folder}/onnx/acoustic >/dev/null 2>&1
    else:
        !python /content/DiffSinger/scripts/export.py acoustic --exp {acoustic_folder_name} --out {exp_folder}/onnx/acoustic


if not variance_checkpoint_path:
    print("\n")
    print("你沒有輸入 variance ckeckpoint 的存檔位置，這樣不會把 variance ONNX 輸出喔！")
else:
    print("\n")
    print("開始把 variance 轉換成 ONNX 了！")
    #cp stuff cus apparently exporter doesnt work without it
    !cp {variance_folder_path}/config.yaml -r /content/DiffSinger/checkpoints/{variance_folder_name}
    search_text = "        args_work_dir = os.path.join("
    replacement = f"        args_work_dir = '{variance_folder_path}'"
    with open("/content/DiffSinger/utils/hparams.py", "r") as file:
        lines = file.readlines()
    for i, line in enumerate(lines):
        if search_text in line:
            lines[i] = replacement + "\n"
            break
    with open("/content/DiffSinger/utils/hparams.py", "w") as file:
            file.writelines(lines)
    #incase if anyone wanna change it lmao
    search_text_alt = "        args_work_dir = '"
    replacement_alt = f"        args_work_dir = '{variance_folder_path}'"
    with open("/content/DiffSinger/utils/hparams.py", "r") as file:
        lines = file.readlines()
    for i, line in enumerate(lines):
        if search_text_alt in line:
            lines[i] = replacement_alt + "\n"
            break
    with open("/content/DiffSinger/utils/hparams.py", "w") as file:
            file.writelines(lines)
    if no_output:
        !python /content/DiffSinger/scripts/export.py variance --exp {variance_folder_name} --out {exp_folder}/onnx/variance >/dev/null 2>&1
    else:
        !python /content/DiffSinger/scripts/export.py variance --exp {variance_folder_name} --out {exp_folder}/onnx/variance


# Post-process only the folders of models that were actually requested above.
# With neither checkpoint given the list stays empty instead of pointing at a
# folder that was never created.
folder_paths = []
if acoustic_checkpoint_path:
    folder_paths.append(acoustic_onnx_exp)
if variance_checkpoint_path:
    folder_paths.append(variance_onnx_exp)

# Exported files whose name merely contains one of these are renamed to exactly that name.
canonical_names = ("acoustic.onnx", "dur.onnx", "linguistic.onnx", "pitch.onnx", "variance.onnx", "phonemes.txt")
# Doubled model-type markers in the remaining file names are collapsed.
doubled_markers = (("acoustic_acoustic.", "acoustic_"), ("variance_variance.", "variance_"))

for folder_path in folder_paths:
    if not os.path.isdir(folder_path):
        # The export can fail without any output while no_output is ticked.
        print(f"找不到輸出資料夾 {folder_path}，ONNX 轉換可能失敗了（取消 no_output 可以看到詳細訊息）")
        continue

    for filename in os.listdir(folder_path):
        for canonical in canonical_names:
            if canonical in filename and filename != canonical:
                old_path = os.path.join(folder_path, filename)
                # The file may already have been renamed by an earlier pattern.
                if os.path.exists(old_path):
                    os.rename(old_path, os.path.join(folder_path, canonical))

    for filename in os.listdir(folder_path):
        for doubled, single in doubled_markers:
            if doubled in filename:
                os.rename(
                    os.path.join(folder_path, filename),
                    os.path.join(folder_path, filename.replace(doubled, single)),
                )
                break
print("\n")
print("ONNX 轉換完了!到 https://github.com/xunmengshe/OpenUtau/wiki/Voicebank-Development 看看怎麼讓你的模型跟 OpenUTAU 能相容！")
print("\n")
print("或用建置 OpenUTAU 聲音庫（Build OpenUtau VB）部份做設定")

# **其他雜七雜八的東東**

In [None]:
#@title #轉換 raw data
#@markdown ___
%cd /content
#@markdown 這個部份會把 .lab 和 .ds 檔案連同訓練完的資料一起匯出

data_type = "lab + wav (NNSVS 格式)" # @param ["lab + wav (NNSVS 格式)"]
# The dropdown label is localized, but every check further down in this cell compares
# data_type against the English token. Map the label back, otherwise none of those
# checks match and the conversion steps are silently skipped.
if data_type.startswith("lab + wav (NNSVS"):
    data_type = "lab + wav (NNSVS format)"

#@markdown <font size="-1.5"> 你的壓縮檔資料儲存位置

data_zip_path = "" #@param {type:"string"}

#@markdown <font size="-1.5"> 你的資料儲存路徑

data_save_path = "" #@param {type:"string"}

#@markdown ___

export_ds = True

#@markdown <font size="-1.5"> _你可以把數值調到超過訓練資料的大小，用來把段落長度最大化或保留訓練資料原樣_

#@markdown <font size="-1.5"> 靜音部份位置調整分割訓練資料需要的時間（秒）
segment_length = 15 #@param {type:"slider", min:5, max:35, step:1}

#@markdown <font size="-1.5"> 每個切割片段允許的靜音部份數量上限
max_silence_phoneme_amount = 2 #@param {type:"slider", min:0, max:50, step:1}

# -S stays at 60 so that a silence may last up to 60 seconds, beyond the segment length cap.
# The segment length slider is capped at 35 seconds because longer segments slow training down.

# Root of the extracted data, and the sub-folder the archive is unpacked into.
all_shits = "/content/raw_data"
all_shits_not_wav_n_lab = "/content/raw_data/diffsinger_db"

import os
import csv
import json
import shutil
from pydub import AudioSegment

# Start from a clean /content/raw_data on every run.
if os.path.exists("/content/raw_data"):
    shutil.rmtree("/content/raw_data")

# Recreate the folder the archive is unpacked into.
if not os.path.exists(all_shits_not_wav_n_lab):
  os.makedirs(all_shits_not_wav_n_lab)

# This branch runs whenever data_type is NOT the NNSVS token; the original author
# notes the inverted test is a leftover from editing the wrong section.
if not data_type == "lab + wav (NNSVS format)":
    # 7zip is used so that more archive formats are supported.
    !7z x "$data_zip_path" -o{all_shits_not_wav_n_lab}
    # Rewrite every .lab file in place: "SP" becomes "pau" and "br" becomes "AP".
    # These are plain substring replacements, so they also hit longer tokens that
    # contain "SP" or "br".
    for root, dirs, files in os.walk(all_shits):
        for filename in files:
            if filename.endswith(".lab"):
                file_path = os.path.join(root, filename)
                with open(file_path, "r") as file:
                    file_data = file.read()
                file_data = file_data.replace("SP", "pau")
                file_data = file_data.replace("br", "AP")
                with open(file_path, "w") as file:
                    file.write(file_data)

else:
    # NNSVS data: only unpack, the labels are left untouched.
    !7z x "$data_zip_path" -o{all_shits_not_wav_n_lab}


# Destination of the automatically generated phoneme dictionary.
out = "/content/raw_data/custom_dict.txt"

# Unique phonemes collected from the .lab files.
phonemes = set()

def is_excluded(phoneme):
    """Return True for the pau / AP / SP markers, which are kept out of the dictionary."""
    return phoneme == "pau" or phoneme == "AP" or phoneme == "SP"

# Collect every phoneme (third whitespace-separated field) from the .lab files.
if data_type == "lab + wav (NNSVS format)":
    phoneme_folder_path = all_shits
    for root, dirs, files in os.walk(phoneme_folder_path):
        for file in files:
            if not file.endswith(".lab"):
                continue
            fpath = os.path.join(root, file)
            with open(fpath, "r") as lab_file:
                for line_no, line in enumerate(lab_file, start=1):
                    fields = line.split()
                    if not fields:
                        continue  # blank line
                    if len(fields) < 3:
                        # A malformed label line must not abort the whole cell.
                        print(f"[warning] {fpath}:{line_no}: expected at least 3 fields, got {line.strip()!r}; line skipped")
                        continue
                    phoneme = fields[2]
                    if not is_excluded(phoneme):
                        phonemes.add(phoneme)

# One "<phoneme><TAB><phoneme>" entry per line, in sorted order.
with open(out, "w") as f:
    f.writelines(f"{phoneme}\t{phoneme}\n" for phoneme in sorted(phonemes))

# Sort the dictionary's phonemes into vowels, liquids and consonants. The liquid
# group exists for the nnsvs-db-converter language file written below.
dict_path = out
# NOTE(review): only the FIRST character of a phoneme is looked up in these sets,
# so the multi-character entry "NG" can never match — confirm the intent.
vowel_types = {"a", "i", "u", "e", "o", "N", "M", "NG"}
liquid_types = {"y", "w", "l", "r"} # "r" is for English labels; should be fine for Japanese too
vowel_data = []
consonant_data = []
liquid_data = []

with open(dict_path, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # a blank line would otherwise break the unpacking below
        phoneme, _ = line.split("\t")
        if phoneme[0] in vowel_types:
            vowel_data.append(phoneme)
        elif phoneme[0] in liquid_types:
            liquid_data.append(phoneme)
        else:
            consonant_data.append(phoneme)

vowel_data.sort()
liquid_data.sort()
consonant_data.sort()
directory = os.path.dirname(dict_path)

# One space-separated text file per group, next to the dictionary.
vowel_txt_path = os.path.join(directory, "vowels.txt")
liquid_txt_path = os.path.join(directory, "liquids.txt")
consonant_txt_path = os.path.join(directory, "consonants.txt")
for txt_path, group in (
    (vowel_txt_path, vowel_data),
    (liquid_txt_path, liquid_data),
    (consonant_txt_path, consonant_data),
):
    with open(txt_path, "w") as f:
        f.write(" ".join(group))

# Language definition for the converter. The lists are used directly; reading the
# text files back would only reproduce the same values.
phones4json = {"vowels": vowel_data, "liquids": liquid_data}
with open("/content/nnsvs-db-converter/lang.sample.json", "w") as rawr:
    json.dump(phones4json, rawr, indent=4)


# Run nnsvs-db-converter once for every folder that was unpacked.
if data_type == "lab + wav (NNSVS format)":
    db_converter_script = "/content/nnsvs-db-converter/db_converter.py"
    for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):
        raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)
        if os.path.isdir(raw_folder_path):
            # NOTE(review): export_lab is not assigned anywhere in this cell (only export_ds is).
            # Confirm it is defined by an earlier cell and that the `$` before `{export_lab}` is intended.
            !python {db_converter_script} -s {max_silence_phoneme_amount} -S 60 -l {segment_length} ${export_lab} -mD -c -L "/content/nnsvs-db-converter/lang.sample.json" -w htk --folder {raw_folder_path}

# Replace the original wav/lab files in each folder with the converter's output:
# delete them, move the contents of the nested diffsinger_db folder up one level,
# then remove that folder.
if data_type == "lab + wav (NNSVS format)":
    for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):
        raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)
        !rm -rf {raw_folder_path}/*.wav {raw_folder_path}/*.lab
        !mv {raw_folder_path}/diffsinger_db/* {raw_folder_path} 2> /dev/null
        !rm -rf {raw_folder_path}/diffsinger_db
        #!cp {raw_folder_path}/wavs/*.wav {raw_folder_path}

# In every transcription CSV, turn the first "SP" in the ph_seq cell of the first
# data row into "AP" (people tend to forget it).
for root, _, files in os.walk(all_shits_not_wav_n_lab):
    for file in files:
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(root, file)
        with open(file_path, "r", newline="") as input_file:
            data = list(csv.reader(input_file))
        # Empty file or no ph_seq column: nothing to patch, leave the file untouched.
        if not data or "ph_seq" not in data[0]:
            continue
        ph_seq_index = data[0].index("ph_seq")
        # No data row, or the first data row is too short to hold a ph_seq cell.
        if len(data) < 2 or len(data[1]) <= ph_seq_index:
            continue
        data[1][ph_seq_index] = data[1][ph_seq_index].replace("SP", "AP", 1)
        with open(file_path, "w", newline="") as output_file:
            csv.writer(output_file).writerows(data)

print("解壓縮完成！")
print("\n")
print("資料打包中 ...")
# Pack everything under /content/raw_data into data.zip in the chosen save folder
# (quiet, maximum compression, recursive).
!zip -q -9 -r {data_save_path}/data.zip /content/raw_data/*

In [None]:
#@markdown # 建置 OpenUTAU 聲音庫
#@markdown ___
#i need to clean this up it seems
#plan: add a build ou section here by inserting onnx paths (or just the folder containing the folders to the onnx files) to build ou
# ill have a config read function too so i dont have to add checkmark of if people train with embeds or shallow diff or not <3
# yes im lazy rawr x3
%cd /content
import os
import shutil
import yaml
from IPython.display import clear_output

# Always start from an empty build folder.
constr_folder = "/content/OU_voicebank"
if os.path.exists(constr_folder):
    shutil.rmtree(constr_folder)
os.makedirs(constr_folder)

clear_output()

#@markdown <font size="-1.5">  **ACOUSTIC ONNX 資料夾**的儲存位置
acoustic_onnx_folder = "" #@param{type:"string"}
#@markdown <font size="-1.5"> Acoustic 模型的 config.yaml 儲存位置
acoustic_config = "" #@param{type:"string"}

#@markdown <font size="-1.5"> **VARIANCE ONNX 資料夾**的儲存位置
variance_onnx_folder = "" #@param{type:"string"}
#@markdown <font size="-1.5"> Variance 模型的 config.yaml 儲存位置
variance_config = "" #@param{type:"string"}

#@markdown <font size="-1.5"> 輸入你的「字詞→聲音」字典位置（留空的話會用預設的日文字典）
dictionary_path = "" #@param{type:"string"}

#@markdown <font size="-1.5"> 你想把 zip 檔案儲存到哪裡？
save_path = "" #@param{type:"string"}

#@markdown ___

#@markdown ## 角色設定（character.txt & character.yaml）

#@markdown <font size="-1.5"> **需要**顯示角色名稱
name = "" #@param{type:"string"}

print("正在複製資料 ...")
# The voicebank is assembled in a folder named after the character.
main_stuff = f"{constr_folder}/{name}"
if not os.path.exists(main_stuff):
    os.makedirs(main_stuff)
# dsmain holds the models; the speaker embeddings go into embeds/acoustic and embeds/variance.
if not os.path.exists(f"{main_stuff}/dsmain"):
    os.makedirs(f"{main_stuff}/dsmain/embeds/acoustic")
    os.makedirs(f"{main_stuff}/dsmain/embeds/variance")
!cp {acoustic_onnx_folder}/acoustic.onnx {main_stuff}/dsmain
!cp {acoustic_onnx_folder}/phonemes.txt {main_stuff}/dsmain
# Output of the .emb copies is discarded, so a folder without .emb files does not print an error.
!cp {acoustic_onnx_folder}/*.emb {main_stuff}/dsmain/embeds/acoustic >/dev/null 2>&1
!cp {variance_onnx_folder}/*.emb {main_stuff}/dsmain/embeds/variance >/dev/null 2>&1

# linguistic.onnx is copied only when a variance folder was given.
if variance_onnx_folder:
    !cp {variance_onnx_folder}/linguistic.onnx {main_stuff}/dsmain
else:
    pass

print("\n")
print("正在寫入 character.txt ...")
# Only the name is filled in; the other fields are left empty.
with open(f"{main_stuff}/character.txt", "w") as file:
    file.writelines([
        f"name={name}\n",
        "image=\n",
        "author=\n",
        "voice=\n",
        "web=\n",
    ])

print("\n")
print("正在寫入 character.yaml ...")
with open(f"{main_stuff}/character.yaml", "w") as file:
    file.writelines([
        "text_file_encoding: utf-8\n",
        "portrait:\n",
        "portrait_opacity: 0.45\n",
        "default_phonemizer: OpenUtau.Core.DiffSinger.DiffSingerPhonemizer\n",
        "singer_type: diffsinger\n",
    ])

# One subbank per acoustic speaker embedding. os.listdir returns entries in
# arbitrary order, so the listing is sorted to keep the numbering reproducible.
acoustic_emb_files = sorted(os.listdir(acoustic_onnx_folder))
acoustic_color_suffix = [
    os.path.splitext(emb_file)[0]
    for emb_file in acoustic_emb_files
    if emb_file.endswith(".emb")
]
acoustic_embeds = ["dsmain/embeds/acoustic/" + emb_name for emb_name in acoustic_color_suffix]
# color is "NN: <embedding name>", suffix is the embedding path without extension.
subbanks = [
    {"color": f"{i:02}: {emb_name}", "suffix": emb_path}
    for i, (emb_name, emb_path) in enumerate(zip(acoustic_color_suffix, acoustic_embeds), start=1)
]
if subbanks:
    # Re-read the file just written and add the subbank list to it.
    with open(f"{main_stuff}/character.yaml", "r") as config:
        character_cfg = yaml.safe_load(config)
    character_cfg["subbanks"] = subbanks
    with open(f"{main_stuff}/character.yaml", "w") as config:
        yaml.dump(character_cfg, config)

print("\n")
print("正在寫入 acoustic 的 dsconfig.yaml ...")
# Seed dsconfig.yaml with the fixed entries; it is re-read and extended below.
with open(f"{main_stuff}/dsconfig.yaml", "w") as file:
    file.write(
        "phonemes: dsmain/phonemes.txt\n"
        "acoustic: dsmain/acoustic.onnx\n"
        "vocoder: nsf_hifigan\n"
        "singer_type: diffsinger\n"
    )

# Values taken from the acoustic training config; a missing key yields None.
with open(acoustic_config, "r") as config:
    mfking_config = yaml.safe_load(config)
# embeds and diffusion depth
use_energy_embed = mfking_config.get("use_energy_embed")
use_breathiness_embed = mfking_config.get("use_breathiness_embed")
voicing = mfking_config.get("use_voicing_embed")
tension = mfking_config.get("use_tension_embed")
use_shallow_diffusion = mfking_config.get("use_shallow_diffusion")
max_depth = mfking_config.get("T_start")
# augmentation
augmentation_arg = mfking_config.get("augmentation_args")
pitch_aug = mfking_config.get("use_key_shift_embed")
time_aug = mfking_config.get("use_speed_embed")
# audio / mel parameters
sample_rate = mfking_config.get("audio_sample_rate")
hop_size = mfking_config.get("hop_size")
win_size = mfking_config.get("win_size")
fft_size = mfking_config.get("fft_size")
num_mel_bins = mfking_config.get("audio_num_mel_bins")
mel_fmin = mfking_config.get("fmin")
mel_fmax = mfking_config.get("fmax")
mel_base = mfking_config.get("mel_base")
# read but not written to dsconfig.yaml; the embedding paths are used for "speakers" instead
speakers = mfking_config.get("speakers")

with open(f"{main_stuff}/dsconfig.yaml", "r") as config:
    why_are_there_so_many_i_could_prob_make_it_one = yaml.safe_load(config)
why_are_there_so_many_i_could_prob_make_it_one.update({
    "use_energy_embed": use_energy_embed,
    "use_breathiness_embed": use_breathiness_embed,
    "use_variable_depth": use_shallow_diffusion,
    "max_depth": max_depth,
    "augmentation_args": augmentation_arg,
    "use_key_shift_embed": pitch_aug,
    "use_speed_embed": time_aug,
    "use_voicing_embed": voicing,
    "use_tension_embed": tension,
    "use_continuous_acceleration": True,
    "sample_rate": sample_rate,
    "hop_size": hop_size,
    "win_size": win_size,
    "fft_size": fft_size,
    "num_mel_bins": num_mel_bins,
    "fmin": mel_fmin,
    "fmax": mel_fmax,
    "mel_base": mel_base,
    "mel_scale": "slaney",
})


# The speaker list is only written when embeddings were found.
if subbanks:
    why_are_there_so_many_i_could_prob_make_it_one["speakers"] = acoustic_embeds
with open(f"{main_stuff}/dsconfig.yaml", "w") as config:
    yaml.dump(why_are_there_so_many_i_could_prob_make_it_one, config)


# Paths of the variance speaker embeddings (without extension), written relative
# to a sibling folder of dsmain.
variance_emb_files = os.listdir(variance_onnx_folder)
variance_embeds = [
    "../dsmain/embeds/variance/" + os.path.splitext(emb_file)[0]
    for emb_file in variance_emb_files
    if emb_file.endswith(".emb")
]

print("\n")
print("正在寫入 dsdict.yaml...")
# Fall back to the bundled Japanese dictionary when no dictionary was given.
dict_path = dictionary_path if dictionary_path else "/content/jpn_dict.txt"

# Source of the symbols list.
phoneme_dict_path = f"{acoustic_onnx_folder}/dictionary.txt"

dsdict = "dsdict.yaml"

def parse_phonemes(phonemes_str):
    """Split a whitespace-separated phoneme string into a list of phonemes."""
    phoneme_list = phonemes_str.split()
    return phoneme_list

# Accumulators for the generated dsdict.yaml:
#   entries     - grapheme -> phoneme-list mappings read from the dictionary
#   vowel_data  - symbol records classified as "vowel"
#   stop_data   - symbol records classified as "stop"
entries = []
vowel_types = {"a", "i", "u", "e", "o", "N", "M", "NG", "cl", "vf"}
vowel_data = []
stop_data = []

# Process the specified dictionary: each non-empty line is
# "<grapheme><TAB><space-separated phonemes>".
# The file is read as UTF-8 explicitly because graphemes may be non-ASCII.
with open(dict_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Blank lines (e.g. trailing newlines at the end of the file) carry no
        # entry; skip them instead of failing on the tuple unpack below.
        if not line:
            continue
        # A line without exactly one tab still raises ValueError, as before.
        word, phonemes_str = line.split("\t")
        entries.append({"grapheme": word, "phonemes": parse_phonemes(phonemes_str)})

# Build the symbols list from the acoustic model's dictionary.txt. Only the
# first tab-separated column (the phoneme symbol) is used.
with open(phoneme_dict_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines instead of failing on the tuple unpack below.
        if not line:
            continue
        phoneme, _ = line.split("\t")
        # A symbol is a vowel when it is listed in vowel_types as a whole
        # (this is what makes the multi-character entries such as "cl" and
        # "vf" effective) or when its first character is listed there.
        is_vowel = phoneme in vowel_types or phoneme[0] in vowel_types
        phoneme_type = "vowel" if is_vowel else "stop"
        entry = {"symbol": phoneme, "type": phoneme_type}
        if is_vowel:
            vowel_data.append(entry)
        else:
            stop_data.append(entry)

# Keep each group alphabetically ordered by symbol.
vowel_data.sort(key=lambda x: x["symbol"])
stop_data.sort(key=lambda x: x["symbol"])

dsdict_path = os.path.join(constr_folder, dsdict)

# Assemble the whole file as a list of lines first, then write it in one call.
dsdict_lines = ["entries:\n"]
for entry in entries:
    dsdict_lines.append(f"- grapheme: {entry['grapheme']}\n")
    dsdict_lines.append("  phonemes:\n")
    dsdict_lines.extend(f"  - {phoneme}\n" for phoneme in entry["phonemes"])

# Symbols section: vowels first, then stops.
dsdict_lines.append("\nsymbols:\n")
for entry in vowel_data + stop_data:
    dsdict_lines.append(f"- symbol: {entry['symbol']}\n")
    dsdict_lines.append(f"  type: {entry['type']}\n")

with open(dsdict_path, "w") as f:
    f.writelines(dsdict_lines)

# Load the variance model's config. The values pulled out here are written
# into the dsdur / dspitch / dsvariance configs later in this cell.
with open(variance_config, "r") as config:
    mfking_config = yaml.safe_load(config)

# Missing keys yield None because .get() is called without a default.
# sample_rate and hop_size are rebound here from the variance config.
sample_rate, hop_size = (
    mfking_config.get(key) for key in ("audio_sample_rate", "hop_size")
)
use_melody_encoder = mfking_config.get("use_melody_encoder")
(
    predict_dur,
    predict_pitch,
    predict_voicing,
    predict_tension,
    predict_energy,
    predict_breathiness,
) = (
    mfking_config.get(key)
    for key in (
        "predict_dur",
        "predict_pitch",
        "predict_voicing",
        "predict_tension",
        "predict_energy",
        "predict_breathiness",
    )
)

dur_onnx_path = variance_onnx_folder + "/dur.onnx"
if os.path.exists(dur_onnx_path):
    print("\n")
    print("making dsdur directory and necessary files...")
    os.makedirs(f"{main_stuff}/dsdur")
    !cp {dur_onnx_path} {main_stuff}/dsdur
    !cp {dsdict_path} {main_stuff}/dsdur
    with open(f"{main_stuff}/dsdur/dsconfig.yaml", "w") as file:
        file.write("phonemes: ../dsmain/phonemes.txt\n")
        file.write("linguistic: ../dsmain/linguistic.onnx\n")
        file.write("dur: dur.onnx\n")
    with open(f"{main_stuff}/dsdur/dsconfig.yaml", "r") as config:
        dsdur_config = yaml.safe_load(config)
    dsdur_config["use_continuous_acceleration"] = True
    dsdur_config["sample_rate"] = sample_rate
    dsdur_config["hop_size"] = hop_size
    dsdur_config["predict_dur"] = predict_dur
    if subbanks:
        dsdur_config["speakers"] = variance_embeds
    with open(f"{main_stuff}/dsdur/dsconfig.yaml", "w") as config:
        yaml.dump(dsdur_config, config)
else:
    print("\n")
    print("沒有找到 dur.onnx , 自動跳過建立 dsdur 資料夾")

pitch_onnx_path = variance_onnx_folder + "/pitch.onnx"
if os.path.exists(pitch_onnx_path):
    print("\n")
    print("正在建置 dspitch directory 還有其他的必要資料 ...")
    os.makedirs(f"{main_stuff}/dspitch")
    !cp {pitch_onnx_path} {main_stuff}/dspitch
    !cp {dsdict_path} {main_stuff}/dspitch
    with open(f"{main_stuff}/dspitch/dsconfig.yaml", "w") as file:
        file.write("phonemes: ../dsmain/phonemes.txt\n")
        file.write("linguistic: ../dsmain/linguistic.onnx\n")
        file.write("pitch: pitch.onnx\n")
        file.write("use_expr: true\n")
    with open(f"{main_stuff}/dspitch/dsconfig.yaml", "r") as config:
        dspitch_config = yaml.safe_load(config)
    dspitch_config["use_continuous_acceleration"] = True
    dspitch_config["sample_rate"] = sample_rate
    dspitch_config["hop_size"] = hop_size
    dspitch_config["predict_dur"] = predict_pitch
    if subbanks:
        dspitch_config["speakers"] = variance_embeds
    dspitch_config["use_note_rest"] = use_melody_encoder
    with open(f"{main_stuff}/dspitch/dsconfig.yaml", "w") as config:
        yaml.dump(dspitch_config, config)
else:
    print("\n")
    print("沒有找到 pitchonnx , 自動跳過建立 dspitch 資料夾")

variance_onnx_path = variance_onnx_folder + "/variance.onnx"
if os.path.exists(variance_onnx_path):
    print("\n")
    print("正在建置 dsvariance directory 還有其他的必要資料 ...")
    os.makedirs(f"{main_stuff}/dsvariance")
    !cp {variance_onnx_path} {main_stuff}/dsvariance
    !cp {dsdict_path} {main_stuff}/dsvariance
    with open(f"{main_stuff}/dsvariance/dsconfig.yaml", "w") as file:
        file.write("phonemes: ../dsmain/phonemes.txt\n")
        file.write("linguistic: ../dsmain/linguistic.onnx\n")
        file.write("variance: variance.onnx\n")
    with open(f"{main_stuff}/dsvariance/dsconfig.yaml", "r") as config:
        dsvariance_config = yaml.safe_load(config)
    dsvariance_config["use_continuous_acceleration"] = True
    dsvariance_config["sample_rate"] = sample_rate
    dsvariance_config["hop_size"] = hop_size
    dsvariance_config["predict_dur"] = True #this one will always be true cus if there's no variance model, it shouldnt make this folder in the first place
    dsvariance_config["predict_voicing"] = predict_voicing
    dsvariance_config["predict_tension"] = predict_tension
    dsvariance_config["predict_energy"] = predict_energy
    dsvariance_config["predict_breathiness"] = predict_breathiness
    if subbanks:
        dsvariance_config["speakers"] = variance_embeds
    with open(f"{main_stuff}/dsvariance/dsconfig.yaml", "w") as config:
        yaml.dump(dsvariance_config, config)
else:
    print("\n")
    print("沒有找到 variance.onnx , 自動跳過建立 dsvariance 資料夾")

# The generated dsdict.yaml was copied into each model folder built above, so
# the working copy is deleted.
!rm -rf {dsdict_path}
# Shortcut instead of explicit checks: `rm -d` removes a directory only when
# it is empty, and any error output is discarded. Entries under embeds are
# tried first, then the embeds folder itself.
!rm -d {main_stuff}/dsmain/embeds/* >/dev/null 2>&1
!rm -d {main_stuff}/dsmain/embeds >/dev/null 2>&1

print("\n")
print("正在打包你的資料中 ...")
# Recursively zip everything under main_stuff into <save_path>/<name>.zip
# (-q quiet, -9 maximum compression, -r recurse).
!zip -q -9 -r {save_path}/{name}.zip {main_stuff}/*

print("\n")
print("好欸！完成了！")

print("\n")
print("現在可以下載和安裝到 OpenUTAU 上了！ 如果還有什麼調整的 config  就去改吧！")