## Preprocess Config

In [None]:
import json
import os
from random import shuffle

from loguru import logger
from tqdm import tqdm
from pathlib import Path

# Accepted clip-length window, written as 22050 * seconds (2 s to 10 s).
# The filtering cell compares these against `file size in bytes // 2`, so they
# are presumably sample counts at 22050 Hz -- TODO confirm.
min_duration = 22050 * 2.0
max_duration = 22050 * 10.0

# Load the base config through a context manager so the file handle is closed
# as soon as parsing finishes instead of lingering until garbage collection.
with open("configs_template/config_template.json") as template_file:
    config_template = json.load(template_file)

# Filelists that feed the voice-conversion training set. Despite the .csv
# extension, the rows are consumed as pipe-separated text further down:
# column 0 is the wav path and column 1 is the speaker name.
# NOTE(review): two different en_fallout4 filelists are listed (one under
# metadata/filelists/xphoneBERT, one under en/Fallout4) -- confirm they do not
# describe the same recordings.
training_files = [
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_borderlands2_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_baldursgate3_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_worldofwarcraft_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_mario_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/de_gametts_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/pl_archolos_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/de_borderlands2_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_warcraft_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_sqnarrator_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_emotional_train_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/de_emotional_train_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/ru_witcher3_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_witcher3_skyrim_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_fallout4_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_naruto_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/de_kcd_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/pl_witcher3_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/de_diablo4_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/en_diablo4_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/fr_diablo4_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/pl_diablo4_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/ru_diablo4_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/ru_skyrim_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/jp_one_piece_xphone.csv",
      "/mnt/datasets/TTS_Data/metadata/filelists/xphoneBERT/jp_skyrim_xphone.csv",
      "/mnt/datasets/TTS_Data/fr/Fallout4/fr_fallout4_xphone.csv",
      "/mnt/datasets/TTS_Data/de/Fallout4/de_fallout4_xphone.csv",
      "/mnt/datasets/TTS_Data/en/Fallout4/en_fallout4_xphone.csv",
]

# Concatenate the raw rows of every filelist, in listing order, into one list.
all_lines = []

for filelist_path in training_files:
    with open(filelist_path) as filelist:
        all_lines += filelist.readlines()

# Notebook display: total number of rows collected.
len(all_lines)

In [None]:
# Selection state for the main voice-conversion training list.
wavs = []  # accepted (wav_path, speaker_name) pairs
train = []
val = []
spk_dict = {}  # speaker name -> numeric id, assigned in order of first appearance
spk_id = 0  # next free speaker id; equals the number of speakers after the loop
speaker_items_count = {}  # speaker name -> number of earlier counted rows
duplicate_wavs = set()  # (wav_path, speaker_name) pairs already accepted

# Shuffle the rows in place before the per-speaker cap is applied.
shuffle(all_lines)

for line in tqdm(all_lines):
    cols = line.strip().split("|")
    if len(cols) < 2:
        # Blank or malformed row: there is no speaker column to read.
        continue
    wav_path = cols[0]
    speaker_name = cols[1]

    if not os.path.exists(wav_path):
        continue

    # Length filter based on file size: bytes // 2 is compared against the
    # min/max thresholds. This presumably treats the file as 16-bit mono PCM
    # (header bytes included in the count) -- TODO confirm.
    if not (max_duration >= (Path(wav_path).stat().st_size // 2) > min_duration):
        continue

    # Skip rows that were already accepted. The set is keyed by the same
    # (path, speaker) tuple that is stored below; previously only the path was
    # stored, so this lookup could never succeed. The check also runs before
    # the counter update so repeated rows do not use up a speaker's quota.
    if (wav_path, speaker_name) in duplicate_wavs:
        continue

    if speaker_name not in spk_dict:
        speaker_items_count[speaker_name] = 0
        spk_dict[speaker_name] = spk_id
        spk_id += 1
    else:
        speaker_items_count[speaker_name] += 1

    # The counter starts at 0 for a speaker's first row, so at most 150 clips
    # are kept per speaker.
    if speaker_items_count[speaker_name] < 150:
        duplicate_wavs.add((wav_path, speaker_name))
        wavs.append((wav_path, speaker_name))

shuffle(wavs)

# Persist the selection as "<wav path>|<numeric speaker id>" rows, one per line.
train_filelist = "/home/alexander/Projekte/so-vits-svc/filelists/voice_conversion_train.txt"
with open(train_filelist, "w") as out_file:
    out_file.writelines(f"{path}|{spk_dict[name]}\n" for path, name in wavs)


# Record the speaker table, the speaker count and the speech encoder name in
# the config loaded from the template.
config_template["spk"] = spk_dict
model_section = config_template["model"]
model_section["n_speakers"] = spk_id
model_section["speech_encoder"] = "vec768l12"


logger.info("Writing to configs/config_vc.json")
with open("configs/config_vc.json", "w") as config_file:
    json.dump(config_template, config_file, indent=2)

In [None]:
import json
import os
from random import shuffle

from loguru import logger
from tqdm import tqdm
from pathlib import Path
from glob import glob
import wave 

# Accepted clip-length window, written as 22050 * seconds (1 s to 8 s).
min_duration = 22050 * 1.0
max_duration = 22050 * 8.0
# The same window in seconds, for comparison with the duration read from the
# WAV header below (evaluates to exactly 1.0 and 8.0).
min_seconds = min_duration / 22050
max_seconds = max_duration / 22050

wavs = []  # accepted (wav_path, speaker_name) pairs
train = []
val = []
spk_dict = {}  # speaker name -> numeric id
# NOTE(review): hard-coded first speaker id, presumably chosen so the FF7
# speakers follow the ids already used in voice_conversion_train.txt -- confirm
# against the maximum speaker id found in that file.
spk_id = 6860
speaker_items_count = {}  # speaker name -> number of earlier counted clips
duplicate_wavs = set()  # (wav_path, speaker_name) pairs already accepted

all_wavs = glob("/mnt/datasets/TTS_Data/en/FF7/**/*.wav", recursive=True)

shuffle(all_wavs)

for file_path in tqdm(all_wavs):

    wav_path = file_path
    # The speaker name is the name of the directory that contains the clip.
    speaker_name = file_path.split("/")[-2]

    if "announcer" in speaker_name:
        continue

    if not os.path.exists(wav_path):
        continue

    # Read the clip length from the WAV header: frames / frame rate = seconds.
    with wave.open(wav_path, 'r') as wav_file:
        frames = wav_file.getnframes()
        frame_rate = wav_file.getframerate()

    duration_seconds = frames / float(frame_rate)

    if not (max_seconds >= duration_seconds > min_seconds):
        continue

    # Skip clips that were already accepted. The set is keyed by the same
    # (path, speaker) tuple that is stored below; previously only the path was
    # stored, so this lookup could never succeed. The check runs before the
    # counter update so a repeated clip does not use up a speaker's quota.
    if (wav_path, speaker_name) in duplicate_wavs:
        continue

    if speaker_name not in spk_dict:
        speaker_items_count[speaker_name] = 0
        spk_dict[speaker_name] = spk_id
        spk_id += 1
    else:
        speaker_items_count[speaker_name] += 1

    # The counter starts at 0 for a speaker's first clip, so at most 200 clips
    # are kept per speaker.
    if speaker_items_count[speaker_name] < 200:
        duplicate_wavs.add((wav_path, speaker_name))
        wavs.append((wav_path, speaker_name))

shuffle(wavs)

# Persist the FF7 selection as "<wav path>|<numeric speaker id>" rows.
ff7_filelist = "/home/alexander/Projekte/so-vits-svc/filelists/voice_conversion_train_ff7.txt"
with open(ff7_filelist, "w") as out_file:
    out_file.writelines(f"{path}|{spk_dict[name]}\n" for path, name in wavs)

In [None]:
with open("/home/alexander/Projekte/so-vits-svc/filelists/voice_conversion_train.txt", "r") as filelist:
    lines = filelist.readlines()

# Largest speaker id (second "|"-separated column) in the training filelist;
# the running maximum starts at 0, so 0 is reported for an empty file.
max_speaker_id = 0
for row in lines:
    row_speaker_id = int(row.strip().split("|")[1])
    max_speaker_id = max(max_speaker_id, row_speaker_id)

# Notebook display of the result.
max_speaker_id

## Preprocess F0 and Hubert

In [None]:
# Keep only the path component of every (path, speaker) pair in `wavs`.
file_paths = [path for path, _speaker in wavs]

print(len(file_paths))

In [None]:
def find_duplicates(lst):
    """Return the entries of *lst* that occur more than once.

    Every entry is converted to a tuple first so that list entries become
    hashable. Each repeated entry is reported once, as a tuple, and the
    order of the returned list is unspecified.
    """
    seen_once = set()
    repeated = set()

    for entry in map(tuple, lst):
        # First sighting goes to `seen_once`; any later sighting marks the
        # entry as repeated.
        bucket = repeated if entry in seen_once else seen_once
        bucket.add(entry)

    return list(repeated)


# Every (wav_path, speaker_name) pair that occurs more than once in `wavs`;
# the bare name on the last line displays the result in the notebook.
dups = find_duplicates(wavs)
dups

In [None]:
# Paths reported as duplicated. A set gives exact-match, constant-time lookups;
# the previous `fil in dup` test compared two strings and therefore matched
# substrings (any path contained in a duplicated path was dropped as well).
duplicated_paths = {dup for dup, speak in dups}

# Keep only the rows whose wav path (first "|"-separated column) is not among
# the duplicated paths.
# NOTE(review): every row for a duplicated path is removed, including its
# first occurrence -- confirm that dropping the clip entirely is intended.
de_duped = []
with open(
    "/home/alexander/Projekte/so-vits-svc/filelists/voice_conversion_train.txt", "r"
) as rf:
    for line in rf:
        fil = line.strip().split("|")[0]
        if fil not in duplicated_paths:
            de_duped.append(line)

# Rewrite the same filelist in place with the surviving rows.
with open(
    "/home/alexander/Projekte/so-vits-svc/filelists/voice_conversion_train.txt", "w"
) as wf:
    wf.writelines(de_duped)

In [None]:
import utils
import librosa
import torch
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import torch.multiprocessing as mp
from tqdm import tqdm

# Settings shared by the feature-extraction functions below.
sampling_rate = 22050  # rate passed to librosa.load and to the f0 predictor
hop_length = 256  # hop length passed to the f0 predictor
speech_encoder = "vec768l12"  # encoder name passed to utils.get_speech_encoder
device = "cuda:0"  # requested device; process_batch replaces it when CUDA is available
f0p = "crepe"  # predictor name passed to utils.get_f0_predictor

# Directory that receives the "*.soft.pt" content-feature files.
save_path = "/mnt/datasets/VC_Data"


def process_one(filename, hmodel, f0p, device, diff=False, mel_extractor=None):
    """Extract and cache content features and pitch for one clip.

    Two files are produced, each only if it does not exist yet:

    * ``<save_path>/<clip stem>_<speaker>_.soft.pt`` -- the output of
      ``hmodel.encoder`` on the clip resampled to 16 kHz.
    * ``<wav path with ".wav" replaced by ".pitch.pt">`` -- a dict with the
      keys ``"f0"`` and ``"uv"`` holding the predictor output as tensors.

    Args:
        filename: ``(wav_path, speaker)`` pair, despite the singular name.
        hmodel: speech encoder object exposing ``encoder(tensor)``.
        f0p: name of the f0 predictor handed to ``utils.get_f0_predictor``.
        device: device used for the encoder input and the f0 predictor.
        diff: unused; kept for call compatibility.
        mel_extractor: unused; kept for call compatibility.

    Returns:
        None. The results are written to disk.
    """
    wav_file, speaker = filename

    # Stem of the clip: file name without directory and without extension.
    only_filename = os.path.splitext(os.path.basename(wav_file))[0]
    soft_path = os.path.join(save_path, only_filename + f"_{speaker}_.soft.pt")
    # NOTE(review): str.replace substitutes every ".wav" occurrence in the path
    # (directories included) and leaves the path unchanged when the extension
    # differs in case; kept as is because code that reads the pitch files may
    # derive the name the same way -- confirm before changing.
    f0_path = wav_file.replace(".wav", ".pitch.pt")

    need_soft = not os.path.exists(soft_path)
    need_f0 = not os.path.exists(f0_path)
    if not (need_soft or need_f0):
        # Both outputs are already cached: skip decoding the audio entirely so
        # that re-running over a processed dataset is cheap.
        return

    wav, _ = librosa.load(wav_file, sr=sampling_rate)

    if need_soft:
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(device)
        c = hmodel.encoder(wav16k)
        torch.save(c.cpu(), soft_path)

    if need_f0:
        # NOTE(review): a new predictor is built for every clip; if constructing
        # it is expensive, consider creating it once per worker instead.
        f0_predictor = utils.get_f0_predictor(
            f0p,
            sampling_rate=sampling_rate,
            hop_length=hop_length,
            device=device,
            threshold=0.05,
        )

        f0, uv = f0_predictor.compute_f0_uv(wav)

        # torch.from_numpy requires numpy arrays -- assumes the predictor
        # returns them; verify against utils.get_f0_predictor.
        data_to_save = {"f0": torch.from_numpy(f0), "uv": torch.from_numpy(uv)}
        torch.save(data_to_save, f0_path)


def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
    """Process one chunk of clips inside the current worker process.

    The worker's rank is the first element of the process identity tuple, or
    0 when that tuple is empty. When CUDA is available the ``device`` argument
    is replaced by ``cuda:<rank modulo GPU count>``. A speech encoder is then
    loaded once and ``process_one`` is called for every entry of
    ``file_chunk``.
    """
    logger.info("Loading speech encoder for content...")

    identity = mp.current_process()._identity
    worker_rank = identity[0] if identity else 0

    if torch.cuda.is_available():
        device = torch.device(f"cuda:{worker_rank % torch.cuda.device_count()}")
    logger.info(f"Rank {worker_rank} uses device {device}")

    encoder = utils.get_speech_encoder(speech_encoder, device=device)
    logger.info(f"Loaded speech encoder for rank {worker_rank}")

    # One progress bar per worker, placed on the row matching its rank.
    for item in tqdm(file_chunk, position=worker_rank):
        process_one(item, encoder, f0p, device, diff, mel_extractor)


def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
    """Split ``filenames`` into ``num_processes`` contiguous chunks and run
    ``process_batch`` on each chunk in its own worker process.

    Blocks until every worker has finished; an exception raised in a worker
    is re-raised here when its result is collected.
    """
    with ProcessPoolExecutor(max_workers=num_processes) as pool:
        total = len(filenames)
        # Chunk k covers filenames[cuts[k]:cuts[k + 1]].
        cuts = [int(k * total / num_processes) for k in range(num_processes + 1)]
        futures = [
            pool.submit(
                process_batch, filenames[lo:hi], f0p, diff, mel_extractor, device=device
            )
            for lo, hi in zip(cuts, cuts[1:])
        ]
        for future in tqdm(futures, position=0):
            future.result()


# Extract features for every (wav_path, speaker) pair currently held in `wavs`,
# using 7 worker processes.
parallel_process(wavs, 7, f0p, False, None, device)

In [None]:
from glob import glob
import wave
from tqdm import tqdm
import os

all_wavs = glob("/mnt/datasets/TTS_Data/en/FF7/**/*.wav", recursive=True)

# WARNING: destructive. Every FF7 clip whose length is outside (1.0 s, 7.0 s]
# is deleted from disk.
for wav_path in tqdm(all_wavs):
    # Read the clip length from the WAV header: frames / frame rate = seconds.
    with wave.open(wav_path, 'r') as wav_file:
        frames = wav_file.getnframes()
        frame_rate = wav_file.getframerate()

    duration_seconds = frames / float(frame_rate)

    # Delete only after the `with` block has closed the file: removing a file
    # that is still open fails on platforms that lock open files.
    if not (7.0 >= duration_seconds > 1.0):
        os.remove(wav_path)