In [1]:
import os
import sys
import numpy as np

import torch, torchaudio

sys.path.append('../training')
from utils import audio_utils, data_utils

# Hubert Soft pretrained model

In [2]:
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda()

Using cache found in /root/.cache/torch/hub/bshall_hubert_main


# Save HuBert Soft embedding

## VCTK

In [16]:
audio_dir = "/data1/vctk/modified/wav16_cleaned"
audio_folders = os.listdir(os.path.join(audio_dir))
cnt = 0
for i, audio_folder in enumerate(audio_folders):
    audio_names = os.listdir(os.path.join(audio_dir, audio_folder))
    for j, audio_name in enumerate(audio_names):
        audio_path = os.path.join(audio_dir, audio_folder, audio_name)
        save_path = audio_path.replace("wav16_cleaned", "hubert").replace('.wav', '.emb')
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        wav, sr = torchaudio.load(audio_path)
        assert sr == 16000
        wav = wav.unsqueeze(0).cuda()
        units = hubert.units(wav)
        torch.save(units.cpu(), save_path)
        print(f"\r{i}/{len(audio_folders)} of {j}/{len(audio_names)} - {np.round((cnt/43342*100),2)}%", end="")
        cnt += 1

## VoxCeleb

In [6]:
audio_dir = "/disk2/VoxCeleb2/VoxCeleb2/original"
audio_folders = os.listdir(os.path.join(audio_dir))
cnt = 0
for i, audio_folder in enumerate(audio_folders):
    audio_names = os.listdir(os.path.join(audio_dir, audio_folder))
    for j, audio_name in enumerate(audio_names):
        audio_path = os.path.join(audio_dir, audio_folder, audio_name)
        save_path = audio_path.replace("original", "modified/hubert_soft").replace('.wav', '.emb')
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        wav, sr = torchaudio.load(audio_path)
        assert sr == 16000
        wav = wav.unsqueeze(0).cuda()
        units = hubert.units(wav)
        torch.save(units.cpu(), save_path)
        print(f"\r{i}/{len(audio_folders)} of {j}/{len(audio_names)} - {np.round((cnt/1128246*100),2)}%", end="")
        cnt += 1

6111/6112 of 178/179 - 100.0%

## Temp (vctk filelists making)

In [4]:
test_speakers = ['p261', 'p225', 'p294', 'p347', 'p238', 'p234', 'p248', 'p335', 'p245', 'p326', 'p302']
print(len(test_speakers))

11


In [10]:
typ_ = "train" # "train" or "valid" or "test"
train_speakers = data_utils.load_text(f"../training/filelists/vctk_speaker_{typ_}.txt")

typ_ = "valid" # "train" or "valid" or "test"
valid_speakers = data_utils.load_text(f"../training/filelists/vctk_speaker_{typ_}.txt")

typ_ = "test" # "train" or "valid" or "test"
test_speakers = data_utils.load_text(f"../training/filelists/vctk_speaker_{typ_}.txt")
print(len(train_speakers), len(valid_speakers), len(test_speakers))

88 10 10


In [11]:
# save
data_root = "/disk2/vctk/modified/wav16_cleaned"

f = open("../training/filelists/vctk_train.txt", 'w')
data = []
for speaker in train_speakers:
    files = os.listdir(os.path.join(data_root, speaker))
    for file in files:
        data = os.path.join(data_root, speaker, file)
        f.write(data + '\n')
f.close()

f = open("../training/filelists/vctk_valid.txt", 'w')
for speaker in valid_speakers:
    files = os.listdir(os.path.join(data_root, speaker))
    for file in files:
        data = os.path.join(data_root, speaker, file)
        f.write(data + '\n')
f.close()

f = open("../training/filelists/vctk_test.txt", 'w')
for speaker in test_speakers:
    files = os.listdir(os.path.join(data_root, speaker))
    for file in files:
        data = os.path.join(data_root, speaker, file)
        f.write(data + '\n')
f.close()

## Temp (VoxCeleb filelists making)

In [12]:
typ_ = "train" # "train" or "valid" or "test"
train_speakers = data_utils.load_text(f"../training/filelists/VGG_Face/VGG_Face_Spk_{typ_}.txt")

typ_ = "valid" # "train" or "valid" or "test"
valid_speakers = data_utils.load_text(f"../training/filelists/VGG_Face/VGG_Face_Spk_{typ_}.txt")

typ_ = "test" # "train" or "valid" or "test"
test_speakers = data_utils.load_text(f"../training/filelists/VGG_Face/VGG_Face_Spk_{typ_}.txt")
print(len(train_speakers), len(valid_speakers), len(test_speakers))

5891 100 118


In [15]:
# save
data_root = "/disk2/VoxCeleb2/VoxCeleb2/original"

f = open("../training/filelists/vox_train.txt", 'w')
data = []
for speaker in train_speakers:
    files = os.listdir(os.path.join(data_root, speaker))
    for file in files:
        data = os.path.join(data_root, speaker, file)
        f.write(data + '\n')
f.close()

f = open("../training/filelists/vox_valid.txt", 'w')
for speaker in valid_speakers:
    files = os.listdir(os.path.join(data_root, speaker))
    for file in files:
        data = os.path.join(data_root, speaker, file)
        f.write(data + '\n')
f.close()

f = open("../training/filelists/vox_test.txt", 'w')
for speaker in test_speakers:
    files = os.listdir(os.path.join(data_root, speaker))
    for file in files:
        data = os.path.join(data_root, speaker, file)
        f.write(data + '\n')
f.close()