# Vocos
[![Package badge]][github]
[![Open In Colab]][notebook]

Author: [tarepan]

[github]:https://github.com/tarepan/vocos-official
[notebook]:https://colab.research.google.com/github/tarepan/vocos-official/blob/main/vocos.ipynb
[tarepan]:https://github.com/tarepan
[Package badge]:https://img.shields.io/badge/GitHub-vocos-9cf.svg
[Open In Colab]:https://colab.research.google.com/assets/colab-badge.svg

## Inference

### Setup

In [None]:
!pip uninstall -y vocos
!pip install git+https://github.com/tarepan/vocos-official

### Run

In [None]:
import multiprocessing

# Report the CPU core count of this runtime (context for the synthesis timings).
n_cores = multiprocessing.cpu_count()
print(f"Cores: {n_cores}")

Cores: 2


In [None]:
import torchaudio
from IPython.display import Audio, display
from vocos import Vocos
import librosa
import torch
import time


# Load the pretrained model once and reuse it for every sample.
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")

wav_paths = [
    "./hiroshiba_normal_001.wav",
    "./panda_clean001.wav",
    "./jvs010_VOICEACTRESS100_100.wav",
]

for p in wav_paths:
    # Load as 24 kHz mono and add a leading axis :: (T,) -> (1, T)
    y, sr = librosa.load(p, sr=24000, mono=True)
    y = torch.from_numpy(y).unsqueeze(0)

    # Time only the model call (resynthesis of the loaded waveform)
    t_start = time.perf_counter()
    y_hat = vocos(y)
    t_synth = time.perf_counter() - t_start

    # Duration of the synthesized audio [sec]; `sr` is the 24000 requested from librosa above
    n_samples = len(y_hat[0])
    t_y = n_samples / sr

    # NOTE: the value labelled "RTF" is audio duration / synthesis time, so larger means faster.
    print(f"Synthesized. {round(t_synth, 2)} [sec] for {round(t_y, 2)} [sec] audio, RTF={round(t_y/t_synth, 1)}")

    # Original first, then the resynthesized waveform
    for wave in (y, y_hat):
        display(Audio(wave, rate=sr, normalize=True))
    print("=========================================")


Synthesized. 0.34 [sec] for 6.31 [sec] audio, RTF=18.7


Synthesized. 0.41 [sec] for 5.87 [sec] audio, RTF=14.4


Synthesized. 0.61 [sec] for 10.5 [sec] audio, RTF=17.3




## Train

### Setup

In [None]:
# Mount Google Drive into the Colab filesystem
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

In [None]:
!git clone https://github.com/tarepan/vocos-official
%cd vocos-official

### Preprocessing

In [None]:
from pathlib import Path

import torch
import torchaudio
import numpy as np
import speechcorpusy


def preprocess(p: Path, sampling_rate: int):
    """Load an audio file as a mono waveform at the requested sampling rate.

    Args:
        p: Path of the audio file, read with `torchaudio.load`.
        sampling_rate: Target sampling rate; resampling runs only when the file's rate differs.

    Returns:
        The waveform tensor, channel-averaged when the file has more than one channel
        and resampled when its rate is not `sampling_rate`.
    """
    waveform, native_rate = torchaudio.load(p)

    # Downmix by averaging over channels :: (Channel, T) -> (1, T)
    is_multichannel = waveform.size(0) > 1
    waveform = waveform.mean(dim=0, keepdim=True) if is_multichannel else waveform

    # Already at the target rate: nothing more to do
    if native_rate == sampling_rate:
        return waveform

    return torchaudio.functional.resample(waveform, orig_freq=native_rate, new_freq=sampling_rate)


# Config
p_data = Path("./data")   # root directory of the preprocessed copies
corpus_name = "JVS"
sampling_rate = 24000     # sampling rate of the saved waveforms
n_val = 10                # upper bound on the number of evaluation utterances

p_data.mkdir(exist_ok=True)
corpus = speechcorpusy.load_preset(corpus_name, root="/content/gdrive/MyDrive/ML_data")
corpus.get_contents()
utters_per_spks = corpus.get_identities_per_speaker()

# Split :: the last utterance of each speaker is held out for evaluation, the rest is for training.
# Flattened with comprehensions; `sum(lists, [])` re-copies the accumulated list at every step.
utters_train = [utter for utters_per_spk in utters_per_spks for utter in utters_per_spk[:-1]]
utters_eval  = [utter for utters_per_spk in utters_per_spks for utter in utters_per_spk[-1:]][:n_val]

for utters, split in [(utters_train, "train"), (utters_eval, "eval")]:
    ps_processed: list[str] = []
    for item_id in utters:
        p = Path(corpus.get_item_path(item_id))
        y = preprocess(p, sampling_rate)
        # Save under `p_data`, mirroring the corpus layout.
        # With an absolute `p`, `p_data / p` would collapse to `p` itself and the save would
        # overwrite the source file, so the anchor is dropped first (relative paths are unaffected).
        p_rel = p.relative_to(p.anchor) if p.is_absolute() else p
        p_y = p_data / p_rel
        p_y.parent.mkdir(parents=True, exist_ok=True)
        torchaudio.save(p_y, y, sampling_rate, encoding="PCM_S", bits_per_sample=16)
        ps_processed.append(str(p_y))
    # One processed path per line
    with open(f"./filelist.{split}", 'w') as f:
        f.write("\n".join(ps_processed))


### Run

In [None]:
# Launch training with the settings in configs/vocos.yaml (paths are relative to the repository root entered by the earlier `%cd`).
!python train.py -c configs/vocos.yaml