# LinearVC demo

In [2]:
from pathlib import Path
import IPython.display as display
import linearvc
import torch
import torchaudio
import torchaudio.functional as F

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA (the device string is passed on to the model below).
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Load the WavLM feature extractor and the HiFiGAN vocoder through torch.hub.
# Both entry points live in the same hub repository and share these options.
hub_repo = "bshall/knn-vc"
hub_options = dict(trust_repo=True, progress=True, device=device)

wavlm = torch.hub.load(hub_repo, "wavlm_large", **hub_options)

# The vocoder entry point returns a pair; only the first element is kept.
hifigan, _ = torch.hub.load(
    hub_repo,
    "hifigan_wavlm",
    prematched=True,
    **hub_options,
)

Using cache found in /home/kamperh/.cache/torch/hub/bshall_knn-vc_master


WavLM-Large loaded with 315,453,120 parameters.
Removing weight norm...
[HiFiGAN] Generator loaded with 16,523,393 parameters.


Using cache found in /home/kamperh/.cache/torch/hub/bshall_knn-vc_master


In [4]:
# Build the LinearVC model from the loaded WavLM feature extractor, the
# HiFiGAN vocoder and the selected device.
linearvc_model = linearvc.LinearVC(wavlm, hifigan, device)

In [5]:
# Lists of source and target audio files
n_audio = 50
# NOTE(review): absolute, machine-specific path -- point this at your local
# LibriSpeech dev-clean copy before running.
librispeech_dev_dir = Path("/home/kamperh/endgame/datasets/librispeech/LibriSpeech/dev-clean/")

# Sort before slicing: directory traversal order is filesystem-dependent, so an
# unsorted slice can pick a different subset of utterances (and a different
# target_wavs[0]) from one machine or run to the next.
# The source slice starts at index 1, i.e. it skips the first sorted file --
# presumably the utterance held out as the conversion input; confirm.
source_wavs = sorted((librispeech_dev_dir / "1462").rglob("*.flac"))[1:n_audio + 1]
target_wavs = sorted((librispeech_dev_dir / "2086").rglob("*.flac"))[:n_audio]

In [12]:
# Source input utterance: extract the features that will be projected later,
# then load the recording so it can be played back as this cell's output.
input_wav = librispeech_dev_dir / "1462/170138/1462-170138-0000.flac"
input_features = linearvc_model.get_features(input_wav)

wav, sr = torchaudio.load(input_wav)
input_player = display.Audio(data=wav, rate=sr)
input_player

In [13]:
# Reference audio example: play the first file in the target list
reference_wav = target_wavs[0]
wav, sr = torchaudio.load(reference_wav)
reference_player = display.Audio(data=wav, rate=sr)
reference_player

In [14]:
# The voice conversion projection matrix, computed from the source and target
# utterance lists built above.
# NOTE(review): `parallel=False` presumably tells the model that the two lists
# do not contain the same spoken content, and `vad=False` presumably disables
# voice activity detection -- confirm both against the linearvc documentation.
W = linearvc_model.get_projmat(
    source_wavs,
    target_wavs,
    parallel=False,
    vad=False,
)

Source features:


100%|█████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 44.50it/s]


Target features:


100%|█████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 38.98it/s]


In [15]:
# Project the input features with the projection matrix W and vocode the result
output_wav = linearvc_model.project_and_vocode(input_features, W)

In [16]:
# Play the converted output, using the rate stored on the model
output_player = display.Audio(data=output_wav, rate=linearvc_model.sr)
output_player