# kNN-VC and LinearVC experiments using all data

Herman Kamper, 2024

In [1]:
from datetime import datetime
from numpy import linalg
from pathlib import Path
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from tqdm.notebook import tqdm
import celer
import IPython.display as display
import numpy as np
import sys
import torch
import torchaudio

from utils import fast_cosine_dist

In [2]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# `.to(device)` calls in this notebook do not fail on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Models

In [3]:
# WavLM-Large encoder from the kNN-VC torch.hub repository. It is used in the
# conversion cells to extract frame-level features from source utterances.
wavlm = torch.hub.load(
    "bshall/knn-vc",
    "wavlm_large",
    trust_repo=True,
    device=device,
)

Using cache found in /home/kamperh/.cache/torch/hub/bshall_knn-vc_master


WavLM-Large loaded with 315,453,120 parameters.


In [4]:
# HiFi-GAN vocoder from the kNN-VC torch.hub repository. The conversion cells
# call it on (converted) WavLM features to synthesise a waveform. The second
# value returned by the hub entry point is not used in this notebook.
hifigan, _ = torch.hub.load(
    "bshall/knn-vc",
    "hifigan_wavlm",
    trust_repo=True,
    device=device,
    prematched=True,
)

Removing weight norm...
[HiFiGAN] Generator loaded with 16,523,393 parameters.


Using cache found in /home/kamperh/.cache/torch/hub/bshall_knn-vc_master


## LinearVC

In [5]:
# Experiment identity: together these determine the output directory.
exp_tag = "2024-09-22"
subset = "test-clean"

# Hyperparameters
n_frames = 8192  # no. of source frames used to fit a projection (alt.: 15000)
k_top = 1  # no. of nearest target frames averaged per source frame

# Inputs
eval_csv = Path(f"data/speaker-sim-{subset}.csv")  # evaluation pairs
feats_dir = Path(f"/home/kamperh/scratch/{subset}/wavlm")  # <speaker>.npy files
wav_dir = Path(f"/home/kamperh/endgame/datasets/librispeech/LibriSpeech/{subset}")

# Output: converted utterances and projection matrices are written here.
output_dir = Path(f"/home/kamperh/scratch/linearvc/{subset}/{exp_tag}")

In [6]:
# Load the pre-extracted features of every speaker into memory, keyed by
# speaker ID (the file stem). The arrays are kept as untruncated NumPy arrays;
# the projection-matrix cell slices them and moves them to `device` itself.
print("Reading from:", feats_dir)
feats_dict = {
    feats_fn.stem: np.load(feats_fn)
    for feats_fn in tqdm(sorted(feats_dir.glob("*.npy")))
}
print("No. speakers:", len(feats_dict))

Reading from: /home/kamperh/scratch/test-clean/wavlm


  0%|          | 0/40 [00:00<?, ?it/s]

No. speakers: 40


In [7]:
# Projection matrices
#
# For every ordered (source, target) speaker pair, fit a linear map W such that
# `source_feats @ W` approximates the nearest-neighbour matched target
# features. The result (a NumPy array) is stored under "<source>-<target>".
projmats = {}
for source in tqdm(feats_dict):
    # Only the first `n_frames` source frames are used for fitting. This does
    # not depend on the target speaker, so it is prepared once per source.
    source_feats = (
        torch.from_numpy(feats_dict[source][:n_frames, :]).float().to(device)
    )
    source_feats_np = source_feats.cpu().numpy()

    for target in tqdm(feats_dict, leave=False):
        if source == target:
            continue

        # All target frames are candidates for matching (no truncation).
        target_feats = torch.from_numpy(feats_dict[target]).float().to(device)

        # For each source frame, average its `k_top` closest target frames
        # (smallest cosine distance) to obtain the regression target.
        dists = fast_cosine_dist(source_feats, target_feats, device=device)
        best = dists.topk(k=k_top, largest=False, dim=-1)
        linear_target = target_feats[best.indices].mean(dim=1)

        # Least-squares solution of `source_feats @ W = linear_target`.
        # `rcond=-1` is what NumPy < 2.0 used when `rcond` was omitted (while
        # emitting a FutureWarning); NumPy >= 2.0 changed the implicit default.
        # Passing it explicitly silences the warning and keeps the fitted
        # matrices independent of the installed NumPy version.
        W, _, _, _ = linalg.lstsq(
            source_feats_np, linear_target.cpu().numpy(), rcond=-1
        )

        # Alternatives that were tried instead of plain least squares:
        #   celer.Lasso(alpha=0.3, fit_intercept=False) and
        #   Ridge(alpha=1e4, fit_intercept=False), each fitted on
        #   (source_feats, linear_target) with W = linear.coef_.T

        projmats[f"{source}-{target}"] = W

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  W, _, _, _ = linalg.lstsq(source_feats.cpu(), linear_target.cpu())


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

In [10]:
# Persist all projection matrices in one compressed archive, one array per
# "<source>-<target>" key. The file name records the `n_frames` setting.
npz_fn = output_dir / f"projmats_{n_frames}.npz"
output_dir.mkdir(parents=True, exist_ok=True)

print("Writing:", npz_fn)
np.savez_compressed(npz_fn, **projmats)

# To reuse previously saved matrices instead of recomputing them:
# print("Reading:", npz_fn)
# projmats = np.load(npz_fn)

Writing: /home/kamperh/scratch/linearvc/test-clean/2024-09-21/projmats_4096.npz


In [8]:
# LinearVC conversion of the evaluation set: for each selected row of the CSV,
# extract WavLM features for the source utterance, project them with the
# pre-computed source-to-target matrix, vocode with HiFi-GAN and save the
# result under `output_dir`.
output_dir.mkdir(parents=True, exist_ok=True)
print("Writing to:", output_dir)
with open(eval_csv) as f:
    for line in tqdm(f.readlines()):
        line = line.strip()

        # Only rows whose last character is "0" are converted. `endswith` also
        # copes with blank lines, on which `line[-1]` would raise IndexError.
        if not line.endswith("0"):
            continue

        (source, target, source_key, _, _) = line.split(",")

        # The part of `source_key` before "/" names the source utterance (and
        # the output sub-directory); the part after it names the output file.
        source_key_parts = source_key.split("/")
        utterance, output_name = source_key_parts[0], source_key_parts[1]

        # LibriSpeech layout: the first two "-"-separated fields of the key
        # are used as the two directory levels below `wav_dir`.
        source_key_split = source_key.split("-")
        source_wav_fn = (
            wav_dir / source_key_split[0] / source_key_split[1] / utterance
        ).with_suffix(".flac")
        # # VCTK
        # source_wav_fn = (
        #     wav_dir / source / Path(source_key).stem
        # ).with_suffix(".wav")

        # Features (the sample rate returned by the loader is ignored; the
        # output below is written at 16 kHz).
        source_wav, _ = torchaudio.load(source_wav_fn)
        source_wav = source_wav.to(device)
        with torch.inference_mode():
            source_feats, _ = wavlm.extract_features(
                source_wav, output_layer=6
            )

        # Projection. Only the pre-computed matrix is needed at this point,
        # so the target speaker's feature matrix is not loaded or moved to
        # the device here.
        W_source_to_target = (
            torch.from_numpy(projmats[f"{source}-{target}"]).float().to(device)
        )
        source_to_target_feats = source_feats @ W_source_to_target

        # Vocoding
        with torch.inference_mode():
            wav_hat = hifigan(source_to_target_feats).squeeze(0)

        cur_output_dir = Path(output_dir) / utterance
        cur_output_dir.mkdir(parents=True, exist_ok=True)
        output_fn = (cur_output_dir / output_name).with_suffix(".wav")
        torchaudio.save(output_fn, wav_hat.squeeze().cpu()[None], 16000)

Writing to: /home/kamperh/scratch/linearvc/test-clean/2024-09-22


  0%|          | 0/15601 [00:00<?, ?it/s]

  return F.conv1d(input, weight, bias, self.stride,


In [9]:
class Arguments:
    """Plain attribute container for the evaluation settings set below."""


args = Arguments()
args.format = "librispeech"
args.eval_csv = eval_csv
args.converted_dir = output_dir
args.groundtruth_dir = wav_dir

# Shell commands for evaluating the converted utterances; they are only
# printed here, not executed.
speaker_similarity_cmd = (
    f"./speaker_similarity.py --format {args.format}"
    f" {args.eval_csv} {args.converted_dir} {args.groundtruth_dir}"
)
intelligibility_cmd = (
    f"./intelligibility.py --format {args.format} {args.converted_dir}"
    f" {args.groundtruth_dir}"
    # f" /home/kamperh/endgame/datasets/VCTK-Corpus/txt/"
)

print("Run:")
print(speaker_similarity_cmd)
print(intelligibility_cmd)

# speaker_similarity(args)

Run:
./speaker_similarity.py --format librispeech data/speaker-sim-test-clean.csv /home/kamperh/scratch/linearvc/test-clean/2024-09-22 /home/kamperh/endgame/datasets/librispeech/LibriSpeech/test-clean
./intelligibility.py --format librispeech /home/kamperh/scratch/linearvc/test-clean/2024-09-22 /home/kamperh/endgame/datasets/librispeech/LibriSpeech/test-clean


Results:

               eer
    mean  0.337051
    std   0.097225

    WER: 4.93% +- 0.06%
    CER: 2.55% +- 0.03%

## kNN-VC

In [7]:
# Experiment identity: together these determine the output directory.
exp_tag = "2024-09-17"
subset = "test-clean"

# Hyperparameters
n_frames = None  # max. frames kept per speaker; None keeps all (alt.: 15000)
k_top = 4  # no. of nearest target frames averaged per source frame

# Inputs
eval_csv = Path(f"data/speaker-sim-{subset}.csv")  # evaluation pairs
# eval_csv = Path("data/speakersim_vctk_english.csv")
feats_dir = Path(f"/home/kamperh/scratch/{subset}/wavlm")  # <speaker>.npy files
wav_dir = Path(f"/home/kamperh/endgame/datasets/librispeech/LibriSpeech/{subset}")

# Output: converted utterances are written here.
output_dir = Path(f"/home/kamperh/scratch/knnvc/{subset}/{exp_tag}")

In [16]:
# Load every speaker's pre-extracted features straight onto `device` as float
# tensors, keyed by speaker ID (the file stem). At most the first `n_frames`
# frames are kept; with `n_frames = None` the slice keeps all of them.
print("Reading from:", feats_dir)
feats_dict = {}
for feats_fn in tqdm(sorted(feats_dir.glob("*.npy"))):
    speaker_feats = torch.from_numpy(np.load(feats_fn))
    feats_dict[feats_fn.stem] = speaker_feats[:n_frames, :].float().to(device)
print("No. speakers:", len(feats_dict))

Reading from: /home/kamperh/scratch/test-clean/wavlm


  0%|          | 0/40 [00:00<?, ?it/s]

No. speakers: 40


In [19]:
# kNN-VC conversion of the evaluation set: for each selected row of the CSV,
# extract WavLM features for the source utterance, replace every frame by the
# mean of its `k_top` nearest target-speaker frames, vocode with HiFi-GAN and
# save the result under `output_dir`.
#
# Expects the values of `feats_dict` to be tensors already on `device`.
output_dir.mkdir(parents=True, exist_ok=True)
print("Writing to:", output_dir)
with open(eval_csv) as f:
    for line in tqdm(f.readlines()):
        line = line.strip()

        # Only rows whose last character is "0" are converted. `endswith` also
        # copes with blank lines, on which `line[-1]` would raise IndexError.
        if not line.endswith("0"):
            continue

        (source, target, source_key, _, _) = line.split(",")

        # The part of `source_key` before "/" names the source utterance (and
        # the output sub-directory); the part after it names the output file.
        source_key_parts = source_key.split("/")
        utterance, output_name = source_key_parts[0], source_key_parts[1]

        # LibriSpeech layout: the first two "-"-separated fields of the key
        # are used as the two directory levels below `wav_dir`.
        source_key_split = source_key.split("-")
        source_wav_fn = (
            wav_dir / source_key_split[0] / source_key_split[1] / utterance
        ).with_suffix(".flac")
        # # VCTK
        # source_wav_fn = (
        #     wav_dir / source / Path(source_key).stem
        # ).with_suffix(".wav")

        # Features (the sample rate returned by the loader is ignored; the
        # output below is written at 16 kHz).
        source_wav, _ = torchaudio.load(source_wav_fn)
        source_wav = source_wav.to(device)
        with torch.inference_mode():
            source_feats, _ = wavlm.extract_features(
                source_wav, output_layer=6
            )
        target_feats = feats_dict[target]

        # Matching. Only the leading (batch) axis is dropped: a bare
        # `.squeeze()` would also collapse the time axis of an utterance that
        # yields a single frame. The batch axis is restored with `[None]`.
        dists = fast_cosine_dist(
            source_feats.squeeze(0), target_feats, device=device
        )
        best = dists.topk(k=k_top, largest=False, dim=-1)
        source_to_target_feats = target_feats[best.indices].mean(dim=1)[None]

        # Vocoding
        with torch.inference_mode():
            wav_hat = hifigan(source_to_target_feats).squeeze(0)

        cur_output_dir = Path(output_dir) / utterance
        cur_output_dir.mkdir(parents=True, exist_ok=True)
        output_fn = (cur_output_dir / output_name).with_suffix(".wav")
        torchaudio.save(output_fn, wav_hat.squeeze().cpu()[None], 16000)

Writing to: /home/kamperh/scratch/knnvc/test-clean/2024-09-17


  0%|          | 0/15601 [00:00<?, ?it/s]

In [8]:
class Arguments:
    """Plain attribute container for the evaluation settings set below."""


args = Arguments()
args.format = "librispeech"
args.eval_csv = eval_csv
args.converted_dir = output_dir
args.groundtruth_dir = wav_dir

# Shell commands for evaluating the converted utterances; they are only
# printed here, not executed.
speaker_similarity_cmd = (
    f"./speaker_similarity.py --format {args.format}"
    f" {args.eval_csv} {args.converted_dir} {args.groundtruth_dir}"
)
intelligibility_cmd = (
    f"./intelligibility.py --format {args.format} {args.converted_dir}"
    f" {args.groundtruth_dir}"
    # f" /home/kamperh/endgame/datasets/VCTK-Corpus/txt/"
)

print("Run:")
print(speaker_similarity_cmd)
print(intelligibility_cmd)

# speaker_similarity(args)

Run:
./speaker_similarity.py --format librispeech data/speaker-sim-test-clean.csv /home/kamperh/scratch/knnvc/test-clean/2024-09-17 /home/kamperh/endgame/datasets/librispeech/LibriSpeech/test-clean
./intelligibility.py --format librispeech /home/kamperh/scratch/knnvc/test-clean/2024-09-17 /home/kamperh/endgame/datasets/librispeech/LibriSpeech/test-clean


Results:

               eer
    mean  0.389744
    std   0.094324

    WER: 5.57% +- 0.07%
    CER: 2.87% +- 0.04%