# kNN-VC and LinearVC experiments using all data

Herman Kamper, 2024

The source frames only come from the input utterance, i.e. no other speech from
the source speaker is used.

In [1]:
from datetime import datetime
from numpy import linalg
from pathlib import Path
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from tqdm.notebook import tqdm
import IPython.display as display
import numpy as np
import sys
import torch
import torchaudio

from utils import fast_cosine_dist

In [2]:
# Prefer the GPU, but fall back to the CPU when CUDA is unavailable so that
# the model-loading and tensor `.to(device)` calls below do not fail outright.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Models

In [None]:
# Feature extractor: the "wavlm_large" entry point of the bshall/knn-vc
# torch.hub repository, created on `device`.
wavlm = torch.hub.load(
    "bshall/knn-vc",
    "wavlm_large",
    trust_repo=True,
    device=device,
)

In [None]:
# Vocoder: the "hifigan_wavlm" entry point of the bshall/knn-vc torch.hub
# repository, created on `device`. The call returns a pair; only the first
# element (the vocoder) is kept.
# NOTE(review): `prematched=True` presumably selects the checkpoint trained
# on prematched features -- confirm against the hub repository.
hifigan, _ = torch.hub.load(
    "bshall/knn-vc",
    "hifigan_wavlm",
    trust_repo=True,
    device=device,
    prematched=True,
)

## LinearVC

In [None]:
# --- LinearVC experiment configuration ---

# Hyper-parameters.
k_top = 1  # nearest target frames averaged per source frame
n_frames = 8192  # rows kept per speaker when loading features (alt.: 15000)

# Run tag; it becomes the last component of the output directory.
exp_tag = "all_2024-09-17"

# Inputs: trial list, per-speaker feature files, and source waveforms.
eval_csv = Path("data/speakersim_vctk_english.csv")
feats_dir = Path("/home/kamperh/scratch/vctk/wavlm")
wav_dir = Path("/home/kamperh/scratch/vctk/wav")

# Destination for the converted audio.
output_dir = Path(f"/home/kamperh/scratch/linearvc/vctk/{exp_tag}")

In [None]:
# Load one feature matrix per speaker from the `*.npy` files in `feats_dir`.
# Each entry is keyed by the file stem (the speaker ID), truncated to at most
# `n_frames` rows, cast to float32 and moved to `device`.
feats_dict = {}
print("Reading from:", feats_dir)
for speaker_feats_fn in tqdm(sorted(feats_dir.glob("*.npy"))):
    speaker = speaker_feats_fn.stem
    # `n_frames=None` keeps every row, because `[:None]` is a full slice.
    feats_dict[speaker] = (
        torch.from_numpy(np.load(speaker_feats_fn))[:n_frames, :]
        .float()
        .to(device)
    )
# Report how many speakers were loaded. `feats_dir` is a Path, which has no
# length, so the count must come from the dictionary.
print("No. speakers:", len(feats_dict))

In [None]:
# Projection matrices: one linear map per ordered (source, target) speaker
# pair, stored under the key "<source>-<target>".
# NOTE(review): every matrix is kept on `device`, so memory grows with the
# square of the number of speakers.
projmats = {}
for source in tqdm(feats_dict):
    # Invariant across the inner loop: look the source features up once and
    # copy them to the CPU once instead of once per target speaker.
    source_feats = feats_dict[source]
    source_feats_cpu = source_feats.squeeze().cpu()

    for target in tqdm(feats_dict, leave=False):
        if source == target:
            continue

        target_feats = feats_dict[target]

        # For each source frame, take the `k_top` target frames with the
        # smallest distance and average them to form the regression target.
        dists = fast_cosine_dist(source_feats, target_feats, device=device)
        best = dists.topk(k=k_top, largest=False, dim=-1)
        linear_target = target_feats[best.indices].mean(dim=1)

        # Ridge regression without an intercept, fitted on the CPU.
        # (A plain least-squares fit via `linalg.lstsq` is the alternative.)
        linear = Ridge(alpha=1e4, fit_intercept=False).fit(
            source_feats_cpu, linear_target.cpu()
        )
        # `coef_` maps inputs to outputs as `X @ coef_.T`; store the
        # transposed form so that `source_feats @ W` gives the prediction.
        W = linear.coef_.T

        W = torch.from_numpy(W).float().to(device)
        projmats[f"{source}-{target}"] = W

In [None]:
# LinearVC conversion: for every selected trial in `eval_csv`, extract the
# source utterance's features, apply the source->target projection matrix,
# vocode, and write the result to
# <output_dir>/<first part of source_key>/<second part of source_key>.wav.
output_dir.mkdir(parents=True, exist_ok=True)
print("Writing to:", output_dir)
with open(eval_csv) as f:
    for line in tqdm(f.readlines()):
        line = line.strip()
        # Only rows whose final character is "0" are converted. `endswith`
        # is also safe on blank lines, where indexing `line[-1]` would raise
        # IndexError.
        if not line.endswith("0"):
            continue
        (source, target, source_key, _, _) = line.split(",")

        # Source waveform: <wav_dir>/<source>/<stem of source_key>.wav
        source_wav_fn = (
            wav_dir / source / Path(source_key).stem
        ).with_suffix(".wav")
        source_wav, _ = torchaudio.load(source_wav_fn)
        source_wav = source_wav.to(device)
        with torch.inference_mode():
            source_feats, _ = wavlm.extract_features(
                source_wav, output_layer=6
            )

        # Map the source features into the target speaker's space.
        W_source_to_target = projmats[f"{source}-{target}"]
        source_to_target_feats = source_feats @ W_source_to_target

        with torch.inference_mode():
            wav_hat = hifigan(source_to_target_feats).squeeze(0)

        # `source_key` has the form "<subdir>/<name>"; mirror that layout
        # under the output directory.
        key_parts = source_key.split("/")
        cur_output_dir = Path(output_dir) / key_parts[0]
        cur_output_dir.mkdir(parents=True, exist_ok=True)
        output_fn = (cur_output_dir / key_parts[1]).with_suffix(".wav")
        # Saved as a single-channel (1, samples) tensor at 16 kHz.
        torchaudio.save(output_fn, wav_hat.squeeze().cpu()[None], 16000)

        # No debugging stop here: the loop must run over the whole trial
        # list so that the evaluation scripts find every converted file.

In [None]:
class Arguments:
    """Empty namespace; attributes are attached after instantiation."""


# Bundle the evaluation settings for this experiment on one object.
args = Arguments()
args.format = "vctk"
args.eval_csv = eval_csv
args.converted_dir = output_dir
args.groundtruth_dir = wav_dir

# Show the shell commands that evaluate the converted audio, one per line.
print(
    "Run:",
    (
        f"./speaker_similarity.py --format {args.format}"
        f" {args.eval_csv} {args.converted_dir} {args.groundtruth_dir}"
    ),
    (
        f"./intelligibility.py --format {args.format} {args.converted_dir}"
        f" /home/kamperh/endgame/datasets/VCTK-Corpus/txt/"
    ),
    sep="\n",
)

# In-notebook alternative to the first command (currently disabled):
# speaker_similarity(args)

## kNN-VC

In [None]:
# --- kNN-VC experiment configuration ---

# Hyper-parameters.
k_top = 4  # nearest target frames averaged per source frame
# NOTE(review): features are sliced with `n_frames` when they are loaded into
# `feats_dict`; reassigning it here does not change an already-loaded
# `feats_dict`. Re-run the loading cell if the full pool is intended.
n_frames = None  # alt.: 15000

# Run tag; it becomes the last component of the output directory.
exp_tag = "all_2024-09-17"

# Inputs: trial list, per-speaker feature files, and source waveforms.
eval_csv = Path("data/speakersim_vctk_english.csv")
feats_dir = Path("/home/kamperh/scratch/vctk/wavlm")
wav_dir = Path("/home/kamperh/scratch/vctk/wav")

# Destination for the converted audio.
output_dir = Path(f"/home/kamperh/scratch/knnvc/vctk/{exp_tag}")

In [None]:
# kNN-VC conversion: for every selected trial in `eval_csv`, replace each
# source frame by the mean of its `k_top` closest frames from the target
# speaker's pool, vocode, and write the result to
# <output_dir>/<first part of source_key>/<second part of source_key>.wav.
output_dir.mkdir(parents=True, exist_ok=True)
print("Writing to:", output_dir)
with open(eval_csv) as f:
    for line in tqdm(f.readlines()):
        line = line.strip()
        # Only rows whose final character is "0" are converted. `endswith`
        # is also safe on blank lines, where indexing `line[-1]` would raise
        # IndexError.
        if not line.endswith("0"):
            continue
        (source, target, source_key, _, _) = line.split(",")

        # Features
        source_wav_fn = (
            wav_dir / source / Path(source_key).stem
        ).with_suffix(".wav")
        source_wav, _ = torchaudio.load(source_wav_fn)
        source_wav = source_wav.to(device)
        with torch.inference_mode():
            source_feats, _ = wavlm.extract_features(
                source_wav, output_layer=6
            )
        # The matching below works frame-by-frame along axis 0 (`topk` over
        # the last axis, then `mean(dim=1)` over the neighbours, then
        # `[None]` to add a batch axis), so drop the extractor's leading
        # batch axis first.
        # NOTE(review): assumes `fast_cosine_dist` takes 2-D
        # (frames, dim) inputs, as in the projection-matrix cell -- confirm
        # against `utils`.
        source_feats = source_feats.squeeze(0)
        target_feats = feats_dict[target]

        # Matching
        dists = fast_cosine_dist(source_feats, target_feats, device=device)
        best = dists.topk(k=k_top, largest=False, dim=-1)
        source_to_target_feats = target_feats[best.indices].mean(dim=1)[None]

        with torch.inference_mode():
            wav_hat = hifigan(source_to_target_feats).squeeze(0)

        # `source_key` has the form "<subdir>/<name>"; mirror that layout
        # under the output directory.
        key_parts = source_key.split("/")
        cur_output_dir = Path(output_dir) / key_parts[0]
        cur_output_dir.mkdir(parents=True, exist_ok=True)
        output_fn = (cur_output_dir / key_parts[1]).with_suffix(".wav")
        # Saved as a single-channel (1, samples) tensor at 16 kHz.
        torchaudio.save(output_fn, wav_hat.squeeze().cpu()[None], 16000)

        # No debugging stop here: the loop must run over the whole trial
        # list so that the evaluation scripts find every converted file.

In [None]:
class Arguments:
    """Empty namespace; attributes are attached after instantiation."""


# Bundle the evaluation settings for the kNN-VC run on one object.
args = Arguments()
args.format = "vctk"
args.eval_csv = eval_csv
args.converted_dir = output_dir
args.groundtruth_dir = wav_dir

# Show the shell commands that evaluate the converted audio, one per line.
print(
    "Run:",
    (
        f"./speaker_similarity.py --format {args.format}"
        f" {args.eval_csv} {args.converted_dir} {args.groundtruth_dir}"
    ),
    (
        f"./intelligibility.py --format {args.format} {args.converted_dir}"
        f" /home/kamperh/endgame/datasets/VCTK-Corpus/txt/"
    ),
    sep="\n",
)

# In-notebook alternative to the first command (currently disabled):
# speaker_similarity(args)