# kNN-VC and LinearVC experiments using parallel data

Herman Kamper, 2024

In [2]:
from datetime import datetime
from numpy import linalg
from pathlib import Path
from tqdm.notebook import tqdm
import celer
import IPython.display as display
import numpy as np
import sys
import torch
import torchaudio

from reduced_rank_ridge import ReducedRankRidge
from utils import fast_cosine_dist

In [3]:
from resample_vad import speakers
# Run on the GPU when one is available; otherwise fall back to the CPU so that
# the notebook still executes (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Models

In [4]:
# WavLM-Large model from the kNN-VC torch.hub repository; used below through
# `wavlm.extract_features(...)`.
wavlm = torch.hub.load(
    "bshall/knn-vc",
    "wavlm_large",
    trust_repo=True,
    device=device,
)

Using cache found in /home/kamperh/.cache/torch/hub/bshall_knn-vc_master


WavLM-Large loaded with 315,453,120 parameters.


In [5]:
# HiFi-GAN vocoder from the kNN-VC torch.hub repository. The hub entry returns
# a pair; only the first item (the generator) is used in this notebook.
hifigan, _ = torch.hub.load(
    "bshall/knn-vc",
    "hifigan_wavlm",
    trust_repo=True,
    device=device,
    prematched=True,
)

Removing weight norm...
[HiFiGAN] Generator loaded with 16,523,393 parameters.


Using cache found in /home/kamperh/.cache/torch/hub/bshall_knn-vc_master


## LinearVC using parallel utterances (single)

In [4]:
# Number of closest target frames averaged per source frame during matching
k_top = 1

# Root of the VCTK wav files, with one sub-directory per speaker
wav_dir = Path("/home/kamperh/scratch/vctk/wav/")

In [18]:
# Projection matrices
#
# Fit a linear map W from the source speaker's WavLM features to the target
# speaker's, using a single parallel utterance pair (the same utterance number
# spoken by both speakers).

source = "p225"  # Southern English
target = "p226"  # Surrey
# target = "p232"  # Southern English
# target = "p228"  # Southern English
# target = "p234"  # Scottish
# target = "p323"  # South African
# target = "p347"  # South African
# target = "p376"  # Indian

# source_wav_fn = wav_dir / source / f"{source}_002.wav"
# target_wav_fn = wav_dir / target / f"{target}_002.wav"
source_wav_fn = wav_dir / source / f"{source}_008.wav"
target_wav_fn = wav_dir / target / f"{target}_008.wav"
# source_wav_fn = wav_dir / source / f"{source}_023.wav"
# target_wav_fn = wav_dir / target / f"{target}_023.wav"

# Features
source_wav, _ = torchaudio.load(source_wav_fn)
source_wav = source_wav.to(device)
target_wav, _ = torchaudio.load(target_wav_fn)
target_wav = target_wav.to(device)
with torch.inference_mode():
    source_feats, _ = wavlm.extract_features(source_wav, output_layer=6)
    target_feats, _ = wavlm.extract_features(target_wav, output_layer=6)
source_feats = source_feats.squeeze()
target_feats = target_feats.squeeze()
# print("source_feats shape", source_feats.shape)
# print("target_feats shape", target_feats.shape)

# Matching: for every source frame, take the k_top target frames with the
# smallest cosine distance and average them; the result is the regression
# target paired row-by-row with `source_feats`.
dists = fast_cosine_dist(source_feats, target_feats, device=device)
best = dists.topk(k=k_top, largest=False, dim=-1)
linear_target = target_feats[best.indices].mean(dim=1)

# # Lasso regression
# linear = celer.Lasso(alpha=0.3, fit_intercept=False).fit(
#     source_feats.cpu(), linear_target.cpu()
# )
# W = linear.coef_.T

# Reduced-rank ridge regression
rank = 100
linear = ReducedRankRidge(alpha=1.0, fit_intercept=False, rank=rank).fit(
    source_feats.cpu(), linear_target.cpu(),
)
W = (linear.Vr @ linear.Vrt_times_beta).T

# Matrix properties
tol = 1e-4
# The rank needs an SVD of W, so compute it once and reuse it.
W_rank = linalg.matrix_rank(W, tol=tol)
n_zero = int(np.count_nonzero(np.abs(W) < tol))
n_nonzero = int(np.count_nonzero(np.abs(W) > tol))
print("Matrix rank:", W_rank)
print(
    "No. parameters for low rank: {:,d}".format(
        # A rank-r factorisation of an (m x n) matrix stores r * (m + n)
        # values; this equals the previous r * m * 2 when W is square.
        W_rank * (W.shape[0] + W.shape[1])
    )
)
# Normalise by the true number of entries so the figure is also correct for a
# non-square W and agrees with the total reported on the next line.
print("Proportion zero: {:.2f}%".format(n_zero / W.size * 100))
print(
    "No. non-zero elements: {:,d} out of {:,d}".format(n_nonzero, W.size)
)

W = torch.from_numpy(W).float().to(device)

Matrix rank: 100
No. parameters for low rank: 204,800
Proportion zero: 0.36%
No. non-zero elements: 1,044,824 out of 1,048,576


In [19]:
# Source speaker's side of the parallel pair (rate fixed at 16000 Hz;
# assumes the wav files are 16 kHz -- TODO confirm)
display.Audio(data=source_wav.squeeze().cpu(), rate=16000)

In [20]:
# Target speaker's side of the parallel pair (rate fixed at 16000 Hz;
# assumes the wav files are 16 kHz -- TODO confirm)
display.Audio(data=target_wav.squeeze().cpu(), rate=16000)

In [21]:
# Utterance from the source speaker to convert; a different utterance number
# from the one used to fit W above.
# wav_fn = wav_dir / source / f"{source}_051.wav"
wav_fn = wav_dir / source / f"{source}_057.wav"
wav = torchaudio.load(wav_fn)[0].to(device)
# wav = F.vad(wav, 16000)
display.Audio(data=wav.squeeze().cpu(), rate=16000)

In [22]:
# Layer-6 WavLM features of the utterance to convert
with torch.inference_mode():
    feats = wavlm.extract_features(wav, output_layer=6)[0]

# Map the source features towards the target speaker with the fitted matrix
source_to_target_feats = torch.matmul(feats, W)

# Vocode the projected features back to a waveform
with torch.inference_mode():
    wav_hat = hifigan(source_to_target_feats).squeeze(0)

In [23]:
# Converted (source -> target) audio produced by the vocoder
display.Audio(data=wav_hat.squeeze().cpu(), rate=16000)

In [195]:
# NOTE(review): this cell repeats the playback of `wav_hat` from the previous
# cell and carries a stale execution count; it looks like a leftover.
display.Audio(data=wav_hat.squeeze().cpu(), rate=16000)

## LinearVC using parallel utterances (dataset)

Only a single parallel utterance pair is used.

In [4]:
# Configuration for the dataset-level LinearVC experiment

# Matching: number of nearest target frames averaged per source frame
k_top = 1

# Utterance number shared by all speakers and used as the parallel pair
parallel_utt = "008"
# parallel_utt = "023"

# Tag naming the output sub-directory of this run
exp_tag = "2024-11-07_rrr"
# exp_tag = "2024-09-16"

# Inputs and output location
wav_dir = Path("/home/kamperh/scratch/vctk/wav")
eval_csv = Path("data/speakersim_vctk_english.csv")
output_dir = Path(f"/home/kamperh/scratch/linearvc/vctk/{exp_tag}")
output_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Projection matrices
#
# One matrix per ordered (source, target) speaker pair, each fitted on the
# single parallel utterance `parallel_utt`; stored under the key
# "<source>-<target>".

# WavLM features of the parallel utterance, extracted once per speaker. They
# do not depend on which speaker is on the other side of the pair, so
# extracting them inside the pair loop would repeat the same forward pass for
# every partner speaker.
parallel_feats = {}
for speaker in tqdm(sorted(speakers), desc="Features"):
    speaker_wav_fn = wav_dir / speaker / f"{speaker}_{parallel_utt}.wav"
    speaker_wav, _ = torchaudio.load(speaker_wav_fn)
    with torch.inference_mode():
        speaker_feats, _ = wavlm.extract_features(
            speaker_wav.to(device), output_layer=6
        )
    parallel_feats[speaker] = speaker_feats.squeeze()

projmats = {}
for source in tqdm(sorted(speakers)):
    source_feats = parallel_feats[source]
    for target in tqdm(sorted(speakers), leave=False):
        if source == target:
            continue
        target_feats = parallel_feats[target]

        # Matching without DTW: for every source frame, average the k_top
        # target frames with the smallest cosine distance
        dists = fast_cosine_dist(source_feats, target_feats, device=device)
        best = dists.topk(k=k_top, largest=False, dim=-1)
        linear_target = target_feats[best.indices].mean(dim=1)

        # # Matching with DTW
        # source_feats_np = source_feats.cpu().numpy()
        # target_feats_np = target_feats.cpu().numpy()
        # s = np.ascontiguousarray(np.float64(source_feats_np))
        # t = np.ascontiguousarray(np.float64(target_feats_np))
        # path, _ = _dtw.multivariate_dtw(s, t, "cosine")
        # path.reverse()
        # source_path, target_path = zip(*path)
        # i_frame = 0
        # linear_target_idx = []
        # for i_source, i_target in path:
        #     if i_source == i_frame:
        #         linear_target_idx.append(i_target)
        #         i_frame += 1
        # linear_target = target_feats_np[linear_target_idx, :]
        # linear_target = torch.from_numpy(linear_target).float()

        # Unregularised
        # W, _, _, _ = linalg.lstsq(source_feats.cpu(), linear_target.cpu())

        # Ridge
        # linear = Ridge(alpha=5e3, fit_intercept=False).fit(
        #     source_feats.cpu(), linear_target.cpu()
        # )
        # W = linear.coef_.T

        # Lasso
        linear = celer.Lasso(alpha=0.3, fit_intercept=False).fit(
            source_feats.cpu(), linear_target.cpu()
        )
        W = linear.coef_.T

        # # Reduced-rank ridge
        # rank = 10
        # linear = ReducedRankRidge(alpha=1.0, fit_intercept=False, rank=rank).fit(
        #     source_feats.cpu(), linear_target.cpu(),
        # )
        # W = linear.Vr @ linear.Vrt_times_beta
        # W = W.T

        W = torch.from_numpy(W).float().to(device)
        projmats[f"{source}-{target}"] = W

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  return F.conv1d(input, weight, bias, self.stride,


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

In [25]:
# Persist every projection matrix in a single file inside the run directory
projmats_fn = output_dir / "projmats.pt"
print("Writing:", projmats_fn)
torch.save(projmats, projmats_fn)

# # To reload previously saved matrices instead of refitting:
# # print("Reading:", output_dir / "projmats.pt")
# # projmats = torch.load(output_dir / "projmats.pt")

Writing: /home/kamperh/scratch/linearvc/vctk/2024-11-07_rrr/projmats.pt


In [7]:
# Convert every selected evaluation utterance with the fitted projection
# matrices and write the vocoded result under `output_dir`.
output_dir.mkdir(parents=True, exist_ok=True)
print("Writing to:", output_dir)
with open(eval_csv) as f:
    for line in tqdm(f.readlines()):
        line = line.strip()

        # Only rows whose last character is "0" are converted (presumably a
        # label column -- verify against the CSV). `endswith` is used instead
        # of `line[-1]` so that blank lines are skipped rather than raising an
        # IndexError.
        if not line.endswith("0"):
            continue
        (source, target, source_key, _, _) = line.split(",")

        # Source features
        source_wav_fn = (
            wav_dir / source / Path(source_key).stem
        ).with_suffix(".wav")
        source_wav, _ = torchaudio.load(source_wav_fn)
        source_wav = source_wav.to(device)
        with torch.inference_mode():
            source_feats, _ = wavlm.extract_features(
                source_wav, output_layer=6
            )

        # Linear projection towards the target speaker
        W_source_to_target = projmats[f"{source}-{target}"]
        source_to_target_feats = source_feats @ W_source_to_target

        # Vocode
        with torch.inference_mode():
            wav_hat = hifigan(source_to_target_feats).squeeze(0)

        # `source_key` has the form "<dir>/<file>"; mirror that layout in the
        # output directory
        key_dir, key_fn = source_key.split("/")[:2]
        cur_output_dir = output_dir / key_dir
        cur_output_dir.mkdir(parents=True, exist_ok=True)
        output_fn = (cur_output_dir / key_fn).with_suffix(".wav")
        torchaudio.save(output_fn, wav_hat.squeeze().cpu()[None], 16000)

        # print(output_fn)
        # assert False

Writing to: /home/kamperh/scratch/linearvc/vctk/2024-11-07_rrr


  0%|          | 0/9301 [00:00<?, ?it/s]

In [8]:
class Arguments:
    """Bare attribute container standing in for parsed command-line arguments."""


# Settings that the evaluation scripts take on the command line
args = Arguments()
args.groundtruth_dir = wav_dir
args.converted_dir = output_dir
args.eval_csv = eval_csv
args.format = "vctk"

similarity_cmd = (
    f"./speaker_similarity.py --format {args.format}"
    f" {args.eval_csv} {args.converted_dir} {args.groundtruth_dir}"
)
intelligibility_cmd = (
    f"./intelligibility.py --format {args.format} {args.converted_dir}"
    f" /home/kamperh/endgame/datasets/VCTK-Corpus/txt/"
)

# The evaluation itself is run outside the notebook; print the commands
print("Run:")
print(similarity_cmd)
print(intelligibility_cmd)

# speaker_similarity(args)

Run:
./speaker_similarity.py --format vctk data/speakersim_vctk_english.csv /home/kamperh/scratch/linearvc/vctk/2024-11-07_rrr /home/kamperh/scratch/vctk/wav
./intelligibility.py --format vctk /home/kamperh/scratch/linearvc/vctk/2024-11-07_rrr /home/kamperh/endgame/datasets/VCTK-Corpus/txt/


Results with lasso (alpha=0.3):

               eer
    mean  0.314409
    std   0.083072

    WER: 7.58% +- 0.22%
    CER: 6.93% +- 0.12%

## kNN-VC using single utterance as reference (dataset)

The single reference utterance used here for each target speaker is that speaker's utterance from the parallel pair used for LinearVC above.

In [6]:
# Configuration for the kNN-VC baseline

# Matching: number of nearest reference frames averaged per source frame
k_top = 4

# Utterance number used as the single reference utterance of each target
parallel_utt = "008"
# parallel_utt = "023"

# Tag naming the output sub-directory of this run
exp_tag = "2024-09-16"

# Inputs and output location
wav_dir = Path("/home/kamperh/scratch/vctk/wav")
eval_csv = Path("data/speakersim_vctk_english.csv")
output_dir = Path(f"/home/kamperh/scratch/knnvc/vctk/{exp_tag}")

In [7]:
# kNN-VC: replace every source frame by the mean of its k_top nearest frames
# in the target speaker's single reference utterance, then vocode.
output_dir.mkdir(parents=True, exist_ok=True)
print("Writing to:", output_dir)

# Reference features per target speaker. The reference utterance depends only
# on the target, so it is encoded once instead of once per evaluation row.
target_feats_cache = {}

with open(eval_csv) as f:
    for line in tqdm(f.readlines()):
        line = line.strip()

        # Only rows whose last character is "0" are converted (presumably a
        # label column -- verify against the CSV). `endswith` is used instead
        # of `line[-1]` so that blank lines are skipped rather than raising an
        # IndexError.
        if not line.endswith("0"):
            continue
        (source, target, source_key, _, _) = line.split(",")

        # Source features
        source_wav_fn = (
            wav_dir / source / Path(source_key).stem
        ).with_suffix(".wav")
        source_wav, _ = torchaudio.load(source_wav_fn)
        source_wav = source_wav.to(device)
        with torch.inference_mode():
            source_feats, _ = wavlm.extract_features(source_wav, output_layer=6)
        source_feats = source_feats.squeeze()

        # Target reference features (computed on first use)
        if target not in target_feats_cache:
            target_wav_fn = (
                wav_dir / target / f"{target}_{parallel_utt}"
            ).with_suffix(".wav")
            target_wav, _ = torchaudio.load(target_wav_fn)
            target_wav = target_wav.to(device)
            with torch.inference_mode():
                target_feats, _ = wavlm.extract_features(
                    target_wav, output_layer=6
                )
            target_feats_cache[target] = target_feats.squeeze()
        target_feats = target_feats_cache[target]

        # Matching
        dists = fast_cosine_dist(source_feats, target_feats, device=device)
        best = dists.topk(k=k_top, largest=False, dim=-1)
        source_to_target_feats = target_feats[best.indices].mean(dim=1)[None]

        # Vocode
        with torch.inference_mode():
            wav_hat = hifigan(source_to_target_feats).squeeze(0)

        # `source_key` has the form "<dir>/<file>"; mirror that layout in the
        # output directory
        key_dir, key_fn = source_key.split("/")[:2]
        cur_output_dir = output_dir / key_dir
        cur_output_dir.mkdir(parents=True, exist_ok=True)
        output_fn = (cur_output_dir / key_fn).with_suffix(".wav")
        torchaudio.save(output_fn, wav_hat.squeeze().cpu()[None], 16000)

        # print(output_fn)
        # assert False

Writing to: /home/kamperh/scratch/knnvc/vctk/2024-09-16


  0%|          | 0/9301 [00:00<?, ?it/s]

  return F.conv1d(input, weight, bias, self.stride,


In [9]:
class Arguments:
    """Bare attribute container standing in for parsed command-line arguments."""


# Settings that the evaluation scripts take on the command line
args = Arguments()
args.groundtruth_dir = wav_dir
args.converted_dir = output_dir
args.eval_csv = eval_csv
args.format = "vctk"

similarity_cmd = (
    f"./speaker_similarity.py --format {args.format}"
    f" {args.eval_csv} {args.converted_dir} {args.groundtruth_dir}"
)
intelligibility_cmd = (
    f"./intelligibility.py --format {args.format} {args.converted_dir}"
    f" /home/kamperh/endgame/datasets/VCTK-Corpus/txt/"
)

# The evaluation itself is run outside the notebook; print the commands
print("Run:")
print(similarity_cmd)
print(intelligibility_cmd)

# speaker_similarity(args)

Run:
./speaker_similarity.py --format vctk data/speakersim_vctk_english.csv /home/kamperh/scratch/knnvc/vctk/2024-09-16 /home/kamperh/scratch/vctk/wav
./intelligibility.py --format vctk /home/kamperh/scratch/knnvc/vctk/2024-09-16 /home/kamperh/endgame/datasets/VCTK-Corpus/txt/


Results:

               eer
    mean  0.353763
    std   0.079758

    WER: 27.37% +- 0.45%
    CER: 19.18% +- 0.29%

## LinearVC using increasingly more parallel utterances

More than one parallel utterance pair is used.

In [15]:
# Configuration for the experiment with a growing number of parallel utterances

# Matching: number of nearest target frames averaged per source frame
k_top = 1

# Tag naming the output sub-directory of this run
exp_tag = "2024-11-11_5.0"
# exp_tag = "2024-09-29"

# Inputs and output location
wav_dir = Path("/home/kamperh/scratch/vctk/wav")
eval_csv = Path("data/speakersim_vctk_english.csv")
output_dir = Path(f"/home/kamperh/scratch/linearvc/vctk/{exp_tag}")

In [16]:
# Sets of parallel utterance numbers to fit on; one experiment is run per
# entry, and its results are written to a sub-directory named after the number
# of utterances in the set. Uncomment entries to run the larger sets (note that
# the commented sets skip from 10 to 13 utterances).
incremental_utts = [
    ["008"],
    # ["002", "008"],
    # ["002", "003", "008"],
    # ["002", "003", "005", "008"],
    # ["002", "003", "005", "006", "008"],
    # ["002", "003", "005", "006", "008", "009"],
    # ["002", "003", "005", "006", "008", "009", "012"],
    # ["002", "003", "005", "006", "008", "009", "012", "013"],
    # ["002", "003", "005", "006", "008", "009", "012", "013", "014"],
    # ["002", "003", "005", "006", "008", "009", "012", "013", "014", "016"],
    # ["002", "003", "005", "006", "008", "009", "012", "013", "014", "016", "018", "020", "024"],
    # ["002", "003", "005", "006", "008", "009", "012", "013", "014", "016", "018", "020", "023", "024"],
]

In [17]:
# For each set of parallel utterances: fit one projection matrix per ordered
# speaker pair, convert the evaluation utterances, and print the commands for
# the external evaluation scripts.
for parallel_utts in incremental_utts:

    print("-"*80)
    print("Utterances:", parallel_utts)
    cur_output_dir = output_dir / f"{len(parallel_utts):02d}"

    # WavLM features of every parallel utterance, extracted once per speaker.
    # They do not depend on the speaker pairing, so extracting them inside the
    # pair loop would repeat identical forward passes. The cached tensors stay
    # on `device` until the next set of utterances replaces them.
    parallel_feats = {}
    for speaker in tqdm(sorted(speakers), desc="Features"):
        speaker_feats_list = []
        for parallel_utt in parallel_utts:
            speaker_wav_fn = wav_dir / speaker / f"{speaker}_{parallel_utt}.wav"
            speaker_wav, _ = torchaudio.load(speaker_wav_fn)
            with torch.inference_mode():
                speaker_feats, _ = wavlm.extract_features(
                    speaker_wav.to(device), output_layer=6
                )
            speaker_feats_list.append(speaker_feats.squeeze())
        parallel_feats[speaker] = speaker_feats_list

    # Projection matrices
    projmats = {}
    n_frames = []
    for source in tqdm(sorted(speakers)):
        source_feats_list = parallel_feats[source]

        # The regression input is the same for every target of this source
        combined_source_feats = np.vstack(
            [source_feats.cpu().numpy() for source_feats in source_feats_list]
        )
        # One entry per source speaker: total number of source frames
        n_frames.append(combined_source_feats.shape[0])

        for target in tqdm(sorted(speakers), leave=False):
        # for target in sorted(speakers):
            if source == target:
                continue

            combined_linear_target = []
            for source_feats, target_feats in zip(
                source_feats_list, parallel_feats[target]
            ):
                # Matching (within each parallel utterance pair)
                dists = fast_cosine_dist(source_feats, target_feats, device=device)
                best = dists.topk(k=k_top, largest=False, dim=-1)
                linear_target = target_feats[best.indices].mean(dim=1)
                combined_linear_target.append(linear_target.cpu().numpy())
            combined_linear_target = np.vstack(combined_linear_target)

            # W, _, _, _ = linalg.lstsq(combined_source_feats, combined_linear_target)

            # linear = celer.Lasso(alpha=0.3, fit_intercept=False).fit(
            linear = celer.Lasso(alpha=5.0, fit_intercept=False).fit(
                combined_source_feats, combined_linear_target
            )
            W = linear.coef_.T

            W = torch.from_numpy(W).float().to(device)
            projmats[f"{source}-{target}"] = W

    print(f"Mean no. source frames: {np.mean(n_frames):.2f}")
    # 0.02 s per frame -- assumes a 20 ms WavLM frame hop; TODO confirm
    print(f"Mean source duration:   {np.mean(n_frames)*0.02:.2f} sec")

    # Conversion
    cur_output_dir.mkdir(parents=True, exist_ok=True)
    print("Writing to:", cur_output_dir)
    with open(eval_csv) as f:
        for line in tqdm(f.readlines()):
            line = line.strip()

            # Only rows whose last character is "0" are converted (presumably
            # a label column -- verify against the CSV). `endswith` is used
            # instead of `line[-1]` so that blank lines are skipped rather
            # than raising an IndexError.
            if not line.endswith("0"):
                continue
            (source, target, source_key, _, _) = line.split(",")

            source_wav_fn = (
                wav_dir / source / Path(source_key).stem
            ).with_suffix(".wav")
            source_wav, _ = torchaudio.load(source_wav_fn)
            source_wav = source_wav.to(device)
            with torch.inference_mode():
                source_feats, _ = wavlm.extract_features(
                    source_wav, output_layer=6
                )

            W_source_to_target = projmats[f"{source}-{target}"]

            source_to_target_feats = source_feats @ W_source_to_target

            with torch.inference_mode():
                wav_hat = hifigan(source_to_target_feats).squeeze(0)

            # `source_key` has the form "<dir>/<file>"; mirror that layout in
            # the output directory
            key_dir, key_fn = source_key.split("/")[:2]
            cur_cur_output_dir = cur_output_dir / key_dir
            cur_cur_output_dir.mkdir(parents=True, exist_ok=True)
            output_fn = (cur_cur_output_dir / key_fn).with_suffix(".wav")
            torchaudio.save(output_fn, wav_hat.squeeze().cpu()[None], 16000)

            # print(output_fn)
            # assert False

    # Commands for the external evaluation scripts
    class Arguments: pass
    args = Arguments()
    args.format = "vctk"
    args.eval_csv = eval_csv
    args.converted_dir = cur_output_dir
    args.groundtruth_dir = wav_dir

    # print("Run:")
    print(
        f"./speaker_similarity.py --format {args.format}"
        f" {args.eval_csv} {args.converted_dir} {args.groundtruth_dir}"
    )
    print(
        f"./intelligibility.py --format {args.format} {args.converted_dir}"
        f" /home/kamperh/endgame/datasets/VCTK-Corpus/txt/"
    )
    # speaker_similarity(args)

    # break  # temp

print("-"*80)

--------------------------------------------------------------------------------
Utterances: ['008']


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(
  sol = celer(
  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


  0%|          | 0/31 [00:00<?, ?it/s]

  sol = celer(
Fitting data with very small alpha causes precision issues.
  sol = celer(


Mean no. source frames: 348.55
Mean source duration:   6.97 sec
Writing to: /home/kamperh/scratch/linearvc/vctk/2024-11-11_5.0/01


  0%|          | 0/9301 [00:00<?, ?it/s]

./speaker_similarity.py --format vctk data/speakersim_vctk_english.csv /home/kamperh/scratch/linearvc/vctk/2024-11-11_5.0/01 /home/kamperh/scratch/vctk/wav
./intelligibility.py --format vctk /home/kamperh/scratch/linearvc/vctk/2024-11-11_5.0/01 /home/kamperh/endgame/datasets/VCTK-Corpus/txt/
--------------------------------------------------------------------------------


With lasso ($\alpha=0.3$) regularisation:

| No. utterances | Mean frames | Mean duration | WER            | CER            | EER                  |
|----------------|-------------|---------------|----------------|----------------|----------------------|
| 1              | 348.55      | 6.97          | 7.58% +- 0.22% | 6.93% +- 0.12% | 0.314409 +- 0.083072 |
| 2              | 501.55      | 10.03         | 6.61% +- 0.21% | 6.36% +- 0.12% | 0.320430 +- 0.072171 |
| 3              | 836.55      | 16.73         | 5.56% +- 0.18% | 5.81% +- 0.09% | 0.324086 +- 0.071857 |
| 4              | 1133.84     | 22.68         | 5.22% +- 0.17% | 5.56% +- 0.08% | 0.326022 +- 0.071313 |
| 5              | 1386.35     | 27.73         | 5.04% +- 0.18% | 5.50% +- 0.09% | 0.324301 +- 0.070544 |
| 6              | 1606.68     | 32.13         | 4.99% +- 0.17% | 5.47% +- 0.09% | 0.328387 +- 0.073575 |
| 7              | 1809.00     | 36.18         | 4.88% +- 0.16% | 5.42% +- 0.08% | 0.328172 +- 0.075537 |
| 8              | 2008.06     | 40.16         | 4.82% +- 0.17% | 5.37% +- 0.08% | 0.332043 +- 0.075433 |
| 9              | 2227.74     | 44.55         | 4.72% +- 0.17% | 5.31% +- 0.08% | 0.333978 +- 0.076543 |
| 10             | 2530.55     | 50.61         | 4.78% +- 0.16% | 5.33% +- 0.08% | 0.331828 +- 0.074946 |
| 13             | 3290.74     | 65.81         | 4.85% +- 0.18% | 5.40% +- 0.10% | 0.328387 +- 0.081667 |
| 14             | 3812.06     | 76.24         | 4.84% +- 0.16% | 5.35% +- 0.08% | 0.328602 +- 0.081671 |

Without explicit regularisation (when the least-squares problem is underdetermined, the minimum-norm solution is selected):

| No. utterances | Mean frames | Mean duration | WER            | CER            | EER                  |
|----------------|-------------|---------------|----------------|----------------|----------------------|
| 1              | 348.55      | 6.97          | 24.00% +- 0.44%| 16.99% +- 0.27%| 0.341935 +- 0.083908 |
| 2              | 501.55      | 10.03         | 32.70% +- 0.54%| 32.70% +- 0.54%| 0.328172 +- 0.072719 |
| 4              | 1133.84     | 22.68         |100.24% +- 0.84%| 79.70% +- 0.82%| 0.129247 +- 0.058059 |
| 5              | 1386.35     | 27.73         | 68.17% +- 0.85%| 50.22% +- 0.79%| 0.224946 +- 0.065205 |
| 6              | 1606.68     | 32.13         | 40.75% +- 0.84%| 28.98% +- 0.75%| 0.283226 +- 0.067831 |
| 7              | 1809.00     | 36.18         | 25.94% +- 0.40%| 18.58% +- 0.27%| 0.314839 +- 0.067103 |
| 8              | 2008.06     | 40.16         | 19.49% +- 0.45%| 14.09% +- 0.31%| 0.33871  +- 0.07052  |
| 9              | 2227.74     | 44.55         | 14.54% +- 0.28%| 10.93% +- 0.18%| 0.352043 +- 0.066320 |
| 10             | 2530.55     | 50.61         | 11.61% +- 0.28%| 9.22% +- 0.19% | 0.363011 +- 0.066195 |
| 13             | 3290.74     | 65.81         | 8.18% +- 0.25% | 7.23% +- 0.13% | 0.375054 +- 0.066332 |
| 14             | 3812.06     | 76.24         | 7.36% +- 0.22% | 6.76% +- 0.12% | 0.375269 +- 0.064800 |

The very poor performance at 4 utterances occurs because this is just past
the point where the least-squares problem stops being underdetermined (just more
than 1024 frames for the 1024-dimensional features), so no regularisation is
applied. Before this point the minimum-norm solution is selected, which is, in
fact, a form of regularisation and clearly helps greatly.

With one utterance (`008`) but now varying $\alpha$:

| alpha | WER            | CER            | EER                  |
|-------|----------------|----------------|----------------------|
| 0.3   | 7.58% +- 0.22% | 6.93% +- 0.12% | 0.314409 +- 0.083072 |
| 0.5   | 6.88% +- 0.22% | 6.47% +- 0.11% | 0.292258 +- 0.084837 |
| 1.0   | 6.62% +- 0.20% | 6.33% +- 0.10% | 0.244731 +- 0.082286 |
| 2.0   | 7.46% +- 0.21% | 6.81% +- 0.12% | 0.182151 +- 0.079375 |
| 5.0   | 9.12% +- 0.24% | 7.79% +- 0.14% | 0.101505 +- 0.058009 |