In [1]:
from IPython.display import Audio
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
from pathlib import Path
from functools import partial, reduce
from itertools import chain, product, tee, starmap
from pysptk.synthesis import AllPoleDF, Synthesizer
import pysptk
from pyloudnorm import Meter
import pyloudnorm as pyln
from librosa.util import frame

# Seed NumPy's legacy global RNG so that any later `np.random` draw is
# repeatable after a kernel restart.
np.random.seed(114514)
# from diffsptk import Frame, LPC, AllPoleDigitalFilter, ExcitationGeneration

In [9]:
# Utterance numbers used in the test, dealt alternately to the two speakers.
test_numbers = [3, 4, 5, 6, 7, 8, 9, 11, 19, 24]
test_male = "p360"
test_female = "p361"
male_numbers = test_numbers[::2]  # positions 0, 2, 4, ...
female_numbers = test_numbers[1::2]  # positions 1, 3, 5, ...

# Flat list of (subject, utterance number) pairs: every `test_male` item
# first, followed by every `test_female` item.
paired_subject_numbers = [(test_male, num) for num in male_numbers] + [
    (test_female, num) for num in female_numbers
]

sr = 24000  # sample rate assumed for every file read or written in this notebook
target_loudness = -26.0  # target integrated loudness handed to pyloudnorm
root_dir = "/home/ycy/data-disk/Datasets/VCTK-Corpus-0.92-raw/24k-mic1/"
out_dir = "interspeech_listening_test/"
# The cells below write straight into `out_dir`; create it up front so a
# fresh checkout survives Restart & Run All instead of failing on the first
# `sf.write`.
Path(out_dir).mkdir(parents=True, exist_ok=True)

meter = Meter(sr)


def normaliser(x):
    """Return `x` loudness-normalised to `target_loudness`.

    The integrated loudness of `x` is measured with the module-level `meter`
    and the result of `pyln.normalize.loudness` is returned unchanged.
    """
    return pyln.normalize.loudness(x, meter.integrated_loudness(x), target_loudness)


hop_length = sr // 100  # 240 samples, i.e. 10 ms at 24 kHz
frame_length = 1024  # analysis frame size in samples
lpc_order = 22

## Get normalised ground truth

In [10]:
gt_dir = "/home/ycy/data-disk/Datasets/VCTK-Corpus-0.92-raw/24k-mic1/"

# One array per (subject, utterance) pair, read from
# `<gt_dir>/<subject>/<subject>_<num:03d>_mic1.pv`; only every second entry
# along the first axis is kept.
# NOTE(review): presumably the .pv tracks are sampled at twice the frame rate
# used for analysis here, hence the decimation - confirm against the files.
test_f0s = [
    np.loadtxt(Path(gt_dir) / subject / f"{subject}_{num:03d}_mic1.pv")[::2]
    for subject, num in paired_subject_numbers
]

In [11]:
def num2out_filename(method, subject, num):
    """Return the output path `<out_dir>/<method>_<subject>_<num:03d>.wav`."""
    return Path(out_dir) / f"{method}_{subject}_{num:03d}.wav"


# Load every ground-truth recording, checking its sample rate on the way in:
# the loudness meter and `sf.write` below both assume `sr`, so a file at any
# other rate would be mis-measured and written out at the wrong speed.
gt_audio = []
for subject, num in paired_subject_numbers:
    wav_path = Path(root_dir) / subject / f"{subject}_{num:03d}_mic1.wav"
    samples, file_sr = sf.read(wav_path)
    assert file_sr == sr, (file_sr, sr, wav_path)
    gt_audio.append(samples)

# A list rather than a lazy `map`: a one-shot iterator would be left exhausted
# in the kernel after the loop below, so any later cell touching
# `normalised_audio` would silently see nothing.
normalised_audio = [normaliser(audio) for audio in gt_audio]

for (subject, num), audio in zip(paired_subject_numbers, normalised_audio):
    out_filename = num2out_filename("gt", subject, num)
    # Refuse to write anything whose samples fall outside [-1, 1].
    assert np.max(np.abs(audio)) <= 1.0, (np.max(audio), np.min(audio), out_filename)
    sf.write(out_filename, audio, sr)

## SPTK LPC baseline (low anchor)

In [12]:
# Window multiplied onto every analysis frame before LPC estimation.
window = pysptk.blackman(frame_length)

# All-pole synthesis filter of the same order as the LPC analysis, wrapped in
# a Synthesizer that is given `hop_length` as its hop size.
allpole_filter = AllPoleDF(order=lpc_order)
synthesizer = Synthesizer(allpole_filter, hop_length)

def lpc_analysis(audio):
    """Run frame-wise LPC analysis on `audio`.

    The signal is reflect-padded by half a frame on each side, cut into
    frames of `frame_length` samples taken every `hop_length` samples,
    multiplied by `window`, and passed to `pysptk.lpc` with order `lpc_order`.

    Returns the array produced by `pysptk.lpc` with column 0 replaced by its
    natural logarithm.
    NOTE(review): assumes `audio` is a mono 1-D array and that column 0 is the
    gain term, stored as log-gain for the synthesis filter - confirm.
    """
    half_frame = frame_length // 2
    padded = np.pad(audio, (half_frame, half_frame), mode="reflect")
    # librosa's `frame` output is transposed so the window multiplies along
    # the last axis.
    frames = frame(padded, frame_length=frame_length, hop_length=hop_length).T
    coefs = pysptk.lpc(frames * window, lpc_order)
    coefs[:, 0] = np.log(coefs[:, 0])
    return coefs

def lpc_synth(pitch, lpc):
    """Synthesise a waveform from a pitch track and per-frame LPC coefficients.

    Parameters
    ----------
    pitch : 1-D array
        One pitch value per frame, as accepted by `pysptk.excite`.
    lpc : 2-D array
        One row of filter coefficients per frame, as returned by
        `lpc_analysis`.

    Returns
    -------
    The output of `Synthesizer.synthesis` for the frames that `pitch` and
    `lpc` have in common.
    """
    # The pitch track and the LPC frames come from different sources and need
    # not have the same number of frames. Trim both to the shared count so
    # the excitation length always matches the coefficient frames that drive
    # the filter.
    n_frames = min(len(pitch), len(lpc))
    ex = pysptk.excite(pitch[:n_frames], hop_length, gaussian=True)
    # Build a fresh filter for every utterance: AllPoleDF keeps its delay line
    # between calls, so a shared instance would start each utterance from the
    # previous one's state and make results depend on call order.
    utterance_synth = Synthesizer(AllPoleDF(order=lpc_order), hop_length)
    return utterance_synth.synthesis(ex, lpc[:n_frames])

In [13]:
def f02pitch(f0):
    """Map an F0 array to `sr / f0`, with 0 wherever `f0` is not positive.

    NOTE(review): presumably F0 is in Hz, which makes the result a pitch
    period in samples - confirm.
    """
    # Clamp the divisor to at least 1 so non-positive entries never cause a
    # division by zero; those entries are masked to 0 below.
    period = sr / np.maximum(f0, 1)
    return np.where(f0 > 0, period, 0)
# `zip` stops at the shortest input, so check up front that there is exactly
# one F0 track and one recording per test item.
assert len(test_f0s) == len(gt_audio) == len(paired_subject_numbers), (
    len(test_f0s),
    len(gt_audio),
    len(paired_subject_numbers),
)

# LPC analysis/resynthesis of every ground-truth recording, driven by its F0
# track and loudness-normalised. Built as a list rather than a lazy `starmap`:
# a one-shot iterator would be left exhausted in the kernel after the write
# loop, so later cells could not inspect or replay the results.
lpc_recon_normalised = [
    normaliser(lpc_synth(f02pitch(f0), lpc_analysis(audio)))
    for f0, audio in zip(test_f0s, gt_audio)
]

for (subject, num), audio in zip(paired_subject_numbers, lpc_recon_normalised):
    out_filename = num2out_filename("pysptk", subject, num)
    # Refuse to write anything whose samples fall outside [-1, 1].
    assert np.max(np.abs(audio)) <= 1.0, (np.max(audio), np.min(audio), out_filename)
    sf.write(out_filename, audio, sr)

## Trained models

In [14]:
# Names of the systems to collect; each is used both as a sub-directory of
# `recon_root_dir` and as the method prefix of the output file name.
methods = ["golf", "world", "nhv"]
# Root directory holding the reconstructions, laid out as
# `<recon_root_dir>/<method>/<subject>/<subject>_<num:03d>_mic1.wav`.
recon_root_dir = "/home/ycy/data-disk/vctk-ae-pred/"

In [15]:
# Every (method, subject, utterance number) combination, flattened once so
# the load step and the write step iterate over exactly the same sequence.
recon_items = [
    (method, subject, num)
    for method, (subject, num) in product(methods, paired_subject_numbers)
]

# Load each reconstruction, checking its sample rate on the way in: the
# loudness meter and `sf.write` below both assume `sr`, so a file at any other
# rate would be mis-measured and written out at the wrong speed.
recon_audio = []
for method, subject, num in recon_items:
    wav_path = Path(recon_root_dir) / method / subject / f"{subject}_{num:03d}_mic1.wav"
    samples, file_sr = sf.read(wav_path)
    assert file_sr == sr, (file_sr, sr, wav_path)
    recon_audio.append(samples)

# Lists rather than lazy iterators: a one-shot `map`/`starmap` would be left
# exhausted in the kernel after the loop below.
normalised_recon_audio = [normaliser(audio) for audio in recon_audio]

for (method, subject, num), audio in zip(recon_items, normalised_recon_audio):
    out_filename = num2out_filename(method, subject, num)
    # Refuse to write anything whose samples fall outside [-1, 1].
    assert np.max(np.abs(audio)) <= 1.0, (np.max(audio), np.min(audio), out_filename)
    sf.write(out_filename, audio, sr)