In [None]:
import time
from collections import defaultdict
from pathlib import Path
from statistics import mean

import numpy as np
import pandas as pd
import pyloudnorm as pyln
import torch
from scipy.io import wavfile
from scipy.signal import freqz
from torch import Tensor
from tqdm.notebook import tqdm

from s4drc.src.dataset import SignalTrainDatasetModule
from s4drc.src.loss import FrechetAudioDistance, forge_validation_criterions_by
from s4drc.src.model import S4Model
from s4drc.src.module.db import convert_to_decibel

# Model Evaluation

This Jupyter Notebook contains routines for evaluating the trained models.

Each individual evaluation task is wrapped in a function to keep its variables out of the global namespace.
All functions save their results to the local file system.

## Preparatory Work

In [None]:
# Folder that is scanned for training runs; every sub-directory is treated as one run.
ROOT_DIR = Path('./pl-experiment-result/')

# Path of the best checkpoint file of every run, in sorted run order.
all_model_dirs = []
for run_dir in sorted(ROOT_DIR.iterdir()):
    if run_dir.is_dir():
        checkpoint_dir = run_dir / 'checkpoints'
        # best-ckpt.txt holds the file name of the checkpoint to evaluate.
        best_model_name = (checkpoint_dir / 'best-ckpt.txt').read_text().strip()
        model_dir = checkpoint_dir / best_model_name
        # is_file() is already False for a missing path, so it covers exists() too.
        assert model_dir.is_file()
        all_model_dirs.append(model_dir)

# Prefer the GPU whenever one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def get_model(ckpt_dir: Path):
    """Load an ``S4Model`` from a checkpoint and switch it to evaluation mode.

    Args:
        ckpt_dir: Path handed to ``S4Model.load_from_checkpoint``. The callers in
            this notebook pass the checkpoint *file* collected in ``all_model_dirs``.

    Returns:
        The loaded model after ``.eval()``, mapped onto the notebook-level ``device``.
    """
    loaded_model = S4Model.load_from_checkpoint(ckpt_dir, map_location=device)
    return loaded_model.eval()

def get_testing_dataloader(ckpt_dir: Path, batch_size: int, testing_segment_length: int):
    """Build the test dataloader from the data module stored in a checkpoint.

    Args:
        ckpt_dir: Path handed to ``SignalTrainDatasetModule.load_from_checkpoint``.
        batch_size: Forwarded to ``load_from_checkpoint`` as ``batch_size``.
        testing_segment_length: Forwarded to ``load_from_checkpoint`` as
            ``testing_segment_length``.

    Returns:
        Whatever ``test_dataloader()`` of the loaded data module returns.
    """
    overrides = dict(batch_size=batch_size, testing_segment_length=testing_segment_length)
    data_module = SignalTrainDatasetModule.load_from_checkpoint(
        ckpt_dir,
        map_location=device,
        **overrides,
    )
    # prepare_data() has to run before the dataloader is requested.
    data_module.prepare_data()
    return data_module.test_dataloader()

## Loss Value

All loss values are calculated on test segments of 2 ** 23 samples.

In [None]:
@torch.no_grad()
def test(segment_length: int = 2 ** 23):
    """Compute the test losses of every model and save them to ``loss.csv``.

    One dataloader (built from the first checkpoint) is shared by all models.
    For each model the function accumulates every criterion returned by
    ``forge_validation_criterions_by`` plus the absolute LUFS difference,
    averages them over the number of batches, adds the FAD score computed over
    the whole test set, and writes a one-row CSV two levels above the checkpoint.

    Args:
        segment_length: Number of samples per test segment. It is used both as
            the dataloader's ``testing_segment_length`` and as the row width of
            the FAD buffers, so the two can never disagree. The default
            ``2 ** 23`` matches Steinmetz and Reiss.
    """
    batch_size = 1
    testing_dataloader = get_testing_dataloader(all_model_dirs[0], batch_size, segment_length)
    batch_count = len(testing_dataloader)

    for model_dir in all_model_dirs:
        model = get_model(model_dir).to(device)
        validation_losses = defaultdict(float)

        validation_criterions = forge_validation_criterions_by(model.hparams['loss_filter_coef']).to(device)
        lufs_meter = pyln.Meter(SignalTrainDatasetModule.sample_rate)
        fad = FrechetAudioDistance()

        # Buffers holding every target / predicted segment: the FAD is scored once
        # over the whole test set after the loop, not per batch.
        # NOTE(review): each row takes one flattened batch, so this assumes every
        # batch holds exactly `segment_length` samples (batch_size == 1) -- confirm.
        all_y = np.empty([batch_count, segment_length])
        all_y_hat = np.empty([batch_count, segment_length])

        for i, (x, y, parameters) in tqdm(
            enumerate(testing_dataloader),
            desc=f'Testing {model_dir}.',
            total=batch_count
        ):
            x: Tensor = x.to(device)
            y: Tensor = y.to(device)
            parameters: Tensor = parameters.to(device)

            y_hat: Tensor = model(x, parameters)

            # Copy each signal to host memory once and reuse it for both the FAD
            # buffers and the loudness meter.
            y_np = y.flatten().cpu().numpy()
            y_hat_np = y_hat.flatten().cpu().numpy()
            all_y[i, :] = y_np
            all_y_hat[i, :] = y_hat_np

            for validation_loss, validation_criterion in validation_criterions.items():
                loss: Tensor = validation_criterion(y_hat.unsqueeze(1), y.unsqueeze(1))
                validation_losses[validation_loss] += loss.item()

            validation_losses['LUFS'] += np.abs(
                lufs_meter.integrated_loudness(y_np) -
                lufs_meter.integrated_loudness(y_hat_np)
            )

        # Turn the accumulated sums into per-batch means.
        for k, v in list(validation_losses.items()):
            validation_losses[k] = v / batch_count

        # FAD is computed over the full set, so it is added after averaging.
        validation_losses['FAD'] = fad.score(all_y, all_y_hat)

        # One-row table: each loss name becomes a column.
        loss_table = {
            k: [v] for k, v in validation_losses.items()
        }

        pd.DataFrame(loss_table).to_csv(model_dir.parent.parent / 'loss.csv')

test()

## Evaluate Direct Inference Efficiency (CUDA and CPU)

In [None]:
@torch.no_grad()
def evaluate_inference_efficiency():
    """Measure the real-time ratio of every model and save it to ``efficiency.csv``.

    For each model, each available device (CPU, plus CUDA when present) and each
    input length ``2 ** 5 .. 2 ** 19``, the model is run 20 times on random input.
    The real-time ratio is the audio duration of the input divided by the mean
    inference time; values above 1 mean faster than real time. The CSV has one
    row per device and one column per input length.

    Raises:
        NotImplementedError: If a device other than CPU or CUDA is encountered.
    """
    # Both lists are identical for every model, so build them once.
    sample_lengths: list[int] = [2 ** i for i in range(5, 20)]

    local_devices = [torch.device('cpu')]
    if torch.cuda.is_available():
        local_devices.append(torch.device('cuda'))

    for model_dir in all_model_dirs:
        print(f'Calculating {model_dir} model inference efficiency.')
        model = get_model(model_dir)

        real_time_ratio_dict = defaultdict(list)
        for local_device in local_devices:
            on_cuda = local_device.type == 'cuda'
            if local_device.type == 'cpu':
                device_name = local_device.type
                print(f'Doing inference speed test on {device_name.upper()}...')
            elif on_cuda:
                device_name = torch.cuda.get_device_name()
                print(f'Doing inference speed test on {device_name}.')
            else:
                raise NotImplementedError('Inference efficiency test can only run on CPU/CUDA')

            model = model.to(local_device)

            real_time_ratio_dict['device-name'].append(device_name)

            for dataset_sample_length in sample_lengths:
                # Duration of the input audio in nanoseconds.
                dataset_sample_time_ns = dataset_sample_length * 1e9 / SignalTrainDatasetModule.sample_rate

                inference_time_ns: list[int] = []
                for _ in range(20):
                    x = torch.rand(1, dataset_sample_length).to(local_device, torch.float32)
                    cond = torch.tensor([[1, 65]]).to(local_device, torch.float32)

                    # CUDA kernels are launched asynchronously. Without a
                    # synchronisation on both sides of the call the timer would
                    # only capture launch overhead, not the actual computation.
                    if on_cuda:
                        torch.cuda.synchronize()
                    tic = time.perf_counter_ns()
                    model(x, cond)
                    if on_cuda:
                        torch.cuda.synchronize()
                    toc = time.perf_counter_ns()
                    inference_time_ns.append(toc - tic)

                inference_time_ns_mean = mean(inference_time_ns)
                real_time_ratio = dataset_sample_time_ns / inference_time_ns_mean
                real_time_ratio_dict[str(dataset_sample_length)].append(real_time_ratio)

        pd.DataFrame(real_time_ratio_dict).to_csv(model_dir.parent.parent / 'efficiency.csv')


evaluate_inference_efficiency()

## Generate Output Audio (CUDA)

In [None]:
@torch.no_grad()
def evaluate_output_audio():
    """Render the test set through every model and save the audio as WAV files.

    A single dataloader (batch size 16, segments of ``10 * sample_rate`` samples)
    is built from the first checkpoint and reused for all models. For each test
    item four files are written to ``output-audio-10s`` two levels above the
    checkpoint: the input (``-x``), the target (``-y``), the model output
    (``-y-hat``) and the residual target minus output (``-y-diff``).
    """
    testing_dataloader = get_testing_dataloader(all_model_dirs[0], 16, 10 * SignalTrainDatasetModule.sample_rate)
    sample_rate = SignalTrainDatasetModule.sample_rate

    for model_dir in all_model_dirs:
        print(f'Generating {model_dir} model output audio.')
        model = get_model(model_dir).to(device)

        output_audio_dir = model_dir.parent.parent / 'output-audio-10s'
        output_audio_dir.mkdir(exist_ok=True)

        # Running index over all items of all batches; used as the file-name prefix.
        clip_index = 0
        for x, y, cond in tqdm(testing_dataloader, desc='Generating output audio', total=len(testing_dataloader)):
            x: Tensor = x.to(device)
            y: Tensor = y.to(device)
            cond: Tensor = cond.to(device)

            # The model must be driven by the input signal x (as in test()),
            # not by the target y it is supposed to reproduce.
            y_hat: Tensor = model(x, cond)

            for i in range(y_hat.size(0)):
                switch, peak_reduction = cond[i, :].flatten().cpu().tolist()
                prefix = f'{str(clip_index).zfill(3)}-switch={switch}-peak-reduction={peak_reduction}'

                x_audio = x[i, :].flatten().cpu().numpy()
                y_audio = y[i, :].flatten().cpu().numpy()
                y_hat_audio = y_hat[i, :].flatten().cpu().numpy()
                # Residual between the target and the model output.
                y_diff_audio = y_audio - y_hat_audio

                wavfile.write(output_audio_dir / f'{prefix}-x.wav', sample_rate, x_audio)
                wavfile.write(output_audio_dir / f'{prefix}-y.wav', sample_rate, y_audio)
                wavfile.write(output_audio_dir / f'{prefix}-y-hat.wav', sample_rate, y_hat_audio)
                wavfile.write(output_audio_dir / f'{prefix}-y-diff.wav', sample_rate, y_diff_audio)

                clip_index += 1

evaluate_output_audio()

Real-time