In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.stats import norm

In [None]:
from src.datamodules.components.audio_dataset import AudioDataset

In [None]:
# Relative path of the preprocessed cello feature file that backs the dataset.
features_path = '../data/preprocessed/cello/features.pth'
dataset = AudioDataset(features_path)

In [None]:
# Take element 0 of every item's 'loudness' entry, join them into one tensor
# and hand the result to NumPy for plotting.
per_item_loudness = [item['loudness'][0] for item in dataset.features]
loudness = torch.cat(per_item_loudness).numpy()

In [None]:
# Histogram of all loudness values with vertical markers for the extremes,
# the mean and the +/-1 sigma and +/-2 sigma positions.
plt.rcParams['figure.figsize'] = [16, 8]

n, bins, patches = plt.hist(loudness, 128)
plt.title("Loudness Histogram")
plt.xlabel("Db")
plt.ylabel("Frequency")

# Summary statistics. `mean` and `std` stay at module level because the
# CDF-normalised histogram cell further down reads them.
l_min, l_max = loudness.min(), loudness.max()
mean, std = loudness.mean(), loudness.std()
start, end = mean - std, mean + std

plt.xticks([-70.0, -65.0, -60.0, -30.0, mean, l_min, l_max, start, end, start-std, end+std])
plt.grid(axis='x')

# (position, legend name, colour, extra axvline keyword arguments)
marker_table = [
    (l_min, 'min', 'k', {}),
    (l_max, 'max', 'k', {}),
    (mean, 'mean', 'k', {'linestyle': 'dashed'}),
    (start, '-sigma', 'g', {'linestyle': 'dashed'}),
    (end, '+sigma', 'g', {'linestyle': 'dashed'}),
    (start-std, '-2sigma', 'y', {'linestyle': 'dashed'}),
    (end+std, '+2sigma', 'y', {'linestyle': 'dashed'}),
]
for marker_x, marker_name, marker_color, marker_kwargs in marker_table:
    plt.axvline(
        x=marker_x,
        linewidth=2,
        label=f'{marker_name}={marker_x:.2f}',
        color=marker_color,
        **marker_kwargs,
    )

plt.legend(loc='upper left')
plt.show()

In [None]:


# Environment setup: run this in a terminal *before* starting the kernel.
# As a bare line inside a Python cell it is a SyntaxError, and torch has
# already been imported by the time this cell runs.
#   pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu102

# Standardise loudness with the mean/std computed in the histogram cell above
# and push it through the standard-normal CDF; the result lies in [0, 1].
loudness_cdf = norm.cdf((loudness - mean) / std)

n, bins, patches = plt.hist(loudness_cdf, 128)
plt.title("Loudness Histogram (normal-CDF normalized)")
# The x axis is a probability in [0, 1], not a level in dB.
plt.xlabel("Normal CDF of standardized loudness")
plt.ylabel("Frequency")

In [None]:
# Take element 0 of every item's 'f0' entry and join them into one tensor.
per_item_f0 = [item['f0'][0] for item in dataset.features]
f0 = torch.cat(per_item_f0)

In [None]:
def bins_to_cents(bins):
    """Map pitch-bin indices to cents.

    The mapping is affine: every bin is 20 cents wide and bin 0 sits at
    1997.3794084376191 cents. No dithering or noise is applied.
    NOTE(review): the constants look like the CREPE pitch-bin layout; confirm.

    Args:
        bins: a number or tensor/array of bin indices.

    Returns:
        The corresponding value(s) in cents, same container type as ``bins``.
    """
    cents_per_bin = 20
    cents_at_bin_zero = 1997.3794084376191
    return cents_per_bin * bins + cents_at_bin_zero

def cents_to_frequency(cents):
    """Convert cents to a frequency in Hz.

    Cents are measured relative to 10 Hz: 0 cents is 10 Hz and every
    additional 1200 cents doubles the frequency.

    Args:
        cents: a number or tensor/array of cent values.

    Returns:
        The corresponding frequency value(s) in Hz.
    """
    octaves_above_reference = cents / 1200
    return 10 * 2 ** octaves_above_reference

def freqs_to_cents(freq):
    """Convert frequencies in Hz to cents relative to 10 Hz.

    Inverse of ``cents_to_frequency``. Uses ``torch.log2``, so ``freq`` must
    be a tensor.

    Args:
        freq: tensor of frequencies in Hz.

    Returns:
        Tensor of the same shape holding ``1200 * log2(freq / 10)``.
    """
    ratio_to_reference = freq / 10.
    return 1200 * torch.log2(ratio_to_reference)

def cents_to_bins(cents):
    """Map cents to (fractional) pitch-bin indices.

    Inverse of ``bins_to_cents``: subtracts the 1997.3794084376191-cent
    offset of bin 0 and divides by the 20-cent bin width. The result is not
    rounded.

    Args:
        cents: a number or tensor/array of cent values.

    Returns:
        The corresponding bin position(s), same container type as ``cents``.
    """
    cents_above_bin_zero = cents - 1997.3794084376191
    return cents_above_bin_zero / 20

In [None]:
# Hz -> cents -> fractional pitch bin, then divide by 359.
# NOTE(review): 359 is presumably the highest bin index, which would put the
# values in roughly [0, 1]; confirm.
# NOTE(review): not idempotent; re-running this cell transforms f0 a second time.
f0_cents = freqs_to_cents(f0)
f0 = cents_to_bins(f0_cents) / 359

In [None]:
# Convert the normalised f0 tensor to a NumPy array for the histogram below.
# NOTE(review): not idempotent; a second run fails because a NumPy array has
# no `.numpy()` method.
f0 = f0.numpy()

In [None]:
# Histogram of the normalised f0 values with vertical markers for the
# extremes, the mean and the +/-1 sigma and +/-2 sigma positions.
plt.rcParams['figure.figsize'] = [16, 8]

n, bins, patches = plt.hist(f0, 360)
plt.title("F0 Histogram")
# f0 was converted to pitch bins and divided by 359 above, so the x axis is a
# normalised bin position. The previous "Db" label was copied from the
# loudness plot.
plt.xlabel("Normalized pitch bin (bin / 359)")
plt.ylabel("Frequency")

# Summary statistics (these rebind the module-level names used by the
# loudness histogram cell).
l_min = f0.min()
l_max = f0.max()
mean = f0.mean()
std = f0.std()
start = mean - std
end = mean + std

plt.xticks([mean, l_min, l_max, start, end, start-std, end+std])
plt.grid(axis='x')

# (position, legend name, colour, extra axvline keyword arguments)
f0_marker_table = [
    (l_min, 'min', 'k', {}),
    (l_max, 'max', 'k', {}),
    (mean, 'mean', 'k', {'linestyle': 'dashed'}),
    (start, '-sigma', 'g', {'linestyle': 'dashed'}),
    (end, '+sigma', 'g', {'linestyle': 'dashed'}),
    (start-std, '-2sigma', 'y', {'linestyle': 'dashed'}),
    (end+std, '+2sigma', 'y', {'linestyle': 'dashed'}),
]
for marker_x, marker_name, marker_color, marker_kwargs in f0_marker_table:
    plt.axvline(
        x=marker_x,
        linewidth=2,
        label=f'{marker_name}={marker_x:.2f}',
        color=marker_color,
        **marker_kwargs,
    )

plt.legend(loc='upper left')
plt.show()

In [None]:
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [None]:
import math

In [None]:
class PositionalEncoding(nn.Module):
    r"""Add fixed sinusoidal position information to a sequence of embeddings.

    The positional encodings have the same dimension as the embeddings, so
    that the two can be summed. Sine and cosine functions of different
    frequencies are used:

    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})

        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})

    where ``pos`` is the position in the sequence and ``i`` is the embed idx.

    Args:
        d_model: the embed dim (required). Both even and odd values work.
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=251).

    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=251):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns;
        # slice so the assignment shapes match instead of raising.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Insert a broadcastable batch axis: (max_len, 1, d_model).
        pe = pe.unsqueeze(1)
        # A buffer follows the module across devices and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Add the positional encoding to ``x`` and apply dropout.

        Args:
            x: the sequence fed to the positional encoder model (required).
                Its length (first axis) must not exceed ``max_len``.
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
        # Only the first `sequence length` rows are used; the singleton batch
        # axis of `pe` broadcasts over the batch.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):
    """Sinusoidal positional encoding followed by a Transformer encoder stack.

    The module has no embedding layer and no decoder: it expects inputs that
    already have ``ninp`` features per position and returns the encoder output.

    Args:
        ninp: feature dimension of the input (``d_model`` of the encoder layers).
        nhead: number of attention heads per encoder layer.
        nhid: dimension of the feed-forward network inside each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoding and the
            encoder layers.
    """

    def __init__(self, ninp=512, nhead=2, nhid=200, nlayers=2, dropout=0.1):
        super().__init__()
        self.model_type = 'Transformer'
        # Causal mask cached between forward calls; rebuilt on demand.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.ninp = ninp

    def _generate_square_subsequent_mask(self, sz):
        """Return a ``(sz, sz)`` float mask: 0 on and below the diagonal, -inf above."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def forward(self, src, has_mask=True):
        """Encode ``src``.

        Args:
            src: input whose first axis is the sequence length (the mask is
                sized from ``len(src)``).
            has_mask: when true, apply a causal mask so that a position cannot
                attend to later positions; when false, attend everywhere.

        Returns:
            The output of the Transformer encoder stack.
        """
        if has_mask:
            seq_len = len(src)
            cached = self.src_mask
            # Rebuild when nothing is cached, the length changed, or the input
            # lives on a different device than the cached mask (previously a
            # same-length input on another device reused the stale mask).
            if cached is None or cached.size(0) != seq_len or cached.device != src.device:
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)
        else:
            self.src_mask = None

        src = self.pos_encoder(src)
        return self.transformer_encoder(src, self.src_mask)

In [None]:
# Instantiate the model with its defaults
# (ninp=512, nhead=2, nhid=200, nlayers=2, dropout=0.1).
shit = TransformerModel()

In [None]:
# Random dummy input laid out as [sequence length, batch size, embed dim];
# the last axis matches the model's default ninp of 512.
seq_len, batch_size, embed_dim = 5, 10, 512
bok = torch.randn(seq_len, batch_size, embed_dim)

In [None]:
# Smoke-test a forward pass (causal mask on by default) without building an
# autograd graph.
# NOTE(review): the model was never switched to eval mode, so dropout is
# active and the output is random from run to run.
with torch.no_grad():
    out = shit(bok)

In [None]:
# Display the shape of the encoder output.
out.shape

In [None]:
import torchaudio.functional as AF
from einops import rearrange

In [None]:
from src.utils.crepe_loss import CrepeLoss

In [None]:
from torch import Tensor

In [None]:
def loudness(waveform: Tensor, sample_rate: int):
    """Measure the gated, K-weighted loudness of an audio signal.

    The signal is K-weighted with two biquad filters, split into overlapping
    0.4 s blocks (75 % overlap), and the block energies are averaged after an
    absolute gate (-70) and a relative gate (10 below the absolutely-gated
    level).
    NOTE(review): this looks like a local copy of
    ``torchaudio.functional.loudness`` (ITU-R BS.1770 style); confirm.
    NOTE(review): defining this function rebinds the module-level name
    ``loudness`` that earlier cells use for the NumPy array of loudness values.

    Args:
        waveform: tensor shaped ``(..., channels, time)`` with at most 5 channels.
        sample_rate: sampling rate of ``waveform`` in Hz.

    Returns:
        Tensor of loudness values with the channel and time axes reduced
        away. Entries are NaN when no block passes the gating (the block
        count in the denominator is zero).

    Raises:
        ValueError: if ``waveform`` has more than 5 channels.
    """
    if waveform.size(-2) > 5:
        raise ValueError("Only up to 5 channels are supported.")

    gate_duration = 0.4
    overlap = 0.75
    gamma_abs = -70.0
    kweight_bias = -0.691
    gate_samples = int(round(gate_duration * sample_rate))
    step = int(round(gate_samples * (1 - overlap)))

    # Apply K-weighting. The filters live in torchaudio.functional (imported
    # as AF); the bare names used before were undefined in this notebook.
    waveform = AF.treble_biquad(waveform, sample_rate, 4.0, 1500.0, 1 / math.sqrt(2))
    waveform = AF.highpass_biquad(waveform, sample_rate, 38.0, 0.5)

    # Compute the energy for each block: (..., channels, blocks)
    energy = torch.square(waveform).unfold(-1, gate_samples, step)
    energy = torch.mean(energy, dim=-1)

    # Compute channel-weighted summation
    g = torch.tensor([1.0, 1.0, 1.0, 1.41, 1.41], dtype=waveform.dtype, device=waveform.device)
    g = g[: energy.size(-2)]

    energy_weighted = torch.sum(g.unsqueeze(-1) * energy, dim=-2)
    block_loudness = kweight_bias + 10 * torch.log10(energy_weighted)

    # Apply absolute gating of the blocks
    gated_blocks = block_loudness > gamma_abs
    gated_blocks = gated_blocks.unsqueeze(-2)

    energy_filtered = torch.sum(gated_blocks * energy, dim=-1) / torch.count_nonzero(gated_blocks, dim=-1)
    energy_weighted = torch.sum(g * energy_filtered, dim=-1)
    gamma_rel = kweight_bias + 10 * torch.log10(energy_weighted) - 10

    # Apply relative gating of the blocks
    gated_blocks = torch.logical_and(gated_blocks.squeeze(-2), block_loudness > gamma_rel.unsqueeze(-1))
    gated_blocks = gated_blocks.unsqueeze(-2)

    energy_filtered = torch.sum(gated_blocks * energy, dim=-1) / torch.count_nonzero(gated_blocks, dim=-1)
    energy_weighted = torch.sum(g * energy_filtered, dim=-1)
    LKFS = kweight_bias + 10 * torch.log10(energy_weighted)
    return LKFS

In [None]:
# Unpack the first dataset item into its three components.
# NOTE(review): presumably (f0, amplitude/loudness, waveform); confirm against
# AudioDataset.__getitem__.
f, amp, audio = dataset[0]

In [None]:
# Display the shape of the raw audio tensor.
audio.shape

In [None]:
# Prepend a batch axis so the tensor matches the "b c t" layout get_amp expects.
# NOTE(review): not idempotent; every re-run adds another leading axis.
audio = torch.unsqueeze(audio, 0)

In [None]:
def get_amp(example, sample_rate=48000, frame_size=19200, hop_size=3 * 256):
    """Compute a frame-wise loudness curve for a batch of audio.

    Every (batch, channel) signal is padded by half a frame on both sides,
    cut into overlapping frames, and each frame is measured on its own with
    ``torchaudio.functional.loudness``.

    Args:
        example: audio tensor laid out as ``(batch, channel, time)``.
        sample_rate: sampling rate passed to the loudness measurement
            (default 48000).
        frame_size: samples per analysis frame (default 19200, i.e. 0.4 s at
            48 kHz).
        hop_size: samples between the starts of consecutive frames
            (default 3 * 256).

    Returns:
        The loudness tensor returned by ``AF.loudness`` for the stacked
        frames: one value per frame, flattened over batch and channel.
    """
    half_frame = frame_size // 2

    # One row per (batch, channel) signal.
    signals = rearrange(example, "b c t -> (b c) t")
    # Pad so that frames are centred on their hop positions.
    signals = torch.nn.functional.pad(signals, (half_frame, half_frame))
    # (signal, frame, frame_size)
    frames = signals.unfold(1, frame_size, hop_size)
    # Treat every frame as its own single-channel clip: (signal*frame, 1, frame_size)
    frames = rearrange(frames, "s f t -> (s f) t").unsqueeze(1)

    return AF.loudness(frames, sample_rate)

In [None]:
# Frame-wise loudness of the example clip.
# NOTE(review): rebinds `shit`, which previously held the TransformerModel instance.
shit = get_amp(audio)

In [None]:
# Plot the values returned by get_amp.
plt.plot(shit)

In [None]:
# Resample from 48 kHz to 16 kHz.
# `audio` is already 3-D (batch, channel, time) here, since get_amp accepted
# it above. The extra `.unsqueeze(0)` that used to follow made it 4-D, which
# the "b c t" rearrange pattern in the next cell cannot match.
audio = AF.resample(audio, 48000, 16000)

In [None]:
# Merge the batch and channel axes: one row per (batch, channel) signal.
# The pattern requires a 3-D input.
audio = rearrange(audio, "b c t -> (b c) t")

In [None]:
# Pad half a frame on both ends of the time axis, then slice the signal into
# overlapping frames: (signal, frame, frame_length).
frame_length = 19200
hop_length = 3 * 256
half_frame = frame_length // 2
audio_padded = torch.nn.functional.pad(audio, (half_frame, half_frame))
audio = audio_padded.unfold(1, frame_length, hop_length)

In [None]:
# Fold the signal and frame axes together: one row per frame.
audio = rearrange(audio, "bc f t -> (bc f) t")

In [None]:
# Display the shapes of the three tensors (`f`, `amp` and the framed `audio`).
f.shape, amp.shape, audio.shape

In [None]:
# Build the loss object from src.utils.crepe_loss.
# NOTE(review): the name suggests a CREPE-based loss; confirm in the module.
crap = CrepeLoss()

In [None]:
# Move the loss's `crepe` sub-model to the GPU when one is available and fall
# back to the CPU otherwise (a bare `.cuda()` raises on CPU-only machines).
# The trailing semicolon suppresses the module repr in the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
crap.crepe.to(device);

In [None]:
# Prepend a batch axis to the stacked frames before passing them to the loss.
# NOTE(review): not idempotent; every re-run adds another leading axis.
audio = torch.unsqueeze(audio, 0)

In [None]:
# Loss of the audio against itself (a sanity check).
# Transfer the tensor once instead of twice, and fall back to the CPU when no
# GPU is available instead of failing on `.cuda()`.
# NOTE(review): assumes `crap.loss` does not modify its inputs in place, since
# both arguments are now the same tensor object; confirm.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
audio_on_device = audio.to(device)
shit = crap.loss(audio_on_device, audio_on_device)

In [None]:
# Display the loss value computed in the previous cell.
shit