In [None]:
import sys
from pathlib import Path

# Make the sibling `src` directory importable. It is prepended only when it is
# not already on sys.path, so re-running this cell does not add duplicates.
_src_dir = str(Path.cwd().parent / 'src')
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 10]
from IPython.display import Audio

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import librosa
from librosa.display import specshow
from librosa.filters import get_window

In [None]:
from models.modules.harmonic_oscillator import OscillatorBank

In [None]:
# Global settings shared by the synthesis and analysis cells below.
n_harmonics = 60  # number of harmonics passed to OscillatorBank and used for the FBSP kernels
sample_rate = 16000  # audio sample rate (used as the playback rate and to normalise frequencies)
win_length = 1024  # analysis window length, in samples
hop_length = 64  # hop between consecutive analysis frames, in samples
f0 = 110.  # base fundamental frequency (divided by sample_rate below, so presumably Hz)
dur = 500  # number of control frames in the synthesised signal

In [None]:
osc = OscillatorBank(n_harmonics, sample_rate, hop_length)

In [None]:
with torch.no_grad():
    # One random, non-negative harmonic profile, repeated for every control frame.
    dist = torch.tile(torch.randn(1, 1, n_harmonics).abs(), (1, dur, 1))

    # Alternative profiles kept from experimentation:
    #   dist = 61. - torch.arange(1, 61)
    #   dist = torch.ones(1, dur, 60)

    # Normalise so the harmonic weights of each frame sum to 1.
    dist = dist / dist.sum(-1, keepdim=True)
    amp = 0.9

    # Slow sinusoidal pitch contour: (sin(.) + 2) * f0 / 3 stays between f0 / 3 and f0.
    frame_times = torch.linspace(0, hop_length * dur / sample_rate, dur)
    contour = torch.sin(frame_times * 3.14159265 * 0.5)
    freq = (contour[None, :, None] + 2) * f0 / 3
    freq = torch.tile(freq, (2, 1, 1))  # batch of two identical contours
    # Constant-pitch alternative: freq = torch.ones(2, dur, 1) * f0

    # Per-frame amplitude is constant; the trailing unsqueeze adds a size-1 last axis.
    frame_amp = torch.ones(1, dur, 1) * amp
    audio = osc(freq, frame_amp, dist).unsqueeze(-1)

In [None]:
audio.shape

In [None]:
Audio(data=audio[0].T, rate=sample_rate, normalize=False)

In [None]:
# Magnitude spectrogram of the first batch item. `n_fft` and `hop_length` are passed by
# keyword: recent librosa releases make every argument after `y` keyword-only, so the
# positional form raises a TypeError there. The keyword form is equivalent on older versions.
stft = np.abs(librosa.stft(audio[0, ..., 0].numpy(), n_fft=win_length, hop_length=hop_length))

In [None]:
specshow(librosa.amplitude_to_db(stft), sr=sample_rate, hop_length=hop_length)
plt.show()

- Frequency is expressed in cycles per sample.
- The frequency conversion factor is the Nyquist of `win_length` divided by the Nyquist of `sample_rate`, i.e. `(win_length / 2) / (sample_rate / 2)`.
- `f0 * this_factor` is the frequency term in the FBSP kernel.
- Given the CREPE pitch, learn the inharmonicity factor by maximizing the real sound's total energy in this new transform.

In [None]:
def pad_audio(x, win_length, hop_length, strict=True):
    """Zero-pad the last axis so FFT windows sit on the middle of hop-sized audio frames.

    Args:
        x: tensor whose last axis is time; the intended layout is
            [batch, channel, dummy, time].
        win_length: analysis window length in samples.
        hop_length: hop between windows in samples.
        strict: when True, a length that is not a multiple of ``hop_length`` is an
            error; when False, the signal is first extended on the right to the
            next multiple of ``hop_length``.

    Returns:
        ``x`` padded by ``(win_length - hop_length) // 2`` samples on both sides.

    Raises:
        ValueError: in strict mode, if the length is not a multiple of ``hop_length``.
    """
    remainder = x.shape[-1] % hop_length
    if remainder and strict:
        raise ValueError('In strict mode, audio length must be a multiple of hop length')
    if remainder:
        # Round the length up to the next multiple of hop_length.
        x = F.pad(x, (0, hop_length - remainder))

    half_overlap = (win_length - hop_length) // 2
    return F.pad(x, (half_overlap, half_overlap))

In [None]:
def pad_audio_basic(x, win_length, hop_length, strict=True):
    """Zero-pad the last axis so the first FFT window is centred on the first sample.

    Args:
        x: tensor whose last axis is time; the intended layout is
            [batch, channel, dummy, time].
        win_length: analysis window length in samples.
        hop_length: hop between windows in samples.
        strict: when True, a length that is not a multiple of ``hop_length`` is an
            error; when False, the signal is first extended on the right to the
            next multiple of ``hop_length``.

    Returns:
        ``x`` padded by ``win_length // 2`` samples on the left and
        ``win_length // 2 - hop_length`` samples on the right.

    Raises:
        ValueError: in strict mode, if the length is not a multiple of ``hop_length``.
    """
    overshoot = x.shape[-1] % hop_length
    if overshoot != 0:
        if strict:
            raise ValueError('In strict mode, audio length must be a multiple of hop length')
        # Extend to the next multiple of hop_length before the centring pad.
        x = F.pad(x, (0, hop_length - overshoot))

    half_window = win_length // 2
    return F.pad(x, (half_window, half_window - hop_length))

In [None]:
def generate_fbsp_matrix(f0: torch.Tensor, n_harmonics: int, win_length: int, sample_rate: int, fb: float = 1, m: int = 1):
    """Build one complex analysis kernel per frame and per harmonic of ``f0``.

    Args:
        f0: fundamental frequency per frame, shape [batch, time, 1].
        n_harmonics: number of harmonics (1 .. n_harmonics) to generate.
        win_length: number of samples covered by each kernel.
        sample_rate: used to convert ``f0 * harmonic`` into cycles per sample.
        fb: bandwidth parameter of the kernel.
        m: order parameter of the kernel.

    Returns:
        Complex tensor of shape [batch, time, n_harmonics, win_length] with entries
        ``sqrt(fb) * (n * fb / m) ** m * exp(2j * pi * f0 * harmonic / sample_rate * n)``.

    Harmonics above Nyquist are NOT removed here, which avoids looping over
    time steps; the caller has to zero them out if needed.
    NOTE(review): the usual frequency B-spline wavelet has ``sinc(fb * t / m) ** m``
    where this code has ``(n * fb / m) ** m`` - confirm this is intended.
    """
    sample_idx = torch.arange(win_length, dtype=torch.float32)
    harmonic_idx = torch.arange(1, n_harmonics + 1, dtype=torch.float32)

    # Centre frequency of every harmonic in cycles per sample: [batch, time, harmonic].
    fc = f0 * harmonic_idx / sample_rate

    # Complex carrier over the window: [batch, time, harmonic, sample].
    carrier = torch.exp(2j * np.pi * (fc[..., None] * sample_idx))

    # Polynomial term over the window, broadcast along the sample axis.
    envelope = (sample_idx * fb / m) ** m
    scale = torch.sqrt(torch.tensor(fb, dtype=torch.float32))

    return scale * (envelope * carrier)

In [None]:
fbsp = generate_fbsp_matrix(freq, n_harmonics, win_length, sample_rate)

In [None]:
# Reshape to [batch, channel, dummy, time] for padding and framing.
_audio = audio.permute(0, 2, 1).unsqueeze(-2)
padded_audio = pad_audio(_audio, win_length, hop_length)

# Cut the padded signal into win_length-sample frames taken every hop_length samples.
audio_frames = F.unfold(padded_audio, kernel_size=(1, win_length), stride=(1, hop_length))

# Multiply every frame by a Hann window along its sample axis (axis 1).
hann = torch.hann_window(win_length)
windowed_frames = audio_frames * hann[None, :, None]

In [None]:
audio_frames.shape, fbsp.shape

In [None]:
# Project every windowed frame onto its per-frame harmonic kernels by summing over the
# sample axis `n`; the result is indexed [batch, frame, harmonic].
complex_frames = windowed_frames.type(torch.complex64)
transformed = torch.einsum('bnt,bthn->bth', complex_frames, fbsp)

In [None]:
# Turn the magnitudes into a per-frame distribution over harmonics (each frame sums to 1).
magnitudes = torch.abs(transformed)
new_dist = magnitudes / magnitudes.sum(dim=-1, keepdim=True)

In [None]:
plt.imshow(new_dist[0].flip(1).T)

In [None]:
with torch.no_grad():
    # Resynthesise with the same pitch and amplitude, but with the harmonic
    # distribution estimated by the transform.
    # Constant-pitch alternative: freq = torch.ones(2, dur, 1) * f0
    frame_amp = torch.ones(1, dur, 1) * amp
    new_audio = osc(freq, frame_amp, new_dist).unsqueeze(-1)

In [None]:
# Play the resynthesised signal at the notebook's configured sample rate, matching how
# the original `audio` is played, instead of a hard-coded 16000.
Audio(data=new_audio[0].T, rate=sample_rate, normalize=False)

In [None]:
def dft_kernel(n, k):
    """Return the DFT kernel ``exp(-2j * pi * (k / win_length) * n)`` as a complex64 tensor.

    ``n`` is the sample index and ``k`` the frequency-bin index; ``win_length`` is
    read from the notebook's global scope.
    """
    phase = -2j * np.pi * (k / win_length) * n
    return torch.exp(torch.tensor(phase, dtype=torch.complex64))

In [None]:
def selective_dft_kernel(n, k, f0=f0):
    """DFT-style kernel evaluated only at the harmonics of ``f0``.

    Args:
        n: sample index (or array of sample indices).
        k: zero-based harmonic index (or array of indices); index 0 is the fundamental.
        f0: fundamental frequency; defaults to the global ``f0`` at definition time.

    Returns:
        complex64 tensor ``exp(-2j * pi * (f0 * (k + 1) / sample_rate) * n)``.
        ``sample_rate`` is read from the notebook's global scope.
    """
    # Use a new variable rather than `k += 1`, which would modify a caller-owned
    # array in place.
    harmonic = k + 1
    phase = -2j * np.pi * (f0 * harmonic / sample_rate) * n
    return torch.exp(torch.tensor(phase, dtype=torch.complex64))

In [None]:
def fbsp_kernel(n, k, f0=f0, fb=1, m=1):
    """FBSP-style kernel evaluated at the harmonics of ``f0``.

    Args:
        n: sample index (or array of sample indices).
        k: zero-based harmonic index (or array of indices); index 0 is the fundamental.
        f0: fundamental frequency; defaults to the global ``f0`` at definition time.
        fb: bandwidth parameter.
        m: order parameter.

    Returns:
        Tensor ``sqrt(fb) * (fb * n / m) ** m * exp(2j * pi * (f0 * (k + 1) / sample_rate) * n)``.
        ``sample_rate`` is read from the notebook's global scope.

    NOTE(review): the usual frequency B-spline wavelet has ``sinc(fb * t / m) ** m``
    where this code has ``(fb * n / m) ** m`` - confirm this is intended.
    """
    # Use a new variable rather than `k += 1`, which would modify a caller-owned
    # array in place.
    harmonic = k + 1
    scale = torch.sqrt(torch.tensor(fb, dtype=torch.float32))
    envelope = torch.tensor((fb * n / m) ** m, dtype=torch.float32)
    carrier = torch.exp(torch.tensor(2j * np.pi * (f0 * harmonic / sample_rate) * n, dtype=torch.complex64))
    return scale * envelope * carrier

In [None]:
# Frequencies of harmonics 1..n_harmonics of f0, keeping only those strictly below Nyquist.
harmonic_freqs = np.arange(1, n_harmonics + 1) * f0
below_nyquist = harmonic_freqs < sample_rate / 2
overtones = harmonic_freqs[below_nyquist]
n_overtones = len(overtones)

In [None]:
# np.fromfunction calls each kernel function once with index grids of the requested
# shape (n = row index, k = column index) and returns the function's result unchanged,
# so all three kernels are torch tensors.
kernel = np.fromfunction(dft_kernel, (win_length, win_length))  # full DFT: one column per bin
fkernel = np.fromfunction(fbsp_kernel, (win_length, n_overtones))  # one column per overtone
skernel = np.fromfunction(selective_dft_kernel, (win_length, n_overtones))  # one column per overtone

In [None]:
# Project one win_length-sample excerpt (samples 8*win_length .. 9*win_length, last-axis
# index 0) onto the FBSP kernel. Adding 0j makes the audio complex for the matmul.
tr = (0j + audio[:, win_length*8:win_length*9, 0]) @ fkernel

In [None]:
audio[:, win_length*8:win_length*9, 0].shape, fkernel.shape, tr.shape

In [None]:
# The same excerpt-times-kernel product as `tr`, written as an explicit einsum that
# sums over the sample axis `i`.
trein = torch.einsum('bi,ij->bj', (0j + audio[:, win_length*8:win_length*9, 0]), fkernel)

In [None]:
# `audio` has a trailing size-1 axis (added by `.unsqueeze(-1)` after synthesis), with
# time on axis 1. F.pad consumes its padding pairs starting from the LAST axis, so a
# single pair would pad that trailing axis. The leading (0, 0) leaves it untouched and
# the second pair pads win_length // 2 samples on each side of the time axis.
padded_audio = F.pad(audio, (0, 0, win_length // 2, win_length // 2))

In [None]:
hann = torch.hann_window(win_length)

In [None]:
# Normalise the first batch item's transform magnitudes to sum to 1, then compare them
# with the harmonic distribution used for synthesis (first batch item, first frame).
atr = tr[0].abs()
bins = atr / atr.sum()
diff = dist[0, 0] - bins
diff.mean(), diff.std(), bins.min(), bins.max(), dist.min(), dist.max()

In [None]:
pad_audio[:, win_length*i:win_length*(i+1)].shape

In [None]:
pad_audio.shape

In [None]:
win_length*i

In [None]:
win_length*(i+1) - win_length*i

In [None]:
pad_audio.shape[1] - audio.shape[1]

In [None]:
i

In [None]:
def generate_fbsp_matrix(f0: torch.Tensor, n_harmonics: int, win_length: int, sample_rate: int, fb: int = 1, m: int = 1):
    """Build one complex analysis kernel per frame and per harmonic of ``f0``.

    Args:
        f0: fundamental frequency per frame, shape [batch, time, 1].
        n_harmonics: number of harmonics (1 .. n_harmonics) to generate.
        win_length: number of samples covered by each kernel.
        sample_rate: used to convert ``f0 * harmonic`` into cycles per sample.
        fb: bandwidth parameter of the kernel.
        m: order parameter of the kernel.

    Returns:
        Complex tensor of shape [batch, time, win_length, n_harmonics] with entries
        ``sqrt(fb) * (n * fb / m) ** m * exp(2j * pi * f0 * harmonic / sample_rate * n)``.

    Harmonics above Nyquist are not removed here, which avoids looping over
    time steps; the caller has to zero them out if needed.
    NOTE(review): a function with this name is also defined earlier in the notebook
    with the last two output axes swapped; running this cell replaces it.
    """
    n = torch.arange(win_length, dtype=torch.float32)
    k = torch.arange(1, n_harmonics + 1, dtype=torch.float32)

    # Centre frequencies in cycles per sample. Without the division the phase below
    # would be computed from frequencies in Hz and `sample_rate` would be unused.
    fc = f0 * k / sample_rate  # [batch, time, harmonic]

    left = torch.sqrt(torch.tensor(fb, dtype=torch.float32)) * (n * fb / m) ** m  # [sample]
    right = torch.exp(2j * np.pi * (fc[..., None, :] * n[:, None]))  # [batch, time, sample, harmonic]

    # Scale every sample row of the carrier by the polynomial term.
    return left[:, None] * right

In [None]:
# NOTE: this rebinds the global `f0` (a float until here) to a [1, 10, 1] tensor filled
# with 110. Functions that captured `f0` as a default argument keep the earlier value.
f0 = torch.ones(1, 10, 1) * 110.
# Shape check of the kernel matrix for 10 constant-pitch frames.
generate_fbsp_matrix(f0, n_harmonics, win_length, sample_rate).shape

In [None]:
# Integer sample indices 0..win_length-1 and harmonic numbers 1..n_harmonics.
n = torch.arange(win_length)
k = torch.arange(1, n_harmonics + 1)
# Grid of (sample index, harmonic number) pairs: nk[i, j] == (n[i], k[j]),
# shape [win_length, n_harmonics, 2].
nk = torch.stack(torch.meshgrid(n, k, indexing='ij'), dim=-1)

In [None]:
n.shape, k.shape, nk.shape

In [None]:
_nk = nk.unsqueeze(0).unsqueeze(0)

In [None]:
_nk.shape, f0.shape

In [None]:
torch.einsum('abijk,abk->abijk', nk.unsqueeze(0).unsqueeze(0)[..., 1:], f0).shape

In [None]:
# NOTE: rebinds the global `f0` again, now to a [1, dur, 1] sinusoidal contour;
# (sin(.) + 1) * 110. ranges from 0 to 220.
f0 = (torch.sin(torch.linspace(0, hop_length * dur / sample_rate, dur) * 3.14159265 * 0.5).unsqueeze(0).unsqueeze(-1) + 1) * 110.
# Integer sample indices and harmonic numbers, as in the grid experiment above.
n = torch.arange(win_length)
k = torch.arange(1, n_harmonics + 1)
# fc = f0 * k.view(1, 1, -1)

In [None]:
# Every (n, k) combination as one row, giving a [len(n) * len(k), 2] tensor.
base_matrix = torch.cartesian_prod(n, k)

In [None]:
base_matrix.shape

In [None]:
f0.shape

```python
def smooth(x, filter_size=3):
    """Box-filter (moving-average) smoothing of a 1-D signal or a 2-D batch of signals."""
    x = tf.convert_to_tensor(x, tf.float32)
    batched = len(x.shape) == 2

    # Add the axes conv1d needs: a channel axis, plus a batch axis for unbatched input.
    if batched:
        x = x[:, :, tf.newaxis]
    else:
        x = x[tf.newaxis, :, tf.newaxis]

    # Uniform kernel whose taps sum to 1.
    box = tf.ones([filter_size])[:, tf.newaxis, tf.newaxis] / float(filter_size)
    smoothed = tf.nn.conv1d(x, box, stride=1, padding='SAME')

    # Strip the helper axes again so the output rank matches the input rank.
    smoothed = smoothed[:, :, 0] if batched else smoothed[0, :, 0]
    return smoothed.numpy()
```