In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
# Default size (width, height in inches) for every figure in this notebook.
plt.rc('figure', figsize=[17, 8])

In [2]:
import torch
from torch.utils.data import DataLoader

In [3]:
from dataset.audio_dataset import AudioData, default
from model.autoencoder.encoder import Encoder
from model.autoencoder.decoder import Decoder
from model.ddsp.harmonic_oscillator import OscillatorBank
from train.train import AutoEncoder
from config.default import Config

In [4]:
# Rebinds `default` (also imported from dataset.audio_dataset in the cell
# above) to a fresh Config instance. Later cells read `sample_rate` and
# `hop_length` from this object.
default = Config()

In [5]:
# Build the audio dataset. In the recorded run this printed
# "Loading presaved dataset...".
# NOTE(review): presumably clear=False reuses a cached copy — confirm in AudioData.
dataset = AudioData(clear=False)

Loading presaved dataset...


In [6]:
# Peek at the first dataset item; in the recorded run it is a 1-D tensor
# of 88064 values.
audio = dataset[0]
audio.shape

torch.Size([88064])

In [7]:
# Batches of 10 items. The explicitly seeded generator makes the shuffle order
# (and therefore the example batch picked below) reproducible across kernel
# restarts.
loader = DataLoader(dataset, batch_size=10, shuffle=True,
                    generator=torch.Generator().manual_seed(0))

In [8]:
# Fetch a single example batch. next(iter(...)) states the intent directly
# instead of relying on a loop variable leaking out of a for/break, and it
# fails immediately if the loader is empty.
batch = next(iter(loader))
print(batch.shape)

torch.Size([10, 88064])


In [9]:
# NOTE(review): this self-assignment is a no-op. It looks like the remnant of
# a device transfer (later cells call .cpu() on everything) — confirm and
# either restore the intended statement or delete this cell.
batch = batch

In [None]:
# Plot the waveform of the first item in the batch.
# librosa 0.9 introduced `waveshow` as the replacement for `waveplot`, and
# `waveplot` was removed in 0.10; use whichever one this installation provides.
_plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
_plot_wave(batch[0].cpu().numpy(), sr=default.sample_rate)

In [None]:
# Audio player widget for the first item of the example batch.
ipd.Audio(batch[0].cpu().numpy(), rate=default.sample_rate)

In [None]:
# Instantiate the project's Encoder with its default constructor arguments.
encoder = Encoder()

In [None]:
# Inference only: run the encoder with gradient tracking disabled, like the
# other forward passes in this notebook. This avoids building an autograd
# graph and keeps the outputs convertible with .numpy() in the cells below.
with torch.no_grad():
    z = encoder(batch)

In [None]:
# Inspect the shape of the 'probabilities' entry of the encoder output.
z['probabilities'].shape

In [None]:
# Overlay three encoder outputs for the first batch item, labelled so the
# figure can be read on its own. .detach() keeps .numpy() valid even when the
# tensors are still attached to an autograd graph.
for key in ('normalized_cents', 'harmonicity', 'loudness'):
    plt.plot(z[key][0].detach().cpu().numpy(), label=key)
plt.title('Encoder outputs (batch item 0)')
plt.legend();

In [None]:
# Show the 'probabilities' matrix of the first batch item as an image.
# `sr` and `hop_length` only influence the plot when an axis type is
# requested, so x_axis='time' is needed for them to take effect.
# .detach() keeps .numpy() valid if the tensor is attached to an autograd graph.
librosa.display.specshow(
    z['probabilities'][0].detach().cpu().numpy().T,
    sr=default.sample_rate,
    hop_length=default.hop_length,
    x_axis='time',
)

In [None]:
# Instantiate the project's AutoEncoder (imported from train.train) with its
# default constructor arguments.
ae = AutoEncoder()

In [None]:
# Run the autoencoder on the example batch with gradient tracking disabled.
# `result_hat` is re-encoded and played back as audio in later cells.
with torch.no_grad():
    result_hat = ae(batch)

In [None]:
# Instantiate the project's Decoder with its default constructor arguments.
decoder = Decoder()

In [None]:
# Feed the encoder output `z` through the standalone Decoder, gradients disabled.
# NOTE(review): `z_hat` is not read by any later cell visible in this
# notebook — confirm whether this cell is still needed.
with torch.no_grad():
    z_hat = decoder(z)

In [None]:
# Re-encode the autoencoder output. This REBINDS `z`: from here on `z` holds
# the encoding of `result_hat`, not of the original `batch`, so re-running
# earlier cells that read `z` will see different data.
with torch.no_grad():
    z = encoder(result_hat)

In [None]:
# Overlay three encoder outputs for the first batch item, using whatever `z`
# currently holds (the previous cell rebinds it to the encoding of
# `result_hat`). Labels and a legend make the figure readable on its own.
for key in ('normalized_cents', 'harmonicity', 'loudness'):
    plt.plot(z[key][0].cpu().numpy(), label=key)
plt.title('Encoder outputs of the reconstruction (batch item 0)')
plt.legend();

In [None]:
# Audio player widget for the first item of the autoencoder output.
ipd.Audio(result_hat[0].cpu().numpy(), rate=default.sample_rate)

In [None]:
from crepe.hidden_markov_model import HiddenMarkovModel

In [None]:
# Size of the HMM state space (number of pitch bins) and half-width, in bins,
# of the triangular transition window. Named once here instead of repeating
# the literals in every expression below.
n_states = 360
max_jump = 12

# uniform prior on the starting pitch
starting = torch.ones(n_states) / n_states

# transition probabilities inducing continuous pitch: the weight falls off
# linearly with the distance |i - j| between bins and is zero from `max_jump`
# bins onwards; each row is then normalised to sum to 1.
# Broadcasting replaces torch.meshgrid, which warns on recent torch versions
# when called without an explicit `indexing` argument.
bins = torch.arange(n_states)
distance = (bins[:, None] - bins[None, :]).abs()
transition = torch.clamp(max_jump - distance, min=0)
transition = transition / transition.sum(dim=1, keepdim=True)

# emission probability = fixed probability for self, evenly distribute the
# others
self_emission = 0.1
emission = (torch.eye(n_states) * self_emission +
            torch.ones((n_states, n_states)) *
            ((1 - self_emission) / n_states))

In [None]:
# Build the HMM from the transition matrix, emission matrix and starting
# distribution defined in the previous cell (passed in that order).
model = HiddenMarkovModel(transition, emission, starting)

In [None]:
# Viterbi-decode the first batch item: the observation sequence is the argmax
# over the last axis of its 'probabilities' entry.
states_seq, state_prob = model.viterbi_inference(
    torch.argmax(z['probabilities'][0], dim=-1)
)

In [None]:
# Compare the encoder's 'normalized_cents' with the Viterbi state path for the
# first batch item. The state indices are scaled to [0, 1] by the highest
# state index, taken from the transition matrix rather than a hard-coded 359.
plt.plot(z['normalized_cents'][0].cpu().numpy(), label='normalized_cents')
plt.plot(states_seq.cpu().numpy() / (transition.shape[0] - 1),
         label='viterbi state / (n_states - 1)')
plt.legend();

In [None]:
# Inspect the shape of the decoded state sequence.
states_seq.shape