In [1]:
import os
import glob

# Found file corresponding to this W&B run with: `grep "3pzwny4n" outputs/*/args.json`
model_folder = '../outputs/crepe-20211129-122548'

model_paths = glob.glob(os.path.join(model_folder, '*'))

In [2]:
import sys
sys.path.append('/home/jxm3/research/transcription/contrastive-pitch-detection')

In [3]:
from models.bytedance import Bytedance_Regress_pedal_Notes
from models.contrastive import ContrastiveModel

min_midi = 21
max_midi = 108
def get_model():
    num_output_nodes = 256 # contrastive embedding dim
    out_activation = None
    
    model = Bytedance_Regress_pedal_Notes(
        num_output_nodes, out_activation, tiny=False
    )
    
    return ContrastiveModel(model, min_midi, max_midi, num_output_nodes)

In [4]:
import glob
import natsort
import os
import torch

model_paths = glob.glob(os.path.join(model_folder, '*'))

model_path = natsort.natsorted(model_paths)[-2]
print('loaded model from:', model_path)

model = get_model()
model.load_state_dict(torch.load(model_path)['model'])

loaded model from: ../outputs/crepe-20211129-122548/84_epochs.pth


<All keys matched successfully>

In [5]:
# from dataloader.nsynth import load_nsynth
# dataset = load_nsynth('test', 'keyboard')

from dataloader.nsynth_chords import load_nsynth_chords
dataset = load_nsynth_chords('test')

print('loaded', len(dataset), 'tracks')

import random
random.shuffle(dataset)

loaded 993 tracks


In [6]:
import numpy as np
from utils.misc import midi_vals_to_categorical, hz_to_midi_v

batch_size = 256

min_midi = 21
max_midi = 108

x = []
y = []
all_midis = []

for i in range(batch_size):
    track = dataset[i]
    start_idx = 0
    end_idx = 16_000
    #
    audio = torch.tensor(track.waveform[start_idx : end_idx], dtype=torch.float32)
    x.append(audio)
    #
    frequencies = track.get_frequencies_from_offset(start_idx, end_idx)
    midis = np.rint(hz_to_midi_v(frequencies))
    all_midis.append(list(midis))
    categorical = midi_vals_to_categorical(midis, min_midi, max_midi)
    y.append(torch.tensor(categorical, dtype=torch.float32))
x = torch.stack(x)
y = torch.stack(y)
print('loaded audio batch of shape:', x.shape, 'with labels', y.shape)

loaded audio batch of shape: torch.Size([256, 16000]) with labels torch.Size([256, 88])


In [6]:
import seaborn as sns
sns.histplot(y.sum(1)) # validation set chord distribution (by num. notes)

NameError: name 'y' is not defined

In [None]:
from IPython.display import Audio
Audio(data=x[0], rate=16000)

In [11]:
audio_embeddings = model(x)
note_labels = y

batch_size, num_notes = note_labels.shape
assert num_notes == model.num_labels
chord_embeddings = model.encode_note_labels(note_labels)
assert chord_embeddings.shape == audio_embeddings.shape
# Normalize embeddings and compute logits.
normalized_audio_embeddings = audio_embeddings / torch.norm(audio_embeddings, p=2, dim=1, keepdim=True)
normalized_chord_embeddings = chord_embeddings / torch.norm(chord_embeddings, p=2, dim=1, keepdim=True)
unscaled_audio_to_chord_sim = torch.matmul(normalized_audio_embeddings, normalized_chord_embeddings.T)
audio_to_chord_sim = unscaled_audio_to_chord_sim * torch.exp(model.temperature)
chord_to_audio_sim = audio_to_chord_sim.T

logits = unscaled_audio_to_chord_sim

# Compute labels when there may be duplicates.
labels = (note_labels[:,None] == note_labels).all(2).type(torch.float32)
labels = labels / labels.sum(1)
# Compute loss across both axes.
loss_a = torch.nn.functional.cross_entropy(audio_to_chord_sim, labels)
loss_n = torch.nn.functional.cross_entropy(chord_to_audio_sim, labels.T)
loss = (loss_a + loss_n)/2
loss

tensor(2.1116, grad_fn=<DivBackward0>)

# Investigating train/val difference

Why is there such a big difference between the train and validation loss? Clearly, the heatmap above is pretty bad. But what does it look like for the training data?

In [7]:
from generator import AudioDataGenerator
g = AudioDataGenerator(
        [], 16000, float('inf'),
        randomize_train_frame_offsets=True,
        batch_size=256,
        augmenter=None,
        normalize_audio=False,
        label_format='categorical',
        min_midi=21, max_midi=108,
        sample_rate=16000,
        batch_by_track=False,
        num_fake_nsynth_chords=1000,
    )

Replacing 0 tracks with 1000 fake NSynth chords
--> MusicDataLoader loading dataset nsynth_keyboard_train


Resampling tracks: 100%|██████████| 51821/51821 [00:00<00:00, 1481928.88it/s]


TrackFrameSampler loaded 4000 frames


In [8]:
x_train, y_train = g[0]

In [None]:
audio_embeddings = model(x_train)
note_labels = y_train

batch_size, num_notes = note_labels.shape
assert num_notes == model.num_labels
chord_embeddings = model.encode_note_labels(note_labels)
assert chord_embeddings.shape == audio_embeddings.shape
# Normalize embeddings and compute logits.
normalized_audio_embeddings = audio_embeddings / torch.norm(audio_embeddings, p=2, dim=1, keepdim=True)
normalized_chord_embeddings = chord_embeddings / torch.norm(chord_embeddings, p=2, dim=1, keepdim=True)
unscaled_audio_to_chord_sim = torch.matmul(normalized_audio_embeddings, normalized_chord_embeddings.T)
audio_to_chord_sim = unscaled_audio_to_chord_sim * torch.exp(model.temperature)
chord_to_audio_sim = audio_to_chord_sim.T

logits = unscaled_audio_to_chord_sim

# Compute labels when there may be duplicates.
labels = (note_labels[:,None] == note_labels).all(2).type(torch.float32)
labels = labels / labels.sum(1)
# Compute loss across both axes.
loss_a = torch.nn.functional.cross_entropy(audio_to_chord_sim, labels)
loss_n = torch.nn.functional.cross_entropy(chord_to_audio_sim, labels.T)
loss = (loss_a + loss_n)/2
loss

tensor(0.2072, grad_fn=<DivBackward0>)

In [125]:
a=[]
for _ in range(10000):
    a.append(np.random.choice([1,2,3,4,5,6], p=[0.5, 0.25, 0.125, 0.0625, 0.03125, 0.03125]))
    
import collections
import pprint
pprint.pprint( dict( collections.Counter(a) ) )

{1: 4948, 2: 2563, 3: 1256, 4: 599, 5: 340, 6: 294}
