In [1]:
import librosa

In [8]:
from music2latent import EncoderDecoder

# --- input audio -----------------------------------------------------------
# librosa's 'trumpet' example clip, loaded with an explicit 44.1 kHz sample rate.
audio_path = librosa.example('trumpet')
wv, sr = librosa.load(audio_path, sr=44100)

# --- model -----------------------------------------------------------------
# EncoderDecoder is built with its default arguments.
encdec = EncoderDecoder()

# --- round trip: waveform -> latent -> waveform ----------------------------
latent = encdec.encode(wv)
# latent has shape (batch_size/audio_channels, dim (64), sequence_length)
wv_rec = encdec.decode(latent)

In [9]:
# Report the waveform shape and its duration (sample count / sample rate).
print(wv.shape, "wv.shape")
num_samples = wv.shape[0]
print("duration in seconds:", num_samples / sr)

# Shapes of the latent encoding and of the reconstructed waveform.
for shape, label in ((latent.shape, "latent.shape"), (wv_rec.shape, "wv_rec.shape")):
    print(shape, label)

# Second encode pass, this time asking for the extract_features=True output.
features = encdec.encode(wv, extract_features=True)
print(features.shape, "features.shape")


# print some statistics on torch tensors (same line format for each tensor)
for stats_src in (latent, features):
    print('mean:', stats_src.mean(), 'std:', stats_src.std(), 'min:', stats_src.min(), 'max:', stats_src.max())


(235202,) wv.shape
duration in seconds: 5.333378684807256
torch.Size([1, 64, 57]) latent.shape
torch.Size([1, 235008]) wv_rec.shape
torch.Size([1, 8192, 57]) features.shape
mean: tensor(-0.0793) std: tensor(1.3069) min: tensor(-4.6763) max: tensor(4.7329)
mean: tensor(-0.0209) std: tensor(0.9941) min: tensor(-12.1267) max: tensor(12.3214)


In [10]:
import torch
import torchaudio.transforms as T

# Reference settings this cell mirrors. Only win_len, hop_len, n_mels and
# n_fft are applied below; dur / offset / norm are kept as notes only.
# dur: 1.0
# offset: 0.2
# norm: 0.95
# win_len: 1024
# hop_len: 512
# n_mels: 64
# n_fft: 1024
N_FFT = 1024
WIN_LENGTH = 1024
HOP_LENGTH = 512
N_MELS = 64

# create a mel spectrogram
mel_transform = T.MelSpectrogram(
    sample_rate=sr,
    n_fft=N_FFT,
    win_length=WIN_LENGTH,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
)

# Convert the waveform to a tensor with a leading axis: (N,) -> (1, N).
# torch.as_tensor accepts both the NumPy array from the loading cell and an
# already-converted tensor, and the rank guard stops a second axis from being
# stacked on when this cell is re-run (the previous torch.tensor(...).unsqueeze(0)
# produced (1, 1, N) on a second run).
# NOTE: `wv` is still rebound to the batched tensor, so cells that read
# wv.shape[0] as a sample count must run before this one.
wv = torch.as_tensor(wv)
if wv.ndim == 1:
    wv = wv.unsqueeze(0)
mel = mel_transform(wv)

In [11]:
# Shape of the mel spectrogram produced by mel_transform.
print(mel.shape)
# Frame rate = samples per second / samples per hop. The hop is read from the
# transform itself instead of repeating the literal 512, so this figure cannot
# drift out of sync if the transform's hop_length is changed.
print("frames per second:", sr / mel_transform.hop_length)

torch.Size([1, 64, 460])
frames per second: 86.1328125
