In [4]:
%load_ext autoreload
%autoreload 2

from vlstm_spec import *
from music_dataset import TorchMusicDataset

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchaudio.transforms as transforms

import matplotlib.pyplot as plt
import numpy as np

import numpy as np
from scipy.io.wavfile import write
    
from scipy.signal import resample, firwin, lfilter

import librosa


dataset = TorchMusicDataset("../dataset/cleaned_data/cleaned_data")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def downsample_audio(audio, original_rate, new_rate):
    factor = original_rate // new_rate
    
    nyquist_rate = original_rate / 2
    cutoff_freq = new_rate / 2
    numtaps = 101 
    fir_coeff = firwin(numtaps, cutoff_freq / nyquist_rate)
    filtered_audio = lfilter(fir_coeff, 1.0, audio)
    
    downsampled_audio = filtered_audio[::factor]
    
    return downsampled_audio

In [3]:
songs = []
for i in range(100):
    obj = dataset.__getitem__(i)
    song = obj['time_series'].numpy()
    downsampled_song = downsample_audio(song, 44100, 11025)
    songs.append(downsampled_song[:11025*5])
    

In [4]:
batch_size = 4
n_in = songs[0].shape[0]
hidden_size = 5
songs = np.array(songs)
sample_rate = 22050
features = 32
latent_dim = 4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
dataloader = create_dataloader(songs, sample_rate, batch_size, n_in)
    
vae = VAE(n_in, features, latent_dim, device, hidden_size).to(device)
optimizer = optim.Adam(vae.parameters(), lr=1e-3)
vae.optimizer = optimizer

lossGraph = []
# Training loop
epochs = 100
for epoch in range(epochs):
    epoch_loss = 0
    num_batches = 0
    
    for batch_x, _ in dataloader:
        batch_x = batch_x.to(device)
        loss = vae.training_step(batch_x)
        
        epoch_loss += loss
        num_batches += 1
    
    average_loss = epoch_loss / num_batches
    lossGraph.append(average_loss)
    
    print(f'Epoch {epoch+1}, Average Loss: {average_loss:.4f}')

Epoch 1, Average Loss: 265.4140
Epoch 2, Average Loss: 265.3775
Epoch 3, Average Loss: 265.3563
Epoch 4, Average Loss: 265.3312
Epoch 5, Average Loss: 265.3103
Epoch 6, Average Loss: 265.2962
Epoch 7, Average Loss: 265.2763
Epoch 8, Average Loss: 265.2633
Epoch 9, Average Loss: 265.2533
Epoch 10, Average Loss: 265.2453
Epoch 11, Average Loss: 265.2357
Epoch 12, Average Loss: 265.2307
Epoch 13, Average Loss: 265.2220
Epoch 14, Average Loss: 265.2194
Epoch 15, Average Loss: 265.2072
Epoch 16, Average Loss: 265.2061
Epoch 17, Average Loss: 265.2024
Epoch 18, Average Loss: 265.2035
Epoch 19, Average Loss: 265.2007
Epoch 20, Average Loss: 265.2029
Epoch 21, Average Loss: 265.1981
Epoch 22, Average Loss: 265.1974
Epoch 23, Average Loss: 265.2010
Epoch 24, Average Loss: 265.1954
Epoch 25, Average Loss: 265.1953
Epoch 26, Average Loss: 265.2039
Epoch 27, Average Loss: 265.1925
Epoch 28, Average Loss: 265.1962
Epoch 29, Average Loss: 265.1948
Epoch 30, Average Loss: 265.1953
Epoch 31, Average L

In [5]:
torch.save(vae, 'vlstm_spec.pt')

In [15]:
from scipy.io.wavfile import write

num_samples = 1
generated_sequences = vae.generate_sequences(num_samples)

spec = generated_sequences[0]
mel_inverted = librosa.feature.inverse.mel_to_stft(spec, sr=11025, n_fft=2048)
audio_array = librosa.griffinlim(mel_inverted, n_iter=32, hop_length=512, win_length=2048)
audio_array = audio_array / np.max(np.abs(audio_array))
audio_array = (audio_array * 32767).astype(np.int16)
sample_rate = 11025
output_filename = 'gen_audio_vlstm_spec_.wav'

write(output_filename, sample_rate, audio_array)

print(f"Audio saved as {output_filename}")

Audio saved as gen_audio_vlstm_spec_.wav


In [12]:
generated_sequences.shape

(1, 55125, 32)

In [13]:
audio_array.shape

(15872,)