In [1]:
import sys
import logging
import os
import math
import json
from tqdm import tqdm
import warnings
from IPython.display import display, Audio

import numpy as np
import librosa

import torch

import scdata

sys.path.append('..')
logging.getLogger().setLevel(logging.INFO)
warnings.filterwarnings('ignore')

from aural_travels.data import soundcloud
from aural_travels.train import visualizer

# Locations of the trained model and the SoundCloud data. The defaults are the
# original author's paths; set the environment variables below to run the
# notebook on another machine without editing this cell.
MODEL_DIR = os.environ.get(
    'AURAL_TRAVELS_MODEL_DIR',
    '/home/leo/src/aural-travels/models/bottleneck_gen_max_dist1_enc6_dec3_layer_scale_dropout0.2input',
)
DATA_DIR = os.environ.get('AURAL_TRAVELS_DATA_DIR', '/home/leo/src/scdata')

In [2]:
# Load the parameters stored as params.json inside MODEL_DIR and show them.
params_path = os.path.join(MODEL_DIR, 'params.json')
with open(params_path) as params_file:
    params = json.load(params_file)

params

{'soundcloud_data_dir': '/home/leo/src/scdata',
 'dataset': 'soundcloud',
 'num_workers': 64,
 'batch_size': 256,
 'gradient_accumulation': 2,
 'num_epochs': 2000,
 'hidden_size': 512,
 'num_layers': 8,
 'num_enc_layers': 6,
 'num_dec_layers': 3,
 'num_heads': 1,
 'attention_dropout': 0.2,
 'ffnn_dropout': 0.2,
 'audio_emb_dropout': 0.2,
 'input_dropout': 0.2,
 'lr': 0.0001,
 'output_dir': 'models/bottleneck_gen_max_dist1_enc6_dec3_layer_scale_dropout0.2input',
 'sample_secs': 2.0,
 'n_fft': 2048,
 'hop_length': 1024,
 'save_steps': 100,
 'eval_steps': 100,
 'encoding_dir': 'models/encoding/vqgan',
 'non_autoregressive': False,
 'corrupt_image_mode': None,
 'seed': 42,
 'toy_data': False,
 'expose_steps': None,
 'expose_alpha': 0.5,
 'contrastive_lambda': None,
 'pull_lambda': None,
 'axial_attention': False,
 'use_layer_scale': True,
 'global_features': False,
 'image_repr': 'vqgan',
 'model': 'bottleneck_gen'}

In [3]:
# NOTE(review): the output saved with this cell is `KeyError: 'num_latents'`,
# and that key is absent from the params printed for this run. Presumably one
# of the visualizer helpers now requires it -- confirm which one and add the
# value to params.json before relying on this cell.

# Dataset split to visualise; use 'training' to inspect the training set instead.
split = 'validation'

# Tensor file for the chosen split and image representation
# (presumably the precomputed image encodings -- TODO confirm).
encoding_path = f'../models/encoding/{params["image_repr"]}/{split}.pt'
dataset = visualizer.load_dataset(params, split, torch.load(encoding_path))
image_repr = visualizer.create_image_repr(params)
model = visualizer.create_model(params, image_repr, dataset)
model.eval().to('cuda')

# Deserialize the checkpoint onto the CPU instead of the device it was saved
# from; load_state_dict then copies the weights into the model on the GPU.
checkpoint = torch.load(os.path.join(MODEL_DIR, 'last_checkpoint.pt'),
                        map_location='cpu')
model.load_state_dict(checkpoint['model'])

checkpoint['global_step']

Working with z of shape (1, 256, 16, 16) = 65536 dimensions.
loaded pretrained LPIPS loss from taming/modules/autoencoder/lpips/vgg.pth
VQLPIPSWithDiscriminator running with hinge loss.


KeyError: 'num_latents'

In [None]:
def show(image_seq, save=None):
    """Decode an image sequence, display the first decoded image and return it.

    Args:
        image_seq: Sequence passed to ``image_repr.decode``; only the first
            decoded element is shown.
        save: Optional target handed to the image's ``save`` method. Nothing
            is written when it is falsy.

    Returns:
        The image object produced by ``image_repr.tensor_to_image``.
    """
    decoded = image_repr.decode(image_seq)
    picture = image_repr.tensor_to_image(decoded[0])
    display(picture)
    if not save:
        return picture
    picture.save(save)
    return picture

# Preview the three corrupted-input variants in turn: a random sequence, a
# random sequence built with patch_size=4, and an all-zeros sequence.
for make_corrupt_seq in (
    lambda: image_repr.rand_image_seq(1, device='cuda'),
    lambda: image_repr.rand_image_seq(1, device='cuda', patch_size=4),
    lambda: image_repr.zeros_image_seq(1, device='cuda'),
):
    corrupt_image_seq = make_corrupt_seq()
    show(corrupt_image_seq)

# End on a print so the cell itself has no trailing expression value.
print('')

In [None]:
# Tracks to visualise and how many images to generate for each of them.
indices = list(range(100, 200))
samples_per_track = 1

for idx in indices:
    track = dataset.tracks[idx]
    print(f'{track["genre"]}: {track["title"]} ({track["id"]})')

    audio1, ref_image_seq = dataset[idx]

    # Add a leading axis (presumably the batch dimension the model expects)
    # and move both tensors to the GPU.
    audio1 = audio1[None, ...].to('cuda')
    ref_image_seq = ref_image_seq[None, ...].to('cuda')

    # Show the reference image stored with the track first.
    print(idx, 'ref')
    show(ref_image_seq)

    for _ in range(samples_per_track):
        # Generation is inference only, so skip autograd bookkeeping.
        with torch.no_grad():
            image_seq = model.generate_image_seq(model.calc_audio_emb(audio1))
        show(image_seq)

In [None]:
from IPython.display import Audio

# Pick one track, print where its audio lives plus its metadata, and embed an
# audio player as the cell output.
track_idx = 126
selected_track = dataset.tracks[track_idx]
audio_root = os.path.join(DATA_DIR, 'audio')
path = scdata.get_audio_path(audio_root, selected_track['id'])
print(path, selected_track, sep='\n')
Audio(path)

In [None]:
# Load the selected track's features as a float tensor and normalise them with
# the dataset's stored mean and inverse standard deviation.
raw_features = torch.tensor(dataset.load_features(track_idx), dtype=torch.float)
mel = (raw_features - dataset.mfcc_mean) * dataset.mfcc_std_inv

# Report the shape, the frame count divided by 21, and the model's audio
# sequence length, one per line.
print(mel.shape, mel.shape[0] / 21, model.audio_seq_len, sep='\n')

In [None]:
from IPython.display import clear_output
import time

from aural_travels import generate

# Frame rate used to turn a frame index into seconds of video.
# NOTE(review): presumably 5 keyframes/s * 12 interframes -- confirm against
# generate.interpolate before changing either value.
OUTPUT_FPS = 60
# Display and save every FRAME_STRIDE-th frame; 1 keeps every frame.
FRAME_STRIDE = 1

frames_dir = os.path.join(MODEL_DIR, 'gen', 'frames')
os.makedirs(frames_dir, exist_ok=True)

keyframes = generate.keyframes(model, mel, fps=5.0, top_k=1, device='cuda')
frames = generate.interpolate(model, keyframes, interframes=12)

# perf_counter is monotonic, so the elapsed time cannot jump or go negative
# when the system clock is adjusted.
start_time = time.perf_counter()

for i, frame in enumerate(frames):
    image = image_repr.tensor_to_image(frame)
    if i % FRAME_STRIDE == 0:
        video_secs = i / OUTPUT_FPS
        elapsed = time.perf_counter() - start_time
        # Seconds of video rendered per second of wall time; guard against a
        # zero-length interval on the very first frame.
        realtime_factor = video_secs / elapsed if elapsed > 0 else 0.0

        clear_output(wait=True)
        print(i, video_secs, realtime_factor)
        display(image)
        image.save(os.path.join(frames_dir, f'{i}.png'))

In [None]:
# Display the loaded model's repr as this cell's output.
model