In [None]:
# imports

import torchaudio
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb, rgb_to_hsv
from IPython.display import Audio
from tqdm import tqdm
import os
from PIL import Image

# run on the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

# CPPN implementation

Notes:
- increasing the number of layers increases visual complexity and sharpness
- increasing the hidden size also increases complexity/sharpness

In [None]:
# CPPN network

def CPPN(hidden_size=32, latent_size=20, num_layers=5):
    '''
    Build a CPPN: a fully connected network that maps one input row per pixel to 3 colour channels.

    Args:
        hidden_size: width of every tanh layer.
        latent_size: number of latent values per input row; the first layer takes
            3 + latent_size features.
        num_layers: number of Linear+Tanh layers in front of the output layer.
            The default of 5 gives the same architecture as the previously fixed network;
            larger values add more hidden layers.

    Returns:
        An nn.Sequential moved to the module-level `device`. It ends in
        Linear(hidden_size, 3) followed by Sigmoid, so every output value lies in [0, 1].

    Raises:
        ValueError: if num_layers is smaller than 1.
    '''
    if num_layers < 1:
        raise ValueError('num_layers must be at least 1')

    layers = [nn.Linear(3 + latent_size, hidden_size), nn.Tanh()]
    for _ in range(num_layers - 1):
        layers += [nn.Linear(hidden_size, hidden_size), nn.Tanh()]
    layers += [nn.Linear(hidden_size, 3), nn.Sigmoid()]
    model = nn.Sequential(*layers).to(device)

    # re-initialize the weights of all linear layers with a standard Gaussian distribution
    # (biases keep PyTorch's default initialization); layers are picked by type so the loop
    # does not depend on where the activations sit in the Sequential
    with torch.no_grad():
        for layer in model:
            if isinstance(layer, nn.Linear):
                layer.weight.copy_(torch.randn_like(layer.weight))

    return model

# sanity check: build a CPPN with the default sizes and print its layer structure
model = CPPN()
print(model)

In [None]:
# helper function to generate inputs to CPPN

def gen_cppn_input(batch_size=1, x_dim=32, y_dim=32, scale=1.0, latents=None, latent_scale=1.0, use_radius=False):
    '''
    Build the per-pixel input rows for the CPPN.

    Every pixel of an x_dim-by-y_dim image becomes one row [x, y, r, latent_0, latent_1, ...].
    x and y are centred coordinates running from -scale to +scale along each axis, and the
    same latent vector is repeated for every pixel of a batch element.

    Args:
        batch_size: number of images (one latent vector each).
        x_dim, y_dim: image width and height in pixels.
        scale: half-extent of the coordinate range.
        latents: array or tensor of shape (batch_size, latent_size). It is never modified;
            the scaled values are written to a new array. None means "no latent inputs"
            (latent_size 0).
        latent_scale: factor applied to the latents before they are fed to the network.
        use_radius: when True the third column holds each pixel's distance from the image
            centre; when False (default) it is all zeros.

    Returns:
        float32 tensor on the module-level `device` with shape
        (batch_size, x_dim * y_dim, 3 + latent_size); pixels are in row-major order.
    '''
    def centred_axis(n):
        # n evenly spaced coordinates from -scale to +scale; a single pixel sits at the
        # centre (the general formula would divide by zero for n == 1)
        if n == 1:
            return np.zeros(1)
        return scale*(np.arange(n)-(n-1)/2.0)/(n-1)/0.5

    n_points = x_dim * y_dim
    x_mat, y_mat = np.meshgrid(centred_axis(x_dim), centred_axis(y_dim))  # both (y_dim, x_dim)
    if use_radius:
        r_mat = np.sqrt(x_mat*x_mat + y_mat*y_mat)
    else:
        r_mat = np.zeros_like(x_mat)

    # (n_points, 3) coordinate table, shared by every batch element
    coords = np.stack([x_mat.ravel(), y_mat.ravel(), r_mat.ravel()], axis=1)
    coords = np.broadcast_to(coords, (batch_size, n_points, 3))

    if latents is None:
        latents = np.zeros((batch_size, 0))
    elif torch.is_tensor(latents):
        latents = latents.detach().cpu().numpy()
    # multiply into a NEW array: callers pass views of their data (e.g. slices of the
    # spectrogram), and an in-place `*=` would rescale that data on every call
    latents = np.asarray(latents) * latent_scale

    assert len(latents) == batch_size, 'latents must contain one row per batch element'
    latent_size = latents.shape[1]
    latents_tiled = np.broadcast_to(latents[:, np.newaxis, :], (batch_size, n_points, latent_size))

    cppn_input = np.concatenate([coords, latents_tiled], axis=2)
    return torch.tensor(cppn_input, dtype=torch.float32).to(device)

In [None]:
def load_training_image(img_path, latent_size, coord_scale=1, resolution=200, use_hsv=False):
    '''
    Load a reference image and turn it into a (CPPN input, target colours) training pair.

    The image is converted to RGB and resized to a height of `resolution` pixels while keeping
    its aspect ratio; pixel values are scaled to [0, 1] and optionally converted to HSV.

    Args:
        img_path: path of the image file.
        latent_size: length of the (all-zero) latent vector attached to every pixel.
        coord_scale: coordinate scale forwarded to gen_cppn_input.
        resolution: height of the resized image in pixels.
        use_hsv: store the targets in HSV instead of RGB.

    Returns:
        (input, labels): `input` is the first (only) batch element produced by gen_cppn_input
        for the resized image; `labels` is a float tensor of shape (width * height, 3) holding
        the pixel colours in the same row-major order.
    '''
    picture = Image.open(img_path).convert('RGB')
    src_width, src_height = picture.size
    target_width = int(resolution * (src_width / src_height))
    picture = picture.resize((target_width, resolution))

    pixels = np.array(picture) / 255.
    if use_hsv:
        pixels = rgb_to_hsv(pixels)

    height, width = pixels.shape[0], pixels.shape[1]
    zero_latents = np.zeros((1, latent_size))
    cppn_input = gen_cppn_input(batch_size=1, x_dim=width, y_dim=height, scale=coord_scale, latents=zero_latents, latent_scale=1)[0]
    targets = torch.tensor(pixels, dtype=torch.float).reshape((width * height, 3))

    return cppn_input, targets


# Audio I/O and source separation


In [None]:
# download audio file from youtube
# to use your own audio file, upload it as /content/audio.wav
url = "https://www.youtube.com/watch?v=BtvJaNeELic"

# install the downloader, fetch the m4a audio stream of {url} (a python variable) as audio.m4a,
# then convert it to audio.wav with ffmpeg
!pip install youtube-dl
!youtube-dl -f "bestaudio[ext=m4a]" {url} --output "audio.m4a"
!ffmpeg -i audio.m4a audio.wav -loglevel 0

In [None]:
# separate into vocals+accompaniment
# IMPORTANT: only works on CPU (if using GPU on the notebook, perform source separation on a separate CPU instance then upload the separated audio files)
# NOTE(review): the following cells read 'vocals.wav' / 'accompaniment.wav' from the working
# directory - confirm where spleeter writes its output and move the files there if needed
if device == "cpu":
    !pip install spleeter
    !spleeter separate audio.wav
else:
    print("Please use a CPU machine to perform source separation")
    

# Feature extraction 

In [None]:
# audio parameters
audio_path = 'vocals.wav' # REPLACE with 'accompaniment.wav' for the accompaniment
target_fps = 30 # number of frames per second for output video
latent_size = 16 # frequency resolution (number of mel bins) for audio frame vectors

# load audio file (only the first channel is used)
audio, sr = torchaudio.load(audio_path, normalize=True)
audio = audio[0]
print('sample rate', sr)
print('audio shape', audio.shape)

# feature extraction with a Mel spectrogram
# the hop length is one video frame worth of samples, so each spectrogram row drives one output frame
hop_length = int(sr / target_fps)
transform = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=hop_length*2, n_mels=latent_size, hop_length=hop_length, win_length=hop_length * 2)
spectrogram = transform(audio)
spectrogram = spectrogram.T # one row per frame, one column per mel bin
# normalize to range [0,1]; a completely silent track has a peak of 0 and is left as all zeros
# instead of being turned into NaNs by a division by zero
peak = torch.max(spectrogram)
if peak > 0:
    spectrogram = spectrogram / peak
print('spectrogram shape', spectrogram.shape)

# visualize spectrogram (first 200 frames)
plt.figure(figsize=(15, 4))
plt.imshow(spectrogram.T[:, 0:200])

# play audio
# Audio(data=audio, rate=sr)

# Generate Output Video

Run the next cell repeatedly until you get a good-looking base image: the CPPN weights are re-initialized randomly on every run.

In [None]:
# output video parameters
x_dim = 400
y_dim = 400

# cppn parameters
coord_scale = 0.2
latent_scale = 0.5
use_hsv = False # whether to use the HSV color space, usually True for abstract art and False for using training images

# initialize CPPN (a fresh random network on every run of this cell)
model = CPPN(hidden_size=32, latent_size=latent_size)

# generate test image with zero latent vector
zero_latents = np.zeros([1, latent_size])
input = gen_cppn_input(batch_size=1, x_dim=x_dim, y_dim=y_dim, scale=coord_scale, latents=zero_latents, latent_scale=latent_scale)
output = model(input)
output_img = output.detach().cpu().numpy().reshape(y_dim, x_dim, 3)
if use_hsv:
    output_img = hsv_to_rgb(output_img)
plt.imshow(output_img)
print('base image')

In [None]:
# train CPPN to match a reference image
from IPython.display import clear_output

image = 'vocals.png' # REPLACE with 'accompaniment.png' for the accompaniment image
train_resolution = 200 # height in pixels the training image is resized to

criterion = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

inputs, labels = load_training_image(image, latent_size, coord_scale=coord_scale, resolution=train_resolution, use_hsv=use_hsv)
inputs = inputs.to(device)
labels = labels.to(device)

# the training image keeps its aspect ratio, so it is only square for square source images:
# derive the preview width from the number of pixels instead of assuming 200x200
train_height = train_resolution
train_width = labels.shape[0] // train_height

n_epochs = 1000 # don't set this too high, otherwise model will overfit and latent vector will not change anything
prev_loss = 0.0
model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    output = model(inputs)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0: # display loss and show image every 100 epochs
        # .item() gives a plain float: no tensor repr in the log and no reference to the
        # autograd graph kept alive in prev_loss
        loss_value = loss.item()
        print(f'epoch {epoch}: loss={loss_value}, diff={loss_value - prev_loss}')
        prev_loss = loss_value
        plt.figure(1, figsize=(10, 5))
        output_img = torch.reshape(output, [train_height, train_width, 3]).cpu().detach().numpy()
        if use_hsv:
            output_img = hsv_to_rgb(output_img)
        plt.imshow(output_img)
        plt.show()
        clear_output(wait=True)
print('Finished Training')

# render the trained network at the output video resolution with a zero latent vector
input = gen_cppn_input(batch_size=1, x_dim=x_dim, y_dim=y_dim, scale=coord_scale, latents=np.zeros([1, latent_size]), latent_scale = latent_scale)
with torch.no_grad(): # inference only, no autograd graph needed
    output = model(input)
output_img = torch.reshape(output, [y_dim, x_dim, 3]).cpu().detach().numpy()
if use_hsv:
    output_img = hsv_to_rgb(output_img)
plt.imshow(output_img)
print('After Training')

In [None]:
# generate some test frames
num_frames = 8
latent_scale = 0.5

offset = 100
frame_indices = np.random.choice(spectrogram.shape[0], num_frames, replace=False)
frame_sets = [
    # sequential frames to check smoothness; .clone() because a plain slice is a view, and the
    # spectrogram must stay unchanged even if the latents get scaled in place
    ('sequential frames (above)', spectrogram[offset:offset + num_frames, :].clone()),
    # random frames to check variation (indexing with an index array already returns a copy)
    ('random frames (below)', spectrogram[frame_indices, :]),
]

# render each set of frames as one row of images
for label, frames in frame_sets:
    print(label)
    input = gen_cppn_input(batch_size=num_frames, x_dim=x_dim, y_dim=y_dim, scale=coord_scale, latents=frames, latent_scale = latent_scale)
    with torch.no_grad(): # inference only: avoids holding an autograd graph for num_frames full images
        output = model(input)
    output_imgs = torch.reshape(output, [num_frames, y_dim, x_dim, 3]).cpu().detach().numpy()

    fig, axes = plt.subplots(1, num_frames, figsize=(20, 10))
    for i, ax in enumerate(axes):
        if use_hsv:
            ax.imshow(hsv_to_rgb(output_imgs[i]))
        else:
            ax.imshow(output_imgs[i])

In [None]:
# generate all frames
output_dir = '/content/img/'
%rm -r /content/img/
os.makedirs(output_dir, exist_ok=True)

print('generating frames')
for i, embedding in tqdm(enumerate(spectrogram), total=len(spectrogram)):
    latents = embedding[np.newaxis, :]
    input = gen_cppn_input(batch_size=1, x_dim=x_dim, y_dim=y_dim, scale=coord_scale, latents=latents, latent_scale=latent_scale)
    output = model(input)
    img = output[0].cpu().detach().numpy().reshape([y_dim, x_dim, 3])
    if use_hsv:
        img = hsv_to_rgb(img)
    plt.imsave(f'/content/img/{i:04d}.png', img)

# this cell can be stopped after a desired number of frames have been generated

In [None]:
# generate video from image sequence
# values in {brackets} are python variables
print('generating output video')
# encode the numbered PNG frames in /content/img/ into temp.mp4 at target_fps frames per second
!ffmpeg -r {target_fps} -f image2 -s {x_dim}x{y_dim} -i /content/img/%04d.png -vcodec libx264 -crf 25 -pix_fmt yuv420p -loglevel 0 -nostats temp.mp4

# add audio track
print('adding audio track')
# delete the result of a previous run first
%rm vocals.mp4
# keep the video stream of temp.mp4 as it is (-c:v copy) and take the audio stream from vocals.wav
!ffmpeg -i temp.mp4 -i "vocals.wav" -c:v copy -map 0:v:0 -map 1:a:0 -loglevel error vocals.mp4
# REPLACE for accompaniment with:
# %rm accompaniment.mp4
# !ffmpeg -i temp.mp4 -i "accompaniment.wav" -c:v copy -map 0:v:0 -map 1:a:0 -loglevel error accompaniment.mp4


# remove temporary files (the silent video and all generated frames)
%rm temp.mp4
%rm -r /content/img/


In [None]:
# combine the vocals and accompaniment videos
# to generate the video for the accompaniment, rerun all the cells after the section "Feature extraction" 
# while making the replacements in the comments "REPLACE"
# place the two videos side by side (hstack), then add the original, unseparated audio.wav as the audio track
!ffmpeg -i vocals.mp4 -i accompaniment.mp4 -filter_complex hstack temp.mp4 -y
!ffmpeg -i temp.mp4 -i "audio.wav" -c:v copy -map 0:v:0 -map 1:a:0 -loglevel error output.mp4 -y
%rm temp.mp4
print('saved output video with audio to output.mp4')
# download and play output.mp4 to see the results