In [23]:
# modified from https://github.com/SherlockLiao/Deep-Dream/blob/master/show_image.ipynb
import torch
import torchvision.models as models
from torchvision import transforms
from PIL import Image
from deepDream import dream
import torch.nn as nn
import spectroUtils as sUtils
import IPython.display as ipd
import librosa
import librosa.display
import numpy as np

In [24]:
# Original hip-hop audio clip
trackId = 2
spectroSize = 224
# Amplitude bounds handed to sUtils.loadSpectro / sUtils.unscaleSpectro further down
minAmp, maxAmp = -80.0, 1.9073486e-06
audioDirectory = "data/fma_small/"
audioFilename = audioDirectory + "000/000002.mp3"
# sr=None keeps the file's own sample rate; only the first 10 seconds are read, as mono
x, sr = librosa.load(audioFilename, sr=None, mono=True, offset=0, duration=10)
# Playback window in seconds. It is wider than the 10 s that were loaded, so the
# slice below simply stops at the end of the available samples.
start, end = 0, 30
clip = x[start * sr:end * sr]
ipd.Audio(data=clip, rate=sr)

In [25]:
# Load the scaled spectrogram for the track, together with the phase and the
# original spectrogram size that the audio reconstruction cell needs later on.
spectroTensor, phase, origSpectroSize = sUtils.loadSpectro(
    trackId, spectroSize, minAmp, maxAmp
)
# Downstream cells expect a NumPy array under the name `scaledSpectro`
scaledSpectro = spectroTensor.data.numpy()
scaledSpectro

array([[[130., 171., 179., ..., 210., 175., 180.],
        [172., 186., 190., ..., 208., 196., 186.],
        [194., 199., 198., ..., 209., 208., 213.],
        ...,
        [ 53.,  51.,  63., ...,  66.,  45., 106.],
        [ 41.,  46.,  71., ...,  65.,  59., 106.],
        [ 53.,  62.,  68., ...,  63.,  56., 107.]]])

In [26]:
# Instantiate a ResNet-18, wrap it in DataParallel, and display the architecture
resnet18 = models.resnet18()
model = torch.nn.DataParallel(resnet18)
model

DataParallel (
  (module): ResNet (
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
    (relu): ReLU (inplace)
    (maxpool): MaxPool2d (size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
    (layer1): Sequential (
      (0): BasicBlock (
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        (relu): ReLU (inplace)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      )
      (1): BasicBlock (
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        (relu): ReLU (inplace)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride

In [27]:
# Restore the saved weights into the DataParallel-wrapped model and report
# which epoch / accuracy the checkpoint corresponds to.
checkpoint_path = 'resnet18/checkpoints/model_best.pth.tar'
print("=> loading checkpoint '{}'".format(checkpoint_path))
# Without map_location, a checkpoint written from a GPU cannot be deserialised on
# a CPU-only machine. Remap to CPU only when CUDA is unavailable so the default
# behaviour on GPU machines is unchanged.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
map_location = None if torch.cuda.is_available() else 'cpu'
checkpoint = torch.load(checkpoint_path, map_location=map_location)
start_epoch = checkpoint['epoch']
best_prec1 = checkpoint['best_prec1']
model.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (epoch: {}) (best_acc: {})"
        .format(checkpoint_path, start_epoch, best_prec1))

=> loading checkpoint 'resnet18/checkpoints/model_best.pth.tar'
=> loaded checkpoint 'resnet18/checkpoints/model_best.pth.tar' (epoch: 57) (best_acc: 50.57766367627047)


In [28]:
# Keep every direct child of `model` except the last three, then display the result.
# NOTE(review): `model` is the DataParallel wrapper here, and its only direct child
# is the wrapped ResNet (`module`). The slice is therefore empty and the displayed
# Sequential contains no layers, i.e. it passes its input through unchanged.
# Iterating `model.module.children()` was probably intended -- confirm, and check
# the cells that consume `model` (they currently rely on the pass-through shape)
# before changing it.
direct_children = list(model.children())
model = nn.Sequential(*direct_children[:-3])
model

Sequential (
)

In [29]:
# Run the dream on the scaled spectrogram and keep the first entry along the
# leading axis of what it returns.
dreamSpectro = dream(model, scaledSpectro)[0, :]
print("dreamSpectro")
print(dreamSpectro)

(1, 41, 41)
(1, 58, 58)
(1, 81, 81)
(1, 114, 114)
(1, 160, 160)
(1, 224, 224)
out
[[[132.24579  173.63672  181.85242  ... 213.31937  178.16397  183.10974 ]
  [174.62457  188.84093  192.99095  ... 211.38005  199.30893  189.2272  ]
  [196.8287   201.94756  201.04967  ... 212.42455  211.39188  216.37177 ]
  ...
  [ 53.893715  51.938698  64.010376 ...  67.299095  46.402164 107.83752 ]
  [ 41.85858   46.927654  72.03723  ...  66.291794  60.43766  107.84054 ]
  [ 53.91568   62.997093  69.041046 ...  64.27949   57.43031  108.848755]]]
dreamSpectro
[[132.24579  173.63672  181.85242  ... 213.31937  178.16397  183.10974 ]
 [174.62457  188.84093  192.99095  ... 211.38005  199.30893  189.2272  ]
 [196.8287   201.94756  201.04967  ... 212.42455  211.39188  216.37177 ]
 ...
 [ 53.893715  51.938698  64.010376 ...  67.299095  46.402164 107.83752 ]
 [ 41.85858   46.927654  72.03723  ...  66.291794  60.43766  107.84054 ]
 [ 53.91568   62.997093  69.041046 ...  64.27949   57.43031  108.848755]]




In [30]:
# Undo the scaling (back to the original spectrogram size), then multiply by the
# saved phase to rebuild the complex spectrogram.
backToSpectro = sUtils.unscaleSpectro(
    dreamSpectro, origSpectroSize, minAmp, maxAmp
) * phase

# Invert the complex spectrogram to a waveform as long as the loaded clip
n = len(x)
y_hat = librosa.istft(backToSpectro, length=n)
dreamClip = y_hat[start * sr:end * sr]
ipd.Audio(data=dreamClip, rate=sr)

## Control The Dream

In [31]:
# Second track: its spectrogram is used below to steer the dream
otherTrackId = 690
otherSpectroTensor, otherPhase, otherOrigSpectroSize = sUtils.loadSpectro(
    otherTrackId, spectroSize, minAmp, maxAmp
)
# Same conversion as for the first track: keep a NumPy array under `otherSpectro`
otherSpectro = otherSpectroTensor.data.numpy()

In [32]:
# Rebuild the model from scratch for the guided dream: instantiate ResNet-18,
# wrap it in DataParallel, restore the checkpoint, then truncate.
model = models.resnet18()
model = torch.nn.DataParallel(model)
checkpoint_path = 'resnet18/checkpoints/model_best.pth.tar'
print("=> loading checkpoint '{}'".format(checkpoint_path))
# Without map_location, a checkpoint written from a GPU cannot be deserialised on
# a CPU-only machine. Remap to CPU only when CUDA is unavailable so the default
# behaviour on GPU machines is unchanged.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
map_location = None if torch.cuda.is_available() else 'cpu'
checkpoint = torch.load(checkpoint_path, map_location=map_location)
start_epoch = checkpoint['epoch']
best_prec1 = checkpoint['best_prec1']
model.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (epoch: {}) (best_acc: {})"
        .format(checkpoint_path, start_epoch, best_prec1))
# NOTE(review): `model` is the DataParallel wrapper, whose only direct child is the
# wrapped ResNet, so dropping the last three children leaves an empty Sequential
# that passes its input through unchanged. `model.module.children()` was probably
# intended -- confirm, and check the cells that consume `model` before changing it.
model = nn.Sequential(*list(model.children())[:-3])

=> loading checkpoint 'resnet18/checkpoints/model_best.pth.tar'
=> loaded checkpoint 'resnet18/checkpoints/model_best.pth.tar' (epoch: 57) (best_acc: 50.57766367627047)


In [33]:
# Compute the guide features from the other track's spectrogram.
# Call the module itself rather than `.forward`: `model(...)` runs any registered
# hooks, whereas calling `forward` directly silently skips them.
# NOTE(review): `otherSpectro` is a NumPy array at this point; that is only
# accepted if `model` has no layers to apply -- confirm before changing the model.
control_features = model(otherSpectro)

In [38]:
def objective_guide(dst, guide_features):
    """Select, for every value in ``dst``, the guide value that matches it best.

    Both inputs are flattened to a single row. For each element of ``dst`` the
    products with every guide value are computed, and the guide value giving the
    largest product is chosen.

    Args:
        dst: object exposing ``.data`` whose first entry is a 2-D tensor
            (its shape is unpacked as ``w, h``).
        guide_features: array-like of guide values; flattened to one row.

    Returns:
        A ``torch.Tensor`` of shape ``(1, 1, w, h)`` holding the selected guide
        values, moved to CUDA when CUDA is available.
    """
    x = dst.data[0].cpu().numpy().copy()
    w, h = x.shape
    ch = 1
    x = x.reshape(ch, -1)
    y = np.asarray(guide_features).reshape(ch, -1)
    # (w*h, n_guide) matrix of products between each dst value and each guide value
    A = x.T.dot(y)
    # For every dst position keep the guide value with the largest product
    result = y[:, A.argmax(1)]
    # Builtin `float` (float64) replaces the `np.float` alias, which was removed
    # in NumPy 1.24 and raises AttributeError there.
    result = torch.Tensor(np.array([result.reshape(ch, w, h)], dtype=float))
    if torch.cuda.is_available():
        result = result.cuda()
    return result

In [39]:
# Dream on the first track's spectrogram again, this time passing the other
# track's features as the control and objective_guide as the distance function.
hopRockDreamSpectro = dream(
    model,
    scaledSpectro,
    control=control_features,
    distance=objective_guide,
)
print(hopRockDreamSpectro)



(1, 41, 41)


  img_variable.data.add_(img_variable.grad.data * learning_rate_use)


(1, 58, 58)
(1, 81, 81)
(1, 114, 114)
(1, 160, 160)
(1, 224, 224)
out
[[[132.40051  173.40053  181.40053  ... 212.40051  177.40051  182.40051 ]
  [174.40051  188.40051  192.40051  ... 210.40051  198.40051  188.40051 ]
  [196.40051  201.40051  200.40051  ... 211.40051  210.40051  215.40051 ]
  ...
  [ 55.400055  53.40005   65.39994  ...  68.399796  47.399815 108.3996  ]
  [ 43.400055  48.400032  73.39993  ...  67.39981   61.399826 108.3996  ]
  [ 55.400055  64.39991   70.39991  ...  65.39982   58.399826 109.3996  ]]]
[[[132.40051  173.40053  181.40053  ... 212.40051  177.40051  182.40051 ]
  [174.40051  188.40051  192.40051  ... 210.40051  198.40051  188.40051 ]
  [196.40051  201.40051  200.40051  ... 211.40051  210.40051  215.40051 ]
  ...
  [ 55.400055  53.40005   65.39994  ...  68.399796  47.399815 108.3996  ]
  [ 43.400055  48.400032  73.39993  ...  67.39981   61.399826 108.3996  ]
  [ 55.400055  64.39991   70.39991  ...  65.39982   58.399826 109.3996  ]]]


In [41]:
# Drop the leading axis of the guided dream result. The dream call returns a
# 3-D array; guarding on ndim makes this cell safe to re-run, otherwise a second
# run would slice the already 2-D array down to a single row.
if hopRockDreamSpectro.ndim == 3:
    hopRockDreamSpectro = hopRockDreamSpectro[0, :]
# Undo the scaling, back to the original spectrogram size
backToSpectro = sUtils.unscaleSpectro(hopRockDreamSpectro, origSpectroSize, minAmp, maxAmp)
# Multiply by the first track's saved phase to rebuild the complex spectrogram
backToSpectro = backToSpectro*phase

# Invert the complex spectrogram to a waveform as long as the loaded clip
n = len(x)
y_hat = librosa.istft(backToSpectro, length = n)
ipd.Audio(data=y_hat[start*sr:end*sr], rate=sr)