In [1]:
import tensorflow as tf
import numpy as np
import matplotlib as mpl
import IPython.display as display
import PIL.Image
from scipy.io import wavfile
from scipy.io.wavfile import write
import cv2

2023-05-09 19:07:53.361419: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Choose an image to dream-ify

For this tutorial, let's use an image of a [labrador](https://commons.wikimedia.org/wiki/File:YellowLabradorLooking_new.jpg).

In [29]:
# Download an image and read it into a NumPy array.
def load(filePath, segment):
  # read the stereo audio file
  _, a = wavfile.read(filePath)
  stereo_signal = np.array(a, dtype=np.float32)

  # split the stereo signal into left and right channels
  stereo_signal[:, 0] = stereo_signal[:, 0] / 32768.0
  stereo_signal[:, 1] = stereo_signal[:, 1] / 32768.0
  whole_left = stereo_signal[:, 0][segment * 309000 : ((segment + 1) * 309000)]
  whole_right = stereo_signal[:, 0][segment * 309000 : ((segment + 1) * 309000)]
  left_channel = whole_left[:103000]
  right_channel = whole_right[:103000]

  left_2 = whole_left[103000:206000]
  right_2 = whole_right[103000:206000]

  left_3 = whole_left[206000:309000]
  right_3 = whole_right[206000:309000]

  res1 = np.vstack((left_channel, right_channel))
  res1 = np.reshape(res1, (412, 500))
  res2 = np.vstack((left_2, right_2))
  res2 = np.reshape(res2, (412, 500))
  res3 = np.vstack((left_3, right_3))
  res3 = np.reshape(res3, (412, 500))
  fin = cv2.merge((res1,res2,res3))
  return fin

# Normalize an image
def deprocess(img):
  img = 255*(img + 1.0)/2.0
  return tf.cast(img, tf.uint8)

# Display an image
def show(img, seq):
  pilobj = PIL.Image.fromarray(np.array(img))
  display.display(pilobj)
  pilobj.save(str(seq) + ".jpg")


## Prepare the feature extraction model

Download and prepare a pre-trained image classification model. You will use [InceptionV3](https://keras.io/api/applications/inceptionv3/) which is similar to the model originally used in DeepDream. Note that any [pre-trained model](https://keras.io/api/applications/#available-models) will work, although you will have to adjust the layer names below if you change this.

In [3]:
base_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')

2023-05-09 19:07:54.912769: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-09 19:07:54.937026: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


The idea in DeepDream is to choose a layer (or layers) and maximize the "loss" in a way that the image increasingly "excites" the layers. The complexity of the features incorporated depends on layers chosen by you, i.e, lower layers produce strokes or simple patterns, while deeper layers give sophisticated features in images, or even whole objects.

The InceptionV3 architecture is quite large (for a graph of the model architecture see TensorFlow's [research repo](https://github.com/tensorflow/models/tree/master/research/slim)). For DeepDream, the layers of  interest are those where the convolutions are concatenated. There are 11 of these layers in InceptionV3, named 'mixed0' though 'mixed10'. Using different layers will result in different dream-like images. Deeper layers respond to higher-level features (such as eyes and faces), while earlier layers respond to simpler features (such as edges, shapes, and textures). Feel free to experiment with the layers selected below, but keep in mind that deeper layers (those with a higher index) will take longer to train on since the gradient computation is deeper.

In [4]:
# Maximize the activations of these layers
names = ['mixed3', 'mixed5']
layers = [base_model.get_layer(name).output for name in names]

# Create the feature extraction model
dream_model = tf.keras.Model(inputs=base_model.input, outputs=layers)

## Calculate loss

The loss is the sum of the activations in the chosen layers. The loss is normalized at each layer so the contribution from larger layers does not outweigh smaller layers. Normally, loss is a quantity you wish to minimize via gradient descent. In DeepDream, you will maximize this loss via gradient ascent.

In [5]:
def calc_loss(img, model):
  # Pass forward the image through the model to retrieve the activations.
  # Converts the image into a batch of size 1.
  img_batch = tf.expand_dims(img, axis=0)
  layer_activations = model(img_batch)
  if len(layer_activations) == 1:
    layer_activations = [layer_activations]

  losses = []
  for act in layer_activations:
    loss = tf.math.reduce_mean(act)
    losses.append(loss)

  return  tf.reduce_sum(losses)

## Gradient ascent

Once you have calculated the loss for the chosen layers, all that is left is to calculate the gradients with respect to the image, and add them to the original image. 

Adding the gradients to the image enhances the patterns seen by the network. At each step, you will have created an image that increasingly excites the activations of certain layers in the network.

The method that does this, below, is wrapped in a `tf.function` for performance. It uses an `input_signature` to ensure that the function is not retraced for different image sizes or `steps`/`step_size` values. See the [Concrete functions guide](../../guide/function.ipynb) for details.

In [6]:
class DeepDream(tf.Module):
  def __init__(self, model):
    self.model = model

  @tf.function(
      input_signature=(
        tf.TensorSpec(shape=[None,None,3], dtype=tf.float32),
        tf.TensorSpec(shape=[], dtype=tf.int32),
        tf.TensorSpec(shape=[], dtype=tf.float32),)
  )
  def __call__(self, img, steps, step_size):
      print("Tracing")
      loss = tf.constant(0.0)
      for n in tf.range(steps):
        with tf.GradientTape() as tape:
          # This needs gradients relative to `img`
          # `GradientTape` only watches `tf.Variable`s by default
          tape.watch(img)
          loss = calc_loss(img, self.model)

        # Calculate the gradient of the loss with respect to the pixels of the input image.
        gradients = tape.gradient(loss, img)

        # Normalize the gradients.
        gradients /= tf.math.reduce_std(gradients) + 1e-8 
        print(n, steps, gradients)
        # In gradient ascent, the "loss" is maximized so that the input image increasingly "excites" the layers.
        # You can update the image by directly adding the gradients (because they're the same shape!)
        img = img + gradients*step_size
        # img = tf.clip_by_value(img, -1, 1)

      return loss, img

In [7]:
deepdream = DeepDream(dream_model)

## Main Loop

In [22]:
def run_deep_dream_simple(img, steps=100, step_size=0.01):
  # Convert from uint8 to the range expected by the model.
  img = tf.keras.applications.inception_v3.preprocess_input(img)
  img = tf.convert_to_tensor(img)
  step_size = tf.convert_to_tensor(step_size)
  steps_remaining = steps
  step = 0
  while steps_remaining:
    if steps_remaining>100:
      run_steps = tf.constant(100)
    else:
      run_steps = tf.constant(steps_remaining)
    steps_remaining -= run_steps
    step += run_steps
  loss, img = deepdream(img, steps, tf.constant(step_size))
  vis = tf.clip_by_value(img, -1, 1)
  display.clear_output(wait=True)
  print ("Step {}, loss {}".format(step, loss))
  return img

In [31]:
# Downsizing the image makes it easier to work with.
fin = np.zeros((2, 0))
samplingRate = 48000
filePath = "rickroll.wav"
name = filePath.split(".")[0]
_, stereo_signal = wavfile.read(filePath)
iter = int(len(stereo_signal[:, 0]) / 309000) 
for i in range(0, iter):
  original_img = load(filePath, segment=i)
  show(deprocess(original_img), i)
  dream_img = run_deep_dream_simple(img=original_img, steps=1, step_size=1e-11)
  dream_img = dream_img.numpy()
  dream_img = dream_img.flatten()
  dream_img = np.reshape(dream_img, (2, 309000))
  fin = np.hstack((fin, dream_img))
fin = (fin * 32767).astype(np.int16)
write(name + "-output.wav", samplingRate, fin.T)


Step 1, loss 0.3715117573738098


### Post processing

Need to cut down the amplitude/volume because it always destroys my ears if I don't clean it up in Audacity

In [32]:
import wave, audioop
factor = 0.125
with wave.open(name + '-output.wav', 'rb') as wav:
    p = wav.getparams()
    with wave.open(name + '-output-corrected.wav', 'wb') as audio:
        audio.setparams(p)
        frames = wav.readframes(p.nframes)
        audio.writeframesraw(audioop.mul(frames, p.sampwidth, factor))