# DLSM Architecture Translation into TensorFlow - Attempt

## Utility Imports

In [1]:
!wget -O shapedata.py https://raw.githubusercontent.com/interactive-intelligence/emergent-lang/main/shapedata.py
import shapedata
import importlib
importlib.reload(shapedata)

!wget -O analyzeutil.py https://raw.githubusercontent.com/interactive-intelligence/emergent-lang/main/analyzeutil.py
import analyzeutil
importlib.reload(analyzeutil)

## Library Imports

In [2]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import layers as L
import tensorflow_probability as tfp
import tensorflow as tf
import tensorflow
import numpy as np

## Model Definition

In [15]:
class VectorQuantizer(layers.Layer):
    """Keras layer that snaps each input vector to its nearest codebook entry.

    The codebook is stored as a trainable variable of shape
    ``(embedding_dim, num_embeddings)``, i.e. one code per column. Every call
    registers the sum of the commitment and codebook losses via ``add_loss``
    and returns the quantized tensor with a straight-through gradient.

    Args:
        num_embeddings: number of codes (columns) in the codebook.
        embedding_dim: length of each code; the last axis of the input must
            have this size because the input is reshaped to
            ``[-1, embedding_dim]``.
        beta: weight applied to the commitment loss. The original author's
            note says it is best kept within [0.25, 2] as per the paper.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, num_embeddings, embedding_dim, beta=0.25, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.beta = beta

        # Codebook to quantize against, drawn from the default uniform
        # initializer.
        codebook_init = tf.random_uniform_initializer()
        codebook_shape = (self.embedding_dim, self.num_embeddings)
        self.embeddings = tf.Variable(
            initial_value=codebook_init(shape=codebook_shape, dtype="float32"),
            trainable=True,
            name="embeddings_vqvae",
        )

    def call(self, x):
        """Quantize ``x`` and return a tensor with the same shape as ``x``."""
        # Remember the incoming shape, then collapse every leading axis so
        # that each row is a single `embedding_dim`-sized vector.
        original_shape = tf.shape(x)
        flat_vectors = tf.reshape(x, [-1, self.embedding_dim])

        # Look up the nearest code for every row; the one-hot matmul against
        # the transposed codebook selects the matching code vectors.
        nearest_codes = self.get_code_indices(flat_vectors)
        one_hot_codes = tf.one_hot(nearest_codes, self.num_embeddings)
        selected = tf.matmul(one_hot_codes, self.embeddings, transpose_b=True)
        quantized = tf.reshape(selected, original_shape)

        # The commitment term pulls the input toward its (frozen) code; the
        # codebook term pulls the code toward the (frozen) input. See
        # https://keras.io/guides/making_new_layers_and_models_via_subclassing/
        # for how layer-level losses are collected.
        commitment_loss = self.beta * tf.reduce_mean(
            (tf.stop_gradient(quantized) - x) ** 2
        )
        codebook_loss = tf.reduce_mean((quantized - tf.stop_gradient(x)) ** 2)
        self.add_loss(commitment_loss + codebook_loss)

        # Straight-through estimator: the forward value is the quantized
        # tensor, while the gradient flows to `x` as if this were identity.
        return x + tf.stop_gradient(quantized - x)

    def get_code_indices(self, flattened_inputs):
        """Return, for each row of ``flattened_inputs``, the nearest code index.

        Distances are squared Euclidean, expanded as
        ``|x|^2 + |e|^2 - 2 * x.e`` so that one matmul covers all pairs.
        """
        dot_products = tf.matmul(flattened_inputs, self.embeddings)
        input_sq_norms = tf.reduce_sum(
            flattened_inputs ** 2, axis=1, keepdims=True
        )
        code_sq_norms = tf.reduce_sum(self.embeddings ** 2, axis=0)
        distances = input_sq_norms + code_sq_norms - 2 * dot_products

        # Index of the smallest distance along the codebook axis.
        return tf.argmin(distances, axis=1)



class DLSM(tf.keras.Model):
    '''
    Speaker/listener model built from a shared vision module, a GRU speaker,
    a vector quantizer and a GRU listener.

    `call` splits the incoming batch into two halves (A and B). Each half is
    encoded by the vision module, "spoken" by the speaker, quantized, and then
    "listened to" with the *other* half's vision vector as the listener's
    initial state. The listener emits one sigmoid value per pair.
    '''

    def __init__(self,
                 inp_shape,
                 seq_len=16,
                 vocab_size=32,
                 recurrent_units=32,
                 batch_size=32):
        '''
        Args:
            inp_shape: the dimension of the image in three-element tuple form.
            seq_len: number of vectors to form a language sequence.
            vocab_size: number of unique vectors in the vector quantizer.
            recurrent_units: number of GRU units; also used as the quantizer's
                embedding dimension.
            batch_size: number of samples per half-batch. The initial speech
                variable is allocated with this leading dimension, so each
                half passed to `call` is expected to contain exactly this
                many samples.
        '''

        super().__init__()

        # structural params
        self.batch_size = batch_size
        self.inp_shape = inp_shape
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.recurrent_units = recurrent_units

        # model components
        self.vision_module = self.buildVisionModule(inp_shape)
        self.speaker = self.buildSpeaker((self.seq_len, 1))
        self.quantizer = VectorQuantizer(num_embeddings=self.vocab_size,
                                         embedding_dim=self.recurrent_units)
        self.listener = self.buildListener((self.seq_len, self.recurrent_units))

        # All-zero sequence fed to the speaker GRU as its input. Built with
        # tf.Variable directly so the class does not depend on a backend alias
        # (`K`) that is only imported further down the notebook. It stays
        # trainable, which is what the previous backend-created variable was.
        self.initial_speech = tf.Variable(
            tf.zeros((self.batch_size, self.seq_len, 1),
                     dtype=keras.backend.floatx()),
            trainable=True,
            name='initial_speech',
        )

    def buildVisionModule(self, inp_shape):
        '''
        Vision Module - maps images to a vector with length recurrent_units.

        Two Conv2D + LeakyReLU + MaxPooling stages, then Flatten, a ReLU Dense
        layer of `recurrent_units` outputs and a BatchNormalization layer.
        '''

        model = keras.models.Sequential(name='Vision_Module')
        model.add(L.Input(inp_shape))

        model.add(L.Conv2D(16, (5, 5), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.MaxPooling2D((2, 2)))

        model.add(L.Conv2D(16, (3, 3), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.MaxPooling2D((2, 2)))

        model.add(L.Flatten())
        model.add(L.Dense(self.recurrent_units, activation='relu'))
        model.add(L.BatchNormalization())

        return model

    def buildSpeaker(self, inp_shape):
        '''
        Speaker - takes in initial speech vector and uses image
        vector as initial hidden state. Outputs a sequence of vectors
        that are quantized.

        The returned model takes a dict with keys 'init_speech' and
        'init_state' and outputs the full GRU sequence.
        '''

        init_speech = L.Input(inp_shape)
        init_state = L.Input((self.recurrent_units,))
        gru = L.GRU(self.recurrent_units, return_sequences=True)(init_speech, initial_state=init_state)

        return keras.models.Model(inputs={'init_speech':init_speech, 'init_state':init_state},
                                  outputs=gru)

    def buildListener(self, inp_shape):
        '''
        Listener - takes in the quantized 'language' and outputs a single
        scalar probability.

        The returned model takes a list `[speech, init_state]`; the GRU's
        final output goes through a 32-unit ReLU Dense layer and a 1-unit
        sigmoid Dense layer.
        '''

        init_speech = L.Input(inp_shape)
        init_state = L.Input((self.recurrent_units,))
        gru = L.GRU(self.recurrent_units)(init_speech, initial_state=init_state)
        predense = L.Dense(32, activation='relu')(gru)
        out = L.Dense(1, activation='sigmoid')(predense)

        return keras.models.Model(inputs=[init_speech, init_state],
                                  outputs=out)

    def call(self, inputs, training=True):
        '''
        Run both halves of `inputs` through the speak/quantize/listen pipeline.

        Args:
            inputs: batch whose first half is image set A and second half is
                image set B (the two sets concatenated along axis 0).
            training: accepted for Keras compatibility; it is not forwarded
                explicitly to the sub-models.

        Returns:
            Tuple `(listened_a, listened_b)`: listener outputs for A's message
            paired with B's vision vector, and for B's message paired with
            A's vision vector.
        '''

        # split data into half, Yegor-style
        half = len(inputs) // 2
        xa, xb = inputs[:half], inputs[half:]

        # get vision vectors
        vision_a = self.vision_module(xa)
        vision_b = self.vision_module(xb)

        # obtain spoken vectors; the vision vector seeds the GRU state
        spoken_a = self.speaker({'init_speech':self.initial_speech,
                                 'init_state':vision_a})
        spoken_b = self.speaker({'init_speech':self.initial_speech,
                                 'init_state':vision_b})

        # quantize speech
        quantized_a = self.quantizer(spoken_a)
        quantized_b = self.quantizer(spoken_b)

        # obtain output vectors after listening; each message is heard by a
        # listener whose state comes from the opposite image
        listened_a = self.listener([quantized_a, vision_b])
        listened_b = self.listener([quantized_b, vision_a])

        return listened_a, listened_b

## Training

In [20]:
import keras.backend as K

# ---------------------------------------------------------------------------
# Core data parameters
# ---------------------------------------------------------------------------

# Model / batching configuration (forwarded to DLSM below).
BATCH_SIZE = 512
SEQ_LEN = 4
VOCAB_SIZE = 10
RECURRENT_UNITS = 64

# Dataset configuration (forwarded to shapedata.ShapeData below).
IMG_DIM = 64
MIN_SHAPES = 1
MAX_SHAPES = 1
SHAPE_TYPES = ['square', 'circle', 'triangle']
COLOR_TYPES = [(255,0,0), (0,255,  0), (0,0,255)]
OUTLINE = (255, 255, 255)
SHAPE_SCALE = 0.5

# Dataset object that the training loop draws batches from.
data = shapedata.ShapeData(
    batch_size=BATCH_SIZE,
    im_size=IMG_DIM,
    min_shapes=MIN_SHAPES,
    max_shapes=MAX_SHAPES,
    outline=OUTLINE,
    shape_types=SHAPE_TYPES,
    shape_colors=COLOR_TYPES,
    shape_scale=SHAPE_SCALE,
)

# Training artifacts: optimizer, loss function and the model itself.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
bce = tf.keras.losses.BinaryCrossentropy()
model = DLSM(
    (IMG_DIM, IMG_DIM, 3),  # square RGB images
    seq_len=SEQ_LEN,
    vocab_size=VOCAB_SIZE,
    recurrent_units=RECURRENT_UNITS,
    batch_size=BATCH_SIZE,
)

# train batch function
@tf.function
def train_batch(x, y):
    """Run one optimization step on a batch and return the optimized loss.

    Args:
        x: the two image sets concatenated along axis 0, as expected by
            `model`.
        y: target labels compared against both listener outputs.

    Returns:
        Scalar tensor: the average binary cross-entropy of the two listener
        outputs plus every loss the model's layers registered via `add_loss`
        during this forward pass.
    """
    with tf.GradientTape() as tape:
        outa, outb = model(x, training=True)
        loss = (bce(y, outa) + bce(y, outb)) / 2  # use avg bce as task loss
        # Losses registered with `add_loss` (the quantizer's commitment and
        # codebook terms) are only applied automatically by `fit()`. In a
        # custom loop they must be added by hand, otherwise the codebook,
        # which sits behind stop_gradient in the forward pass, gets no
        # gradient at all.
        if model.losses:
            loss = loss + tf.add_n(model.losses)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [21]:
# Hand the dataset to shapedata's demo helper.
# NOTE(review): presumably renders sample batches for a visual sanity check;
# confirm against shapedata.py.
shapedata.demo_dataset(data)

In [None]:
# `plt` is used by the plotting code below but no earlier cell imports it;
# without this import the cell raises NameError only after training finishes.
import matplotlib.pyplot as plt

# Number of training iterations. Each iteration draws and trains on ONE freshly
# generated batch, so an "epoch" here is a single batch.
NUM_EPOCHS = 1000

losses = []
for epoch in range(NUM_EPOCHS):
    # create_batch yields two (images, shapes) pairs plus the labels; only the
    # images and labels are consumed by the training step.
    (x1, x1_shapes), (x2, x2_shapes), y = data.create_batch()
    # Stack both image sets along axis 0 (the model splits them again) and
    # give the labels a trailing axis before the training step.
    loss = train_batch(np.concatenate([x1, x2]), np.expand_dims(y,1)).numpy()
    print(f'BATCH {epoch}: {loss}', end = '\r')
    losses.append(loss)

# Loss curve over all iterations.
plt.figure(figsize=(10, 5), dpi=400)
plt.plot(losses, color='red')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()
plt.close()