# DLSM Architecture Translation into TensorFlow - Attempt

## Utility Imports

In [1]:
import sys
sys.path.append('../input/emergentlang/emergent-lang')

import shapedata
import analyzeutil

## Library Imports

In [2]:
from tensorflow import keras
from tensorflow.keras import layers as L
import tensorflow as tf
import tensorflow
import numpy as np
import matplotlib.pyplot as plt

## Model Definition

In [57]:
class Categorical(L.Layer):
    """Layer that draws one class index per row from a probability table.

    The input is treated as probabilities: its element-wise log is handed to
    ``tf.random.categorical`` as logits, and a single sample is drawn for
    every row.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, x):
        # tf.random.categorical expects (unnormalised) log-probabilities
        log_probs = tf.math.log(x)
        return tf.random.categorical(log_probs, num_samples=1)

class VectorQuantizer(L.Layer):
    """Turn per-step symbol probabilities into sampled one-hot vectors.

    For every position of the sequence one symbol is sampled from the
    distribution over the last axis, then encoded as a one-hot vector.
    The forward value is the one-hot tensor, while the gradient is passed
    straight through to the input probabilities.

    Args:
        vocab_size: size of the last axis of the input / one-hot depth.
        seq_len: number of positions along axis 1 of the input.
    """

    def __init__(self, vocab_size, seq_len, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.categorical = Categorical()

    def call(self, x):
        # assumes x is (batch, seq_len, vocab_size) probabilities -- TODO confirm
        # Fold the sequence axis into the batch axis so a single sampling op
        # covers every position; no helper layers are built inside call().
        flat_probs = tf.reshape(x, (-1, self.vocab_size))
        sampled = self.categorical(flat_probs)
        symbols = tf.reshape(sampled, (-1, self.seq_len))

        quantized = tf.one_hot(symbols, self.vocab_size)
        # forward pass yields `quantized`; backward pass treats the op as
        # identity with respect to x
        quantized = x + tf.stop_gradient(quantized - x)
        return quantized



class DLSM(tf.keras.Model):
    """Vision module + speaker + quantizer + listener, trained end to end.

    The input batch is split into two halves (A and B). Each half is encoded
    by the vision module, the speaker turns each vision vector into a
    sequence of symbol distributions, the quantizer samples one-hot symbols,
    and the listener scores the message about one half against the vision
    vector of the other half.
    """

    def __init__(self,
                 inp_shape,           # the dimension of the image in three-element tuple form
                 seq_len=16,          # number of vectors to form a language sequence
                 vocab_size=32,       # number of unique symbols in the vector quantizer
                 recurrent_units=32,  # number of LSTM units, also the vision vector length
                 batch_size=32):      # number of samples in each half of the input batch

        super().__init__()

        # structural params
        self.batch_size = batch_size
        self.inp_shape = inp_shape
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.recurrent_units = recurrent_units

        # model components (creation order kept stable so saved weights still line up)
        self.vision_module = self.buildVisionModule(inp_shape)
        self.speaker = self.buildSpeaker((self.seq_len, 1))
        self.quantizer = VectorQuantizer(self.vocab_size, self.seq_len)
        self.listener = self.buildListener((self.seq_len, self.vocab_size))

        # misc
        # All-zero input sequence fed to the speaker. Its leading dimension is
        # fixed to batch_size, so each half of the input must have exactly
        # batch_size samples.
        # NOTE(review): as a backend variable stored on the model this is
        # presumably tracked (and trained) as a weight -- confirm.
        self.initial_speech = keras.backend.variable(
            value=np.zeros((self.batch_size, self.seq_len, 1)))

    def buildVisionModule(self, inp_shape):
        '''
        Vision Module - maps images to a vector with length recurrent_units.

        Two stacks of three Conv2D + LeakyReLU layers, each followed by 2x2
        max pooling, then Flatten, a ReLU Dense layer and batch normalization.
        '''

        model = keras.models.Sequential(name='Vision_Module')
        model.add(L.Input(inp_shape))

        model.add(L.Conv2D(16, (2, 5), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.Conv2D(16, (2, 5), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.Conv2D(16, (2, 2), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.MaxPooling2D((2, 2)))

        model.add(L.Conv2D(16, (5, 2), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.Conv2D(16, (2, 5), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.Conv2D(16, (2, 2), padding='same'))
        model.add(L.LeakyReLU())
        model.add(L.MaxPooling2D((2, 2)))

        model.add(L.Flatten())
        model.add(L.Dense(self.recurrent_units, activation='relu'))
        model.add(L.BatchNormalization())

        return model

    def buildSpeaker(self, inp_shape):
        '''
        Speaker - takes in the initial speech sequence plus two vectors that
        seed the LSTM cell and hidden states. Outputs, for every position of
        the sequence, a softmax distribution over vocab_size symbols
        (quantization happens outside this model).

        Inputs are passed as a dict with keys 'init_speech', 'init_cell'
        and 'init_hidden'.
        '''

        init_speech = L.Input(inp_shape)
        init_cell = L.Input((self.recurrent_units,))
        init_hidden = L.Input((self.recurrent_units,))

        processed_init_cell = L.Dense(self.recurrent_units, activation="relu")(init_cell)
        processed_init_hidden = L.Dense(self.recurrent_units, activation="relu")(init_hidden)

        # the same processed (hidden, cell) pair seeds the forward and the
        # backward direction of both bidirectional layers
        initial_state = [processed_init_hidden, processed_init_cell,
                         processed_init_hidden, processed_init_cell]

        lstm1 = L.Bidirectional(L.LSTM(self.recurrent_units, return_sequences=True),
                                merge_mode='sum')(init_speech, initial_state=initial_state)
        lstm2 = L.Bidirectional(L.LSTM(self.recurrent_units, return_sequences=True),
                                merge_mode='sum')(lstm1, initial_state=initial_state)

        output = L.TimeDistributed(L.Dense(self.vocab_size, activation="softmax"))(lstm2)

        return keras.models.Model(inputs={'init_speech':init_speech,
                                          'init_cell':init_cell,
                                          'init_hidden':init_hidden},
                                  outputs=output)

    def buildListener(self, inp_shape):
        '''
        Listener - takes in the quantized 'language' and outputs a single
        sigmoid value per sample.

        Inputs are passed as a list: [init_speech, init_cell, init_hidden].
        '''

        init_speech = L.Input(inp_shape)
        init_cell = L.Input((self.recurrent_units,))
        init_hidden = L.Input((self.recurrent_units,))

        processed_init_speech = L.TimeDistributed(L.Dense(self.recurrent_units))(init_speech)
        processed_init_cell = L.Dense(self.recurrent_units, activation="relu")(init_cell)
        processed_init_hidden = L.Dense(self.recurrent_units, activation="relu")(init_hidden)

        # the same processed (hidden, cell) pair seeds the forward and the
        # backward direction of both bidirectional layers
        initial_state = [processed_init_hidden, processed_init_cell,
                         processed_init_hidden, processed_init_cell]

        lstm1 = L.Bidirectional(L.LSTM(self.recurrent_units, return_sequences=True),
                                merge_mode='sum')(processed_init_speech, initial_state=initial_state)
        # second layer returns only its final output, not the full sequence
        lstm3 = L.Bidirectional(L.LSTM(self.recurrent_units),
                                merge_mode='sum')(lstm1, initial_state=initial_state)

        predense = L.Dense(32, activation='relu')(lstm3)
        out = L.Dense(1, activation='sigmoid')(predense)

        return keras.models.Model(inputs=[init_speech, init_cell, init_hidden],
                                  outputs=out)

    def _speak(self, vision):
        # Run the speaker on the shared initial speech, seeding both the
        # hidden and the cell state from the same vision vector.
        return self.speaker({'init_speech':self.initial_speech,
                             'init_hidden':vision,
                             'init_cell':vision})

    def _encode_and_speak(self, inputs):
        # Split the batch in half, encode both halves and let the speaker
        # describe each; shared by get_sequence() and call().

        # split data into half, Yegor-style
        half = len(inputs) // 2
        xa, xb = inputs[:half], inputs[half:]

        # get vision vectors
        vision_a = self.vision_module(xa)
        vision_b = self.vision_module(xb)

        # obtain spoken vectors
        spoken_a = self._speak(vision_a)
        spoken_b = self._speak(vision_b)

        return vision_a, vision_b, spoken_a, spoken_b

    def get_sequence(self, inputs):
        '''
        Returns the speaker outputs (before quantization) for the first and
        second half of `inputs` as a tuple (spoken_a, spoken_b).
        '''

        _, _, spoken_a, spoken_b = self._encode_and_speak(inputs)
        return spoken_a, spoken_b

    def call(self, inputs, training=True):
        '''
        Full forward pass.

        inputs   - batch whose first half is A and second half is B; each
                   half must contain batch_size samples.
        training - accepted for Keras API compatibility; this method does
                   not reference it directly.

        Returns (listened_a, listened_b): the listener output for the message
        about A paired with the vision vector of B, and vice versa.
        '''

        vision_a, vision_b, spoken_a, spoken_b = self._encode_and_speak(inputs)

        # quantize speech
        quantized_a = self.quantizer(spoken_a)
        quantized_b = self.quantizer(spoken_b)

        # obtain output vectors after listening; listener inputs are ordered
        # [speech, cell seed, hidden seed], so the other half's vision vector
        # seeds the cell state and fresh Gaussian noise seeds the hidden state
        listened_a = self.listener([quantized_a, vision_b, tf.random.normal((self.batch_size, self.recurrent_units,))])
        listened_b = self.listener([quantized_b, vision_a, tf.random.normal((self.batch_size, self.recurrent_units,))])

        return listened_a, listened_b

## Training

In [58]:
import tensorflow.keras.backend as K

# ---- Core data parameters ----

# model / batch dimensions
BATCH_SIZE = 512
SEQ_LEN = 3
VOCAB_SIZE = 9
RECURRENT_UNITS = 64

# image generation settings
IMG_DIM = 128
MIN_SHAPES = 1
MAX_SHAPES = 3
SHAPE_TYPES = ['square', 'circle', 'triangle']
COLOR_TYPES = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
OUTLINE = (255, 255, 255)
SHAPE_SCALE = 0.2

# dataset that produces the training batches
data = shapedata.AlecModeShapeData(
    batch_size=BATCH_SIZE,
    im_size=IMG_DIM,
    min_shapes=MIN_SHAPES,
    max_shapes=MAX_SHAPES,
    outline=OUTLINE,
    shape_types=SHAPE_TYPES,
    shape_colors=COLOR_TYPES,
    shape_scale=SHAPE_SCALE,
)

# training artifacts: optimizer, loss and the model itself
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
bce = tf.keras.losses.BinaryCrossentropy()
model = DLSM(
    (IMG_DIM, IMG_DIM, 3),
    seq_len=SEQ_LEN,
    vocab_size=VOCAB_SIZE,
    recurrent_units=RECURRENT_UNITS,
    batch_size=BATCH_SIZE,
)

# train batch function
@tf.function
def train_batch(x, y):
    """Run one optimisation step on a single batch.

    Uses the module-level ``model``, ``bce`` and ``optimizer``.

    Args:
        x: model input; ``model`` splits it into two halves internally.
        y: targets, compared against both model outputs.

    Returns:
        ``(loss, (outa, outb, y))`` where ``loss`` is the mean of the two
        binary cross-entropy terms and ``outa``/``outb`` are the two model
        outputs.
    """
    with tf.GradientTape() as tape:
        outa, outb = model(x, training=True)

        # use avg bce as loss
        loss = (bce(y, outa) + bce(y, outb)) / 2
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss, (outa, outb, y)

In [59]:
# Run shapedata's demo helper on the training dataset.
# NOTE(review): presumably displays sample batches -- demo_dataset is defined outside this file; confirm.
shapedata.demo_dataset(data)

In [60]:
from sklearn.metrics import accuracy_score as acc

NUM_EPOCHS = 2000

# one entry per processed batch
losses = []
accs = []
for epoch in range(NUM_EPOCHS):
    # each iteration trains on one batch taken from data.create_batch()
    (x1, x1_shapes), (x2, x2_shapes), y = data.create_batch()
    stacked_images = np.concatenate([x1, x2])
    column_labels = np.expand_dims(y, 1)

    loss, (outa, outb, y) = train_batch(stacked_images, column_labels)
    loss = loss.numpy()

    # accuracy of the rounded outputs, averaged over both model outputs
    acc_a = acc(np.round(outa), y)
    acc_b = acc(np.round(outb), y)
    accuracy = (acc_a + acc_b)/2

    print(f'BATCH {epoch} | Loss: {loss} | Acc: {accuracy}', end = '\r')
    losses.append(loss)
    accs.append(accuracy)
    
# plt.figure(figsize=(10, 5), dpi=400)
# plt.plot(losses, color='red')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.show()
# plt.close()

# plt.figure(figsize=(10, 5), dpi=400)
# plt.plot(accs, color='red')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.show()
# plt.close()

In [None]:
# Restore the model's weights from 'weights.h5' (the filename the save_weights cell writes to).
# NOTE(review): this cell carries no execution count, so it was apparently not run in this session.
model.load_weights('weights.h5')

```
BATCH 1999 | Loss: 0.3106675446033478 | Acc: 0.880859375
```

In [13]:
# Write the model's current weights to 'weights.h5' (the file the load_weights cell reads).
model.save_weights('weights.h5')

In [None]:
# x = np.linspace(-5, 5, 1000)
# y = np.sin(x)
# plt.plot(x, y)
# plt.title('$\sin x$')
# plt.xlabel('time')
# plt.ylabel('my iq')
# plt.show()

# x = np.linspace(-5, 5, 1000)
# y = np.sin(x)
# plt.figure(dpi=500)
# plt.plot(x, y)
# plt.title('better plot of $\sin x$')
# plt.xlabel('time')
# plt.ylabel('my iq')
# plt.show()

In [61]:
# second generator, configured with the same constants as `data`
testdata_config = dict(
    batch_size=BATCH_SIZE,
    im_size=IMG_DIM,
    min_shapes=MIN_SHAPES,
    max_shapes=MAX_SHAPES,
    outline=OUTLINE,
    shape_types=SHAPE_TYPES,
    shape_colors=COLOR_TYPES,
    shape_scale=SHAPE_SCALE,
)
testdata = shapedata.AlecModeShapeData(**testdata_config)

# one batch: two image sets with their shape annotations, plus the labels
(x1, x1_shapes), (x2, x2_shapes), y = testdata.create_batch()

In [62]:
# Speaker outputs (before quantization) for the x1 half and the x2 half of the stacked batch.
speecha, speechb = model.get_sequence(np.concatenate([x1, x2]))

In [63]:
from tqdm.notebook import tqdm

In [68]:
# sample one-hot symbols from the speaker output for the x1 half
quant = model.quantizer(speecha)

# `quant` is already one-hot along the last axis, so the symbol ids can be
# read off directly; running it through the stochastic quantizer a second
# time is unnecessary.
seqs = np.argmax(np.array(quant), axis=2)

In [69]:
# Distinct symbol sequences (unique rows of `seqs`).
np.unique(seqs, axis=0)

In [25]:
# display every x1 image whose whole symbol sequence equals [7, 7, 7]
whole_row_match = (seqs == [7, 7, 7]).all(axis=1)
for i in np.flatnonzero(whole_row_match):
    plt.imshow(x1[i])
    plt.show()

In [24]:
# NOTE(review): `seqs==[7, 7, 7]` is an element-wise comparison, so argwhere prints a
# (row, column) index pair for every individual 7 -- rows that match only partially are
# included. If whole-sequence matches were intended, reduce with .all(axis=1) first; confirm.
print(np.argwhere(seqs==[7, 7, 7]))

In [21]:
# Number of distinct symbol sequences (unique rows of `seqs`).
len(np.unique(seqs, axis=0))

In [None]:
def _show_pair(left_img, right_img, prediction, truth):
    """Plot two images side by side, titled with the prediction and the label."""
    plt.figure(figsize=(10, 5), dpi=400)
    plt.subplot(1, 2, 1)
    plt.imshow(left_img)
    plt.axis('off')
    plt.title(f'Pred: {prediction}')
    plt.subplot(1, 2, 2)
    plt.imshow(right_img)
    plt.axis('off')
    plt.title(f'Truth: {truth}')
    plt.show()
    plt.close()


# average the two listener outputs into a single prediction per pair
a, b = model(np.concatenate([x1, x2]))
pred = (a + b)/2

# first gallery: pairs whose rounded prediction differs from the rounded label
print('INCORRECT')

for i in range(len(x2)):

    if np.round(pred[i]) != np.round(y[i]):
        _show_pair(x1[i], x2[i], pred[i], y[i])



print('-'*500)
# second gallery: pairs whose rounded prediction equals the rounded label
print('CORRECT')

for i in range(len(x2)):

    if np.round(pred[i]) == np.round(y[i]):
        _show_pair(x1[i], x2[i], pred[i], y[i])