# Autoregressive Abstracter: Card Sorting

## Set Up

In [2]:
# %%capture
# !git clone https://github.com/jdlafferty/relational.git
# %cd relational
# !git branch awni-dev
# !git checkout awni-dev
# !git pull origin awni-dev

# !pip install pydealer
# !pip install wandb --upgrade
# !pip install tensorflow --upgrade

In [3]:
import pydealer
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn.metrics

from hand2hand import Cards
import utils

2023-01-22 00:19:32.859273: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-22 00:19:33.033461: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:
2023-01-22 00:19:33.033521: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-22 00:19:34.418320: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [4]:
%env "WANDB_NOTEBOOK_NAME" "autoregressive_abstracter_hand_sorting.ipynb"

import wandb
wandb.login(key='283ce55537fabf61a55a960f2788ffcbf12a5b46')

env: "WANDB_NOTEBOOK_NAME"="autoregressive_abstracter_hand_sorting.ipynb"


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
def create_callbacks(monitor='loss', log_gradients=False, save_model=True, log_weights=True,
                     train_ds=None, val_ds=None, checkpoint_path='model_checkpoints'):
    """Build the Keras callbacks that log a training run to Weights & Biases.

    Args:
        monitor: name of the logged quantity the checkpoint callback watches.
        log_gradients, log_weights, train_ds, val_ds: accepted so existing
            callers keep working; they were only consumed by the (disabled)
            legacy `WandbCallback` configuration and are currently ignored.
        save_model: when true, a `WandbModelCheckpoint` callback is added.
        checkpoint_path: where `WandbModelCheckpoint` writes checkpoints.
            NOTE(review): some Keras versions require a `.keras`/`.h5`
            suffix here; pass an explicit path if the default is rejected.

    Returns:
        A list of callbacks suitable for `model.fit(..., callbacks=...)`.
    """
    callbacks = [wandb.keras.WandbMetricsLogger(log_freq='epoch')]

    if save_model:
        # The original line read `wandb.keras,WandbModelCheckpoint(...)` (comma
        # instead of dot), which put the `wandb.keras` module into the list and
        # raised NameError. `filepath` is a required argument of the callback.
        callbacks.append(
            wandb.keras.WandbModelCheckpoint(
                filepath=checkpoint_path, monitor=monitor, mode='auto', save_freq='epoch'))

    return callbacks

# metrics = [
#         tf.keras.metrics.BinaryAccuracy(name='acc'),
#         tf.keras.metrics.Precision(class_id=1, name='precision'),
#         tf.keras.metrics.Recall(class_id=1, name='recall'),
#         tf.keras.metrics.AUC(curve='ROC', multi_label=True, name='auc')
#         ]

# loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
# opt = tf.keras.optimizers.Adam()

In [6]:
import tensorflow as tf
import seq2seq_transformer
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Model

## [TMP] Re-define `Decoder`

The decoder is modified so that its target-sequence input is an arbitrary sequence of vectors rather than a sequence of tokens (so that embedding and adding positional encoding can be done separately). This makes it more modular and compatible with `SymbolicDecoder` and `EpisodicDecoder`.

In [7]:
from seq2seq_transformer import AddPositionalEmbedding, DecoderLayer

class Decoder(tf.keras.layers.Layer):
    """Stack of `DecoderLayer`s applied to an already-embedded target sequence.

    Unlike a token-level decoder, this layer does no embedding or positional
    encoding itself: `call` receives a sequence of vectors and a context
    sequence, and passes both through `num_layers` decoder layers.
    """

    def __init__(self, num_layers, num_heads, dff,
               dropout_rate=0.1, name='decoder'):
        super().__init__(name=name)

        # Hyper-parameters are only stored here; sub-layers are created in
        # `build`, once the model dimension is known from the input shape.
        self.dropout_rate = dropout_rate
        self.dff = dff
        self.num_heads = num_heads
        self.num_layers = num_layers

    def build(self, input_shape):
        # The input shape is unpacked as (batch, sequence_length, d_model).
        _, seq_len, model_dim = input_shape
        self.sequence_length = seq_len
        self.d_model = model_dim

        self.dropout = tf.keras.layers.Dropout(self.dropout_rate)

        layer_config = dict(d_model=self.d_model, num_heads=self.num_heads,
                            dff=self.dff, dropout_rate=self.dropout_rate)
        self.dec_layers = [DecoderLayer(**layer_config) for _ in range(self.num_layers)]

        # Attention-score caching is currently disabled, so this stays None.
        self.last_attn_scores = None

    def call(self, x, encoder_context):
        hidden = self.dropout(x)

        # Every layer receives the same context sequence.
        for decoder_layer in self.dec_layers:
            hidden = decoder_layer(hidden, encoder_context)

        return hidden

## Dataset

In [8]:
# Number of cards dealt into each hand.
hand_size = 7

# `deck` (hand2hand.Cards) supplies the indexing/sorting helpers used when the
# dataset is built; `pydeck` is the shuffled pydealer deck hands are dealt from.
# NOTE(review): no random seed is set anywhere before the shuffle, so the
# generated hands presumably differ from run to run.
deck = Cards()
pydeck = pydealer.Deck()
pydeck.shuffle()

In [9]:
n = 10000          # number of hands to generate
BEGIN_HAND = 52    # token for 'beginning of hand'
END_HAND = 53      # token for 'end of hand'

# One row per hand: [BEGIN_HAND, <hand_size card indices>, END_HAND].
hands = np.zeros((n, hand_size + 2), dtype=int)
hands_sorted = np.zeros((n, hand_size + 2), dtype=int)

for i in range(n):
    hand = pydeck.deal(hand_size)
    if len(hand) < hand_size:
        # Not enough cards left: discard the short hand and deal from a
        # freshly shuffled deck instead.
        pydeck = pydealer.Deck()
        pydeck.shuffle()
        hand = pydeck.deal(hand_size)

    # Unsorted hand, as dealt.
    source = [BEGIN_HAND, *deck.index_pyhand(hand), END_HAND]
    hands[i] = source

    # The return value of `sort_pyhand` is ignored and the same `hand` object
    # is indexed again, so it presumably sorts in place.
    deck.sort_pyhand(hand)
    target = [BEGIN_HAND, *deck.index_pyhand(hand), END_HAND]
    hands_sorted[i] = target


In [10]:
# Hold out 25% of the (unsorted, sorted) hand pairs for evaluation.
hands_train, hands_test, sorted_train, sorted_test = train_test_split(
    hands, hands_sorted, test_size=0.25)

# The encoder sees the full unsorted hand.
source_train, source_test = hands_train, hands_test

# Teacher forcing: the decoder input is the sorted sequence without its last
# token, and the labels are the same sequence shifted left by one position.
target_train, labels_train = sorted_train[:, :-1], sorted_train[:, 1:]
target_test, labels_test = sorted_test[:, :-1], sorted_test[:, 1:]

In [11]:
def evaluate_seq2seq_model(model, source=None, labels=None):
    """Greedy autoregressive decoding followed by per-token accuracy.

    Starting from a target sequence that contains only `BEGIN_HAND`, the model
    is called once per output position; the arg-max token at that position is
    written into the next slot of the target sequence.

    Args:
        model: callable invoked as `model((source, target), training=False)`
            returning scores of shape (n_examples, target_len, vocab_size).
        source: encoder input tokens. Defaults to the notebook-level
            `source_test`.
        labels: expected output tokens, shape (n_examples, target_len).
            Defaults to the notebook-level `labels_test`. The number of
            decoding steps is taken from its second dimension.

    Returns:
        The fraction of predicted tokens equal to `labels` (also printed).
    """
    # Fall back to the held-out split so existing one-argument calls still work.
    if source is None:
        source = source_test
    if labels is None:
        labels = labels_test
    labels = np.asarray(labels)

    n_examples, n_steps = labels.shape
    output = np.zeros((n_examples, n_steps + 1), dtype=int)
    output[:, 0] = BEGIN_HAND

    for step in range(n_steps):
        # The model is re-run on the whole batch at every step; only the
        # scores at position `step` are used.
        scores = model((source, output[:, :-1]), training=False)
        output[:, step + 1] = np.argmax(np.asarray(scores)[:, step, :], axis=-1)

    acc = np.mean(output[:, 1:] == labels)
    print('per-card accuracy: %.2f%%' % (100*acc))

    return acc

## Standard Transformer

In [12]:
# from seq2seq_transformer import Encoder

# inputs = layers.Input(shape=(9,), name='token_input')
# target = layers.Input(shape=(8,), name='token_target')

# token_embedder = layers.Embedding(54, 128, name='vector_embedding')
# pos_embedding_adder_input = AddPositionalEmbedding(name='add_pos_embedding_input')
# pos_embedding_adder_target = AddPositionalEmbedding(name='add_pos_embedding_target')
# encoder = Encoder(num_layers=3, num_heads=2, dff=64, dropout_rate=0.1, name='transformer_encoder')

# decoder = Decoder(num_layers=3, num_heads=2, dff=64, dropout_rate=0.1, name='transformer_decoder')

# x = token_embedder(inputs)
# x = pos_embedding_adder_input(x)

# encoder_context = encoder(x)

# target_embedding = token_embedder(target)
# target_embedding = pos_embedding_adder_target(target_embedding)

# x = decoder(target_embedding, encoder_context)

# x = layers.Dense(54)(x)

# transformer = Model(inputs=[inputs, target], outputs=x)

In [13]:
from seq2seq_transformer import Encoder

class Transformer(tf.keras.Model):
    """Encoder-decoder model mapping a source token sequence to target logits.

    Source and target tokens share a single embedding table
    (`token_embedder`, sized by `input_vocab_size`); each side has its own
    positional-embedding layer. The decoder receives the encoder output as its
    context, and a final dense layer projects to `target_vocab_size` scores.
    """

    def __init__(self, num_layers, num_heads, dff,
            input_vocab_size, target_vocab_size, embedding_dim,
            dropout_rate=0.1, name='transformer'):
        super().__init__(name=name)

        # Encoder and decoder stacks are configured identically.
        stack_config = dict(num_layers=num_layers, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)

        self.token_embedder = layers.Embedding(
            input_vocab_size, embedding_dim, name='vector_embedding')

        self.pos_embedding_adder_input = AddPositionalEmbedding(name='add_pos_embedding_input')
        self.pos_embedding_adder_target = AddPositionalEmbedding(name='add_pos_embedding_target')

        self.encoder = Encoder(name='encoder', **stack_config)
        self.decoder = Decoder(name='decoder', **stack_config)
        self.final_layer = layers.Dense(target_vocab_size, name='final_layer')

    def call(self, inputs):
        # Keras `.fit` requires all model inputs in the first argument, so the
        # source and target token arrays arrive packed as one pair.
        source_tokens, target_tokens = inputs

        source_embedding = self.pos_embedding_adder_input(
            self.token_embedder(source_tokens))
        encoder_context = self.encoder(source_embedding)

        target_embedding = self.pos_embedding_adder_target(
            self.token_embedder(target_tokens))
        decoded = self.decoder(target_embedding, encoder_context)

        # (batch_size, target_len, target_vocab_size)
        logits = self.final_layer(decoded)

        # Drop the keras mask, if any, so it doesn't scale the losses/metrics
        # (b/250038731).
        try:
            del logits._keras_mask
        except AttributeError:
            pass

        return logits

In [14]:
# Vocabulary size 54 covers token ids 0-53: BEGIN_HAND (52) and END_HAND (53)
# are the two highest ids.
transformer = Transformer(
    num_layers=2,
    num_heads=2,
    dff=64,
    input_vocab_size=54,
    target_vocab_size=54,
    embedding_dim=128,
)

2023-01-22 00:19:42.210590: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:
2023-01-22 00:19:42.210637: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-01-22 00:19:42.210676: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (d6744085c45f): /proc/driver/nvidia/version does not exist
2023-01-22 00:19:42.211229: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other 

In [15]:
from seq2seq_transformer import masked_loss, masked_accuracy

# opt.build(transformer.trainable_variables)
# Adam with its default settings; loss and metric come from seq2seq_transformer.
transformer.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=masked_loss,
    metrics=masked_accuracy,
)

# Call the model once on the training arrays so its sub-layers and weights
# exist before `summary()` is asked to report them.
transformer((source_train, target_train))

transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vector_embedding (Embedding  multiple                 6912      
 )                                                               
                                                                 
 add_pos_embedding_input (Ad  multiple                 0         
 dPositionalEmbedding)                                           
                                                                 
 add_pos_embedding_target (A  multiple                 0         
 ddPositionalEmbedding)                                          
                                                                 
 encoder (Encoder)           multiple                  298112    
                                                                 
 decoder (Decoder)           multiple                  562560    
                                                       

In [16]:
# Teacher-forced training: the decoder is fed `target_train` and scored against
# `labels_train`. No validation data or callbacks are passed.
transformer.fit(
    (source_train, target_train),
    labels_train,
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff0ac027590>

In [17]:
# Greedy decoding on the held-out split. The function prints the accuracy; the
# trailing `;` keeps the returned value from being echoed as cell output.
evaluate_seq2seq_model(transformer);

per-card accuracy: 94.40%


## Autoregressive Abstracter with 'Symbolic' Cross-Attention $(Q=A, K=E, V=A)$

In [20]:
from symbolic_decoder import SymbolicDecoder

class AutoregressiveSymbolicAbstracter(tf.keras.Model):
    """Encoder -> SymbolicDecoder ("abstracter") -> Decoder sequence model.

    Source tokens are embedded, position-encoded and passed through the
    encoder. The encoder output is fed to a `SymbolicDecoder`, and the result
    (not the raw encoder output) is what the autoregressive decoder receives
    as its context. Source and target tokens share one embedding table.

    Args:
        num_layers, num_heads, dff, dropout_rate: passed unchanged to the
            encoder, the abstracter and the decoder.
        input_vocab_size: size of the shared token-embedding table.
        target_vocab_size: number of output scores per target position.
        embedding_dim: width of the token embeddings.
        name: Keras model name. The default used to be 'transformer' (a
            copy-paste leftover that made this model's summary
            indistinguishable from the baseline `Transformer`).
    """

    def __init__(self, num_layers, num_heads, dff,
            input_vocab_size, target_vocab_size, embedding_dim,
            dropout_rate=0.1, name='autoregressive_symbolic_abstracter'):
        super().__init__(name=name)

        self.token_embedder = layers.Embedding(input_vocab_size, embedding_dim, name='vector_embedding')

        self.pos_embedding_adder_input = AddPositionalEmbedding(name='add_pos_embedding_input')
        self.pos_embedding_adder_target = AddPositionalEmbedding(name='add_pos_embedding_target')

        self.encoder = Encoder(num_layers=num_layers, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate, name='encoder')
        self.abstracter = SymbolicDecoder(num_layers=num_layers, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate, name='abstracter')
        self.decoder = Decoder(num_layers=num_layers, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate, name='decoder')
        self.final_layer = layers.Dense(target_vocab_size, name='final_layer')

    def call(self, inputs):
        # To use a Keras model with `.fit` you must pass all your inputs in the
        # first argument.
        source, target = inputs

        x = self.token_embedder(source)
        x = self.pos_embedding_adder_input(x)

        encoder_context = self.encoder(x)

        # The abstracter only sees the encoder output.
        abstracted_context = self.abstracter(encoder_context)

        target_embedding = self.token_embedder(target)
        target_embedding = self.pos_embedding_adder_target(target_embedding)

        # The decoder is given the abstracter output, not the encoder output.
        x = self.decoder(target_embedding, abstracted_context)

        # Final linear layer output.
        logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

        try:
            # Drop the keras mask, so it doesn't scale the losses/metrics.
            # b/250038731
            del logits._keras_mask
        except AttributeError:
            pass

        # Only the logits are returned (no attention weights).
        return logits

In [21]:
# Same hyper-parameters as the baseline `transformer` instance. Vocabulary size
# 54 covers token ids 0-53 (BEGIN_HAND = 52, END_HAND = 53).
autoregressive_symbolic_abstracter = AutoregressiveSymbolicAbstracter(
    num_layers=2,
    num_heads=2,
    dff=64,
    input_vocab_size=54,
    target_vocab_size=54,
    embedding_dim=128,
)

In [22]:
from seq2seq_transformer import masked_loss, masked_accuracy

# Adam with its default settings; loss and metric come from seq2seq_transformer.
autoregressive_symbolic_abstracter.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=masked_loss,
    metrics=masked_accuracy,
)

# Call the model once on the training arrays so its sub-layers and weights
# exist before `summary()` is asked to report them.
autoregressive_symbolic_abstracter((source_train, target_train))

autoregressive_symbolic_abstracter.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vector_embedding (Embedding  multiple                 6912      
 )                                                               
                                                                 
 add_pos_embedding_input (Ad  multiple                 0         
 dPositionalEmbedding)                                           
                                                                 
 add_pos_embedding_target (A  multiple                 0         
 ddPositionalEmbedding)                                          
                                                                 
 encoder (Encoder)           multiple                  298112    
                                                                 
 abstracter (SymbolicDecoder  multiple                 563712    
 )                                                     

In [23]:
# Teacher-forced training: the decoder is fed `target_train` and scored against
# `labels_train`. No validation data or callbacks are passed.
autoregressive_symbolic_abstracter.fit(
    (source_train, target_train),
    labels_train,
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff0acda1c50>

In [24]:
# Greedy decoding on the held-out split. The function prints the accuracy; the
# trailing `;` keeps the returned value from being echoed as cell output.
evaluate_seq2seq_model(autoregressive_symbolic_abstracter);

per-card accuracy: 15.85%


## Autoregressive Abstracter with 'Episodic' Cross-Attention $(Q=E, K=E, V=E)$

In [25]:
from seq2seq_transformer import EpisodicDecoder

class AutoregressiveEpisodicAbstracter(tf.keras.Model):
    """Encoder -> EpisodicDecoder ("abstracter") -> Decoder sequence model.

    The encoder output is passed through an `EpisodicDecoder`, and the result
    is handed to the autoregressive decoder as its context. Source and target
    tokens share one embedding table; each side has its own
    positional-embedding layer.
    """

    def __init__(self, num_layers, num_heads, dff,
            input_vocab_size, target_vocab_size, embedding_dim,
            dropout_rate=0.1, name='autoregressive_episodic_abstracter'):
        super().__init__(name=name)

        # Encoder, abstracter and decoder stacks are configured identically.
        stack_config = dict(num_layers=num_layers, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)

        self.token_embedder = layers.Embedding(
            input_vocab_size, embedding_dim, name='vector_embedding')

        self.pos_embedding_adder_input = AddPositionalEmbedding(name='add_pos_embedding_input')
        self.pos_embedding_adder_target = AddPositionalEmbedding(name='add_pos_embedding_target')

        self.encoder = Encoder(name='encoder', **stack_config)
        self.abstracter = EpisodicDecoder(name='abstracter', **stack_config)
        self.decoder = Decoder(name='decoder', **stack_config)
        self.final_layer = layers.Dense(target_vocab_size, name='final_layer')

    def call(self, inputs):
        # Keras `.fit` requires all model inputs in the first argument, so the
        # source and target token arrays arrive packed as one pair.
        source_tokens, target_tokens = inputs

        source_embedding = self.pos_embedding_adder_input(
            self.token_embedder(source_tokens))
        encoder_context = self.encoder(source_embedding)

        # The abstracter only sees the encoder output.
        abstracted_context = self.abstracter(encoder_context)

        target_embedding = self.pos_embedding_adder_target(
            self.token_embedder(target_tokens))

        # The decoder is given the abstracter output, not the encoder output.
        decoded = self.decoder(target_embedding, abstracted_context)

        # (batch_size, target_len, target_vocab_size)
        logits = self.final_layer(decoded)

        # Drop the keras mask, if any, so it doesn't scale the losses/metrics
        # (b/250038731).
        try:
            del logits._keras_mask
        except AttributeError:
            pass

        return logits

In [26]:
# Same hyper-parameters as the baseline `transformer` instance. Vocabulary size
# 54 covers token ids 0-53 (BEGIN_HAND = 52, END_HAND = 53).
autoregressive_episodic_abstracter = AutoregressiveEpisodicAbstracter(
    num_layers=2,
    num_heads=2,
    dff=64,
    input_vocab_size=54,
    target_vocab_size=54,
    embedding_dim=128,
)

In [27]:
from seq2seq_transformer import masked_loss, masked_accuracy, CustomSchedule

# This model is trained with a `CustomSchedule` learning rate, whereas the
# other two models in this notebook use Adam's default learning rate.
# d_model=128 matches the `embedding_dim` the model was created with.
learning_rate = CustomSchedule(d_model=128)

autoregressive_episodic_abstracter.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=masked_loss,
    metrics=masked_accuracy,
)

# Call the model once on the training arrays so its sub-layers and weights
# exist before `summary()` is asked to report them.
autoregressive_episodic_abstracter((source_train, target_train))

autoregressive_episodic_abstracter.summary()

Model: "autoregressive_episodic_abstracter"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vector_embedding (Embedding  multiple                 6912      
 )                                                               
                                                                 
 add_pos_embedding_input (Ad  multiple                 0         
 dPositionalEmbedding)                                           
                                                                 
 add_pos_embedding_target (A  multiple                 0         
 ddPositionalEmbedding)                                          
                                                                 
 encoder (Encoder)           multiple                  298112    
                                                                 
 abstracter (EpisodicDecoder  multiple                 563712    
 )                              

In [28]:
# Teacher-forced training: the decoder is fed `target_train` and scored against
# `labels_train`. No validation data or callbacks are passed.
autoregressive_episodic_abstracter.fit(
    (source_train, target_train),
    labels_train,
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff08d17bc50>

In [29]:
# Greedy decoding on the held-out split. The function prints the accuracy; the
# trailing `;` keeps the returned value from being echoed as cell output.
evaluate_seq2seq_model(autoregressive_episodic_abstracter);

per-card accuracy: 91.11%


## Multi-Abstracter Model

$$\text{Encoder} \to \text{Abstracter} \to \cdots \to \text{Abstracter} \to \text{Decoder}$$

...