# Exercise 11: Attention & Transformers with Keras

In [None]:
# Load packages we need
import sys
import os

import datetime

import numpy as np
import sklearn

import scipy as sp
import pandas as pd

import tensorflow as tf

# we'll use keras for neural networks
import tensorflow.keras as keras
from tensorflow.keras.datasets import fashion_mnist

# import layers we will use
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, SimpleRNN, GRU

# import callbacks we will use
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

# Load the TensorBoard notebook extension
%load_ext tensorboard

%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 18})

# Let's check our software versions
print('### Python version: ' + sys.version)
print('### Numpy version: ' + np.__version__)
print('### Scikit-learn version: ' + sklearn.__version__)
print('### Tensorflow version: ' + tf.__version__)
print('------------')


# load our packages / code
sys.path.insert(1, '../common/')
import utils
import plots

In [None]:
# global parameters to control behavior of the pre-processing, ML, analysis, etc.

# One seed shared by NumPy and TensorFlow so their random draws are seeded consistently.
seed = 42 # deterministic seed
np.random.seed(seed) 
tf.random.set_seed(seed)

# Relative sizes of the train / validation / test splits (consumed by
# train_test_split_seq: first entry -> train, second -> validation, rest -> test).
prop_vec = [24, 2, 2]

## Training a Character-level RNN

### For this we'll use the text of the Wizard of Oz books

In [None]:
def split_data_seq_target(seq_array, window_size, slide=1):
    """Cut a 1-D id sequence into (input window, next element) examples.

    Example ``k`` starts at offset ``k * slide``: its input is the
    ``window_size`` consecutive elements beginning there, and its target is
    the single element immediately after that window.

    Args:
        seq_array: 1-D numpy array of integer ids. Values are stored as
            ``np.uint8`` in the result, so they must fit in 0..255.
        window_size: number of elements in each input window.
        slide: step between the starts of consecutive windows (>= 1).

    Returns:
        Tuple ``(x, y)``: ``x`` has shape ``(num_examples, window_size)`` and
        ``y`` has shape ``(num_examples, 1)``; both are ``np.uint8``.

    Raises:
        ValueError: if ``slide`` is smaller than 1 or the sequence is shorter
            than ``window_size``.
    """
    if slide < 1:
        raise ValueError(f'slide must be >= 1, got {slide}')

    seq_length = seq_array.shape[0]
    # A window is only usable when one more element follows it (the target),
    # so the number of valid start offsets is seq_length - window_size.
    num_starts = seq_length - window_size
    if num_starts < 0:
        raise ValueError(
            f'sequence of length {seq_length} is shorter than window_size={window_size}')

    # Zero-copy view of every window; only the selected rows are materialised
    # by astype(), so nothing is over-allocated when slide > 1.
    all_windows = np.lib.stride_tricks.sliding_window_view(seq_array, window_size)
    x = all_windows[:num_starts:slide].astype(np.uint8)

    # The target of the window starting at i is seq_array[i + window_size].
    y = seq_array[window_size::slide].astype(np.uint8).reshape(-1, 1)

    return x, y

def to_array(tokenizer, input_string_array, verbose=0):
    """Encode the first string of ``input_string_array`` as zero-based ids.

    ``tokenizer.texts_to_sequences`` is applied to the whole list, but only
    the first resulting sequence is kept. The tokenizer's ids start at 1, so
    every id is shifted down by one.

    Args:
        tokenizer: object providing ``texts_to_sequences``.
        input_string_array: list of strings; only element 0 is encoded.
        verbose: when truthy, print the encoded array, its shape, min and max.

    Returns:
        1-D ``np.uint8`` array of zero-based ids.
    """
    first_sequence = tokenizer.texts_to_sequences(input_string_array)[0]

    # shift from the tokenizer's 1-based ids to 0 .. max_id - 1
    one_based = np.asarray(first_sequence, dtype=np.uint8)
    encoded_array = one_based - 1

    if verbose:
        print(encoded_array, encoded_array.shape, encoded_array.min(), encoded_array.max())
    return encoded_array

def to_str(tokenizer, array):
    """Decode zero-based id sequences back to text.

    Args:
        tokenizer: object providing ``sequences_to_texts``.
        array: numpy array of zero-based ids (one sequence per row).

    Returns:
        Whatever ``tokenizer.sequences_to_texts`` returns for the shifted ids.
    """
    # the tokenizer's ids start at 1, so undo the zero-based shift first
    one_based_ids = array + 1
    return tokenizer.sequences_to_texts(one_based_ids)

def load_preprocess_data(fp='../data/oz-data.txt', window_size=150, verbose=0):
    """Read a text file and turn it into next-character prediction examples.

    A character-level, case-sensitive tokenizer is fitted on the full text,
    the text is encoded as zero-based character ids, and the id sequence is
    cut into sliding windows (step 1) with the following character as target.

    Args:
        fp: path of the text file to read (decoded as UTF-8).
        window_size: number of characters in each input window.
        verbose: forwarded to ``to_array``; truthy prints the encoded text.

    Returns:
        Tuple ``(x, y, num_classes, tokenizer)``: the arrays returned by
        ``split_data_seq_target``, the number of distinct characters seen by
        the tokenizer, and the fitted tokenizer itself.
    """
    # Decode explicitly: the platform default encoding differs between systems.
    with open(fp, encoding='utf-8') as f:
        input_text = f.read()

    tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, lower=False)
    # fit_on_texts expects a list of texts; a bare string would be iterated
    # one character at a time, each treated as its own document.
    tokenizer.fit_on_texts([input_text])

    # NOTE: to_array stores ids as uint8, so this must not exceed 256.
    num_classes = len(tokenizer.word_index)

    # encode as a sequence (array) of integers
    seq_array = to_array(tokenizer, [input_text], verbose)

    # split into windows
    x, y = split_data_seq_target(seq_array, window_size, slide=1)

    return x, y, int(num_classes), tokenizer

### We want to split this data into train, val, test 

### What could go wrong if we split randomly (e.g., shuffle x & y, then split)?

In [None]:
def train_test_split_seq(x, y, prop_vec=prop_vec, verbose=0):
    """Split ``x`` and ``y`` into contiguous train / validation / test parts.

    The examples are kept in their original order (no shuffling): the first
    chunk is the training set, the next the validation set, and everything
    that remains is the test set.

    Args:
        x: array of inputs; its first axis indexes examples.
        y: array of targets aligned with ``x``.
        prop_vec: relative sizes; entry 0 is the train share and entry 1 the
            validation share, both relative to the sum of all entries. The
            default is the module-level ``prop_vec`` as it was when this
            function was defined.
        verbose: when truthy, print the shapes of the six resulting arrays.

    Returns:
        ``train_x, train_y, val_x, val_y, test_x, test_y``.
    """
    num_examples = x.shape[0]
    weight_total = np.sum(prop_vec)
    n_tr = int(prop_vec[0] / weight_total * num_examples)
    n_val = int(prop_vec[1] / weight_total * num_examples)

    # two cut points -> three consecutive chunks, taken in order
    cut_points = [n_tr, n_tr + n_val]
    train_x, val_x, test_x = np.split(x, cut_points)
    train_y, val_y, test_y = np.split(y, cut_points)

    if verbose:
        print(train_x.shape, train_y.shape, val_x.shape, val_y.shape, test_x.shape, test_y.shape)

    return train_x, train_y, val_x, val_y, test_x, test_y

In [None]:
# Each example is a window of this many characters plus the character that follows it.
window_size = 150
# Build the windowed dataset, then split it in order (no shuffling) into train / val / test.
x, y, num_classes, tokenizer = load_preprocess_data(window_size=window_size)
train_x, train_y, val_x, val_y, test_x, test_y = train_test_split_seq(x, y)

In [None]:
# Sanity check: number of training examples and the per-example input / target sizes.
print(train_x.shape, train_y.shape)

### We need to one-hot encode the data

In [None]:
def make_ds_and_onehot(x, y, num_classes, batch_size=100, prefetch_size=10):
    """Build a shuffled, batched ``tf.data`` pipeline with one-hot inputs.

    ``x`` and ``y`` are joined column-wise so each dataset element is one row
    holding the input ids followed by the target id in its last position. The
    row is split again inside the pipeline and only the inputs are one-hot
    encoded; the target stays an integer id.

    Args:
        x: 2-D array of input ids, one example per row.
        y: array of targets aligned with ``x`` (assumed to contribute exactly
            one column, since only the last column is taken as the target).
        num_classes: depth of the one-hot encoding.
        batch_size: number of examples per batch.
        prefetch_size: number of batches to prefetch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(one_hot_inputs, targets)`` batches.
    """
    def split_row(row):
        # last entry is the target, everything before it is the input window
        return row[:-1], row[-1]

    def encode_inputs(features, target):
        return tf.one_hot(features, depth=num_classes), target

    rows = np.c_[x, y]
    pipeline = tf.data.Dataset.from_tensor_slices(rows)
    pipeline = pipeline.map(split_row)
    pipeline = pipeline.map(encode_inputs)

    # shuffle, batch, and prefetch
    pipeline = pipeline.shuffle(4096)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(prefetch_size)

In [None]:
# Wrap every split in the same pipeline (one-hot inputs, shuffled, batched, prefetched).
ds_train = make_ds_and_onehot(train_x, train_y, num_classes)
ds_test = make_ds_and_onehot(test_x, test_y, num_classes)
ds_val = make_ds_and_onehot(val_x, val_y, num_classes)

In [None]:
# Bare expression: the notebook displays the dataset object as the cell output.
ds_train

In [None]:
# Peek at the first two batches. The loop uses its own variable names so the
# module-level x / y arrays returned by load_preprocess_data are not overwritten
# (re-running the split cell afterwards would otherwise receive tensors).
for batch_x, batch_y in ds_train.take(2):
    print(batch_x, batch_y)

In [None]:
# Print the dataset's textual description.
print(ds_train)

### Let's create a model

In [None]:
def create_compile_rnn(input_shape=(None, num_classes), dropout_rate=0.175, verbose=True):
    """Build and compile the two-layer GRU character model.

    The network is a ``Sequential`` stack: a 192-unit GRU returning the full
    sequence, a 128-unit GRU returning only its last output, and a softmax
    layer over ``num_classes`` characters. It is compiled with Adam
    (learning rate 0.001) and sparse categorical cross-entropy, so targets
    are integer ids.

    Args:
        input_shape: per-example input shape. The default's ``num_classes``
            is the module-level value at the time this function was defined.
        dropout_rate: input dropout of the first GRU layer only.
        verbose: when truthy, print the model summary.

    Returns:
        The compiled ``keras`` model.
    """
    model = keras.models.Sequential(name='CharLevel-RNN')

    input_tensor = keras.Input(shape=input_shape, sparse=False, name='input')
    model.add(input_tensor)

    # recurrent part: the first layer feeds its whole output sequence to the second
    first_gru = GRU(192, return_sequences=True, dropout=dropout_rate, recurrent_dropout=0.0, name='gru1')
    model.add(first_gru)
    second_gru = GRU(128, recurrent_dropout=0.0, name='gru2')
    model.add(second_gru)

    # output: one probability per character class
    classifier = Dense(num_classes, activation='softmax', name='output')
    model.add(classifier)

    if verbose:
        model.summary()

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )

    return model

In [None]:
# Where the trained character-level RNN is saved to / loaded from.
model_fp = './charlevel-rnn.h5'

# Set to True to train from scratch and overwrite the saved model;
# leave False to load the previously saved model instead.
train = False

if train:
    model = create_compile_rnn()

    num_epochs = 20
    history = model.fit(ds_train, validation_data=ds_val, epochs=num_epochs, callbacks=[])

    model.save(model_fp) # save the model
else:
    # Explicit check instead of `assert`, which is skipped when Python runs with -O.
    if not os.path.exists(model_fp):
        raise FileNotFoundError('Train the model first!')

    model = keras.models.load_model(model_fp)

In [None]:
def create_prompt(prompt):
    """One-hot encode one or more prompt strings for the character-level RNN.

    Every prompt is encoded separately with the module-level ``tokenizer``
    (``to_array`` only encodes the first string it is given), and the encoded
    prompts are stacked into one batch.

    Args:
        prompt: a single string, or a list of strings whose encodings all
            have the same length.

    Returns:
        The result of ``tf.one_hot`` with depth ``num_classes`` applied to an
        id array of shape ``(num_prompts, prompt_length)``.

    Raises:
        ValueError: if the list is empty or the encoded prompts differ in
            length (raised by ``np.stack``).
    """
    if isinstance(prompt, str):
        prompt = [prompt]

    # one row of zero-based character ids per prompt
    prompt_array = np.stack([to_array(tokenizer, [text]) for text in prompt])
    return tf.one_hot(prompt_array, depth=num_classes)

In [None]:
# Encode a single prompt and take the id with the highest predicted probability
# as the model's guess for the next character.
prompt = create_prompt(['Doroth'])
prompt_pred = np.argmax(model.predict(prompt, verbose=0), axis=-1)

In [None]:
# Decode the predicted id back to text; reshape so each prediction is its own one-id sequence.
to_str(tokenizer, prompt_pred.reshape(-1, 1))

In [None]:
# Show the full model output for the prompt (one value per character class).
model.predict(prompt, verbose=0)

In [None]:
# Character -> id mapping learned by the tokenizer (ids start at 1; to_array subtracts 1).
tokenizer.word_index

### Let's generate some text

In [None]:
def sample_from_model(model, prompt_str, out_len=50, temp=1.0):
    """Generate ``out_len`` characters by repeatedly sampling the next one.

    At every step the prompt plus everything generated so far is fed to the
    model, its output is treated as a probability vector over characters,
    the vector is re-weighted by the temperature, and one character is drawn
    from the result with ``np.random.choice``.

    Args:
        model: object whose ``predict`` returns next-character probabilities
            for a one-hot encoded prompt.
        prompt_str: text the generation starts from.
        out_len: number of characters to generate.
        temp: sampling temperature. Each probability is raised to the power
            ``1 / temp`` and the vector is renormalised, so values below 1
            concentrate on the likeliest characters and values above 1
            flatten the distribution.

    Returns:
        The generated characters only (``prompt_str`` is not included).

    Raises:
        ValueError: if ``temp`` is 0.
    """
    if temp == 0:
        raise ValueError('temp must be non-zero')

    res = ''
    for _ in range(out_len):
        prompt = create_prompt([prompt_str + res])

        # predict() gives probabilities; work in float64 for the re-weighting
        prob_vec = model.predict(prompt, verbose=0).reshape(-1,).astype(np.float64)

        # temperature scaling in log space; log(0) = -inf is fine here (it maps
        # back to probability 0), so silence the divide-by-zero warning
        with np.errstate(divide='ignore'):
            scaled_log_probs = np.log(prob_vec) / temp
        # subtract the maximum before exponentiating so a small temperature
        # cannot underflow every entry to 0 (which would give 0/0 below)
        scaled_log_probs = scaled_log_probs - np.max(scaled_log_probs)
        sample_probas = np.exp(scaled_log_probs)
        sample_probas = sample_probas / np.sum(sample_probas)

        # use numpy to sample index according to sample_probas
        choice_idx = np.random.choice(sample_probas.shape[0], size=1, p=sample_probas)

        # wrap the single id as a batch of one sequence for decoding
        chosen_char = to_str(tokenizer, np.array([choice_idx]))[0]
        res += chosen_char
    return res

In [None]:
# Generate 250 characters continuing the prompt. The low temperature (0.1) makes
# sampling concentrate on the characters the model considers most likely.
prompt_str = 'Dorothy said'
out_str = sample_from_model(model, prompt_str, out_len=250, temp=0.1)

In [None]:
# sample_from_model returns only the new characters, so prepend the prompt for display.
print(prompt_str + out_str)

### Can we train a character-level prediction model using the Transformer architecture?
### Transformer paper: https://arxiv.org/pdf/1706.03762.pdf
# ![Transformer](https://upload.wikimedia.org/wikipedia/commons/8/8f/The-Transformer-model-architecture.png)
### (Image source: wikipedia.)

In [None]:
""" let's create a custom Keras layer to implement a transformer layer/block.
"""
from keras.layers import MultiHeadAttention, LayerNormalization, Dense, Dropout

class TransformerWithMHALayer(keras.layers.Layer):
    """Transformer block: self-attention followed by a feed-forward network.

    Each of the two sub-layers (multi-head self-attention, then two dense
    layers) has dropout applied to its output, is added to its own input
    (residual connection) and is layer-normalised.

    Args:
        num_heads: number of attention heads.
        dense_units: width of the hidden feed-forward layer.
        embedding_size: size of the last axis of the input. It is also used
            as the per-head key size and as the block's output size, so the
            residual additions line up and blocks can be stacked.
        attention_dropout: dropout rate passed to ``MultiHeadAttention``.
        dense_dropout: dropout rate applied to the output of each sub-layer.
        dense_activation: activation of the hidden feed-forward layer.
        **kwargs: standard ``keras.layers.Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, num_heads, dense_units, embedding_size,
                 attention_dropout=0.0, dense_dropout=0.1, dense_activation='relu', **kwargs):
        super().__init__(**kwargs) # super init (forwards name, dtype, ...)

        # keep the constructor arguments so get_config() can report them
        self.num_heads = num_heads
        self.dense_units = dense_units
        self.embedding_size = embedding_size
        self.attention_dropout = attention_dropout
        self.dense_dropout = dense_dropout
        self.dense_activation = dense_activation

        # let's instantiate the layers/components and store them.
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_size, dropout=attention_dropout)

        # we need two of these one for each add-norm
        self.layernorm1 = LayerNormalization()
        self.layernorm2 = LayerNormalization()

        # dropout applied to the feed-forward output
        self.dropout = Dropout(dense_dropout)

        # feedforward part - dense followed by dense
        self.dense1 = Dense(dense_units, activation=dense_activation)
        self.dense2 = Dense(embedding_size, activation='linear') # embedding_size output (so shapes work and we can stack transformer layers)

        # dropout applied to the attention output (was created but never used before)
        self.dropout2 = Dropout(dense_dropout)

    def call(self, inputs, training=None):
        """Forward pass of the block.

        Args:
            inputs: tensor whose last axis has size ``embedding_size``.
            training: forwarded to the attention and dropout layers so
                dropout is only active during training.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # 1. self-attention: the sequence attends to itself (query = value = inputs)
        attention_output = self.mha(inputs, inputs, training=training)
        attention_output = self.dropout2(attention_output, training=training)

        # 2. add-norm; note the residual connection ('inputs')
        first_addnorm = self.layernorm1(inputs + attention_output)

        # 3. feedforward (dense, dense) followed by 4. dropout
        ff1 = self.dense1(first_addnorm)
        ff2 = self.dense2(ff1)
        ffdropout = self.dropout(ff2, training=training)

        # 5. add-norm; again note the residual connection ('first_addnorm')
        second_addnorm = self.layernorm2(first_addnorm + ffdropout)

        return second_addnorm # this is the output of the layer

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created when a saved model is loaded."""
        config = super().get_config()
        config.update({
            'num_heads': self.num_heads,
            'dense_units': self.dense_units,
            'embedding_size': self.embedding_size,
            'attention_dropout': self.attention_dropout,
            'dense_dropout': self.dense_dropout,
            'dense_activation': self.dense_activation,
        })
        return config

## Keras has a bunch of examples/tutorials for doing various things with transformers. For example see: https://keras.io/examples/nlp/

In [None]:
# Let's implement the create-and-compile function for our transformer.

from keras.layers import Input, GlobalMaxPool1D, Embedding

def create_compile_transformer(max_prompt_len, num_classes, embedding_size=96, num_attention_heads=6, 
                                     dense_units=48, verbose=True):
    """Build and compile a small character-level Transformer classifier.

    The model takes a sequence of integer character ids (not one-hot
    vectors), embeds them, applies one ``TransformerWithMHALayer`` block,
    max-pools over the sequence axis and predicts a softmax distribution
    over ``num_classes`` next characters. It is compiled with Adam and
    sparse categorical cross-entropy, so targets are integer ids.

    Args:
        max_prompt_len: number of character ids in each input sequence.
        num_classes: vocabulary size; used for the embedding table and for
            the output layer.
        embedding_size: size of the character embedding and of the block's output.
        num_attention_heads: number of attention heads in the block.
        dense_units: width of the block's hidden feed-forward layer.
        verbose: when truthy, print the model summary.

    Returns:
        The compiled ``keras.Model``.
    """
    name = 'CharLevel-Transformer'

    # shape must be a tuple: `(max_prompt_len)` without the comma is just an int
    inputs = Input(shape=(max_prompt_len,))

    # This is just a simple (char) embedding, ideally we should do some kind of positional embedding instead
    # TODO: replace this embedding with (char and) positional embedding
    # e.g., see: https://keras.io/api/keras_nlp/modeling_layers/token_and_position_embedding/
    # and https://keras.io/api/keras_nlp/modeling_layers/position_embedding/
    embeddings = Embedding(num_classes, embedding_size)(inputs)
    
    transformer1 = TransformerWithMHALayer(num_attention_heads, dense_units, embedding_size)(embeddings)
    
    # we could stack additional transformer layers for example:
    # transformer2 = TransformerWithMHALayer(num_attention_heads, dense_units, embedding_size)(transformer1)

    # collapse the sequence axis, then classify with a small dense head
    maxpool = GlobalMaxPool1D()(transformer1)
    dropout1 = Dropout(0.125)(maxpool)
    dense2 = Dense(90, activation='relu')(dropout1)
    dropout2 = Dropout(0.125)(dense2)
    outputs = Dense(num_classes, activation='softmax')(dropout2)

    model = keras.Model(inputs=inputs, outputs=outputs, name=name)

    if verbose:
        model.summary()

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [None]:
# Let's create the model to see the summary.
# The returned model is discarded (assigned to `_`); this cell only prints the architecture.
_ = create_compile_transformer(window_size, num_classes, verbose=True)

### (At home) exercise: complete the notebook
### [TODO] 1. Add positional embedding (note: it may work somewhat without it if window size is small enough)
### [TODO] 2. Add some training code
### [TODO] 3. To sample you will need to change the way to get predictions from the model and sample from it
### In particular note that we are not using one-hot encoding over the tokens (char)