In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pickle
import keras
import time
import os

from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras_preprocessing import sequence
from keras.preprocessing import text
from keras import preprocessing
from keras import regularizers
from keras import activations
from keras import optimizers
from keras import callbacks
from keras import layers
from keras import losses
from keras import models

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

Parameters

In [2]:
# Input artifacts: the pickled samples and the pickled label dictionary.
data_path = '../data/large_nn_data_1681467705.pickle'
dictionary_path = '../data/large_dictionary_1681467705.pickle'
# Pickled tokenizer to reuse; set to None to fit (and later save) a new one.
tokenizer_path = '../models/tokenizer_1681467787.pickle'

# Length every token sequence is padded/truncated to.
maxlen = 500
# Vocabulary cap: num_words for a newly fitted tokenizer and the embedding input size.
max_words = 20000
# Fraction of all samples kept for training + validation (the rest is the test set).
first_split = 0.9
# Fraction of the training + validation part kept for training (the rest is validation).
second_split = 0.9

General functions

In [3]:
def plot_history(history):
    """Plot the training and validation loss curves of a finished training run.

    Args:
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (e.g. the return value of ``model.fit``).

    Every key containing ``'loss'`` is drawn: keys without ``'val'`` in blue as
    training loss, keys with ``'val'`` in green as validation loss. The value
    of the last epoch is shown in each legend entry. If there is no training
    loss key, a message is printed and nothing is plotted.
    """
    records = history.history
    train_keys = [key for key in records.keys() if 'loss' in key and 'val' not in key]
    valid_keys = [key for key in records.keys() if 'loss' in key and 'val' in key]

    if not train_keys:
        print('Loss is missing in history')
        return

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(records[train_keys[0]]) + 1)

    plt.figure(1)
    curve_groups = (
        (train_keys, 'b', 'Training loss ('),
        (valid_keys, 'g', 'Validation loss ('),
    )
    for keys, colour, prefix in curve_groups:
        for key in keys:
            values = records[key]
            plt.plot(epochs, values, colour, label=prefix + format(values[-1], '.5f') + ')')

    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [4]:
def print_evaluation(model, x_test, y_test, batch_size):
    """Evaluate ``model`` on the test set and print the loss rounded to 4 decimals.

    Args:
        model: Object with a Keras-style ``evaluate(x, y, batch_size=..., verbose=...)``.
        x_test: Test inputs.
        y_test: Test targets.
        batch_size: Batch size used for the evaluation pass.
    """
    results = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)
    # evaluate() returns a bare scalar when the model has no metrics, but a
    # list [loss, metric_1, ...] otherwise; round() on that list would raise.
    loss = results[0] if isinstance(results, (list, tuple)) else results
    print(f"Test loss: {round(loss, 4)}")

In [5]:
def load_data(data_path, dictionary_path):
    """Load the pickled samples and label dictionary and build label vectors.

    Args:
        data_path: Path to a pickle holding an iterable of ``(text, category)``
            pairs. ``category`` is used as a NumPy index into the label vector.
        dictionary_path: Path to a pickle holding a mapping; it is returned
            with keys and values swapped.

    Returns:
        ``(texts, labels, dictionary)``: the list of texts, a list of float32
        vectors (one per sample, 1.0 at the sample's category index/indices),
        and the inverted dictionary. The vector length is the size of the
        inverted dictionary.

    Note:
        ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    with open(data_path, 'rb') as data_file:
        samples = pickle.load(data_file)

    with open(dictionary_path, 'rb') as dictionary_file:
        original_mapping = pickle.load(dictionary_file)
    dictionary = {value: key for key, value in original_mapping.items()}

    labels_count = len(dictionary)

    def encode(category):
        # Zero vector with ones at the position(s) selected by `category`.
        vector = np.zeros(labels_count, dtype='float32')
        vector[category] = 1.0
        return vector

    texts, labels = [], []
    for text, category in samples:
        encoded = encode(category)
        texts.append(text)
        labels.append(encoded)

    return texts, labels, dictionary

In [6]:
def prepare_data(texts, labels, t_path=None, maxlen=None, max_words=10000):
    """Turn raw texts into padded integer sequences.

    Args:
        texts: Texts to tokenize.
        labels: Per-sample labels; returned converted with ``np.asarray``.
        t_path: Path to a pickled tokenizer to reuse. When ``None`` a new
            ``Tokenizer`` is created and fitted on ``texts``.
        maxlen: Forwarded to ``pad_sequences`` when given; otherwise
            ``pad_sequences`` is called without it.
        max_words: ``num_words`` for a newly created tokenizer. Not used when
            a tokenizer is loaded from ``t_path``.

    Returns:
        ``(data, labels, tokenizer)``: the padded sequences, the labels as an
        array, and the tokenizer that produced the sequences.

    Note:
        ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    if t_path is not None:
        with open(t_path, 'rb') as tokenizer_file:
            tokenizer = pickle.load(tokenizer_file)
    else:
        tokenizer = Tokenizer(num_words=max_words)
        tokenizer.fit_on_texts(texts)

    token_ids = tokenizer.texts_to_sequences(texts)

    # Only pass maxlen through when the caller actually supplied one.
    pad_options = {} if maxlen is None else {'maxlen': maxlen}
    padded = pad_sequences(token_ids, **pad_options)

    return padded, np.asarray(labels), tokenizer

In [7]:
def split_data(data, labels, first_split=0.8, second_split=0.8, random_state=None):
    """Split samples into training, validation and test sets.

    The data is first split into (train + validation) and test, then the
    (train + validation) part is split again into train and validation.

    Args:
        data: Samples to split.
        labels: Labels aligned with ``data``.
        first_split: ``train_size`` of the first split (train + validation share).
        second_split: ``train_size`` of the second split (training share of
            the train + validation part).
        random_state: Seed forwarded to both ``train_test_split`` calls so the
            split can be reproduced. ``None`` (default) keeps the previous
            unseeded behaviour.

    Returns:
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``.
    """
    x_rest, x_test, y_rest, y_test = train_test_split(
        data, labels, train_size=first_split, random_state=random_state
    )
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_rest, y_rest, train_size=second_split, random_state=random_state
    )
    return x_train, y_train, x_valid, y_valid, x_test, y_test

### Load and preprocess dataset

Load the preprocessed Yelp dataset and its label dictionary. The dictionary is then inverted (keys and values are swapped).

In [8]:
# Read the pickled samples and label dictionary; load_data returns the texts,
# one label vector per sample, and the inverted dictionary.
texts, labels, dictionary = load_data(data_path, dictionary_path)

Tokenize the preprocessed dataset and pad the sequences

In [9]:
# Convert the texts to padded integer sequences. Because tokenizer_path is
# passed, prepare_data loads that pickled tokenizer when the path is not None
# and only fits a new one when it is None.
data, labels, tokenizer = prepare_data(texts, labels, tokenizer_path, maxlen=maxlen, max_words=max_words)
# Drop the reference to the raw texts so their memory can be reclaimed.
texts = None

Split the data into training, validation, and test sets

In [10]:
# Two-stage split: first into (train + validation) / test, then train / validation.
x_train, y_train, x_valid, y_valid, x_test, y_test = split_data(data, labels, first_split=first_split, second_split=second_split)
# Drop the references to the unsplit arrays; only the splits are used from here on.
data = None
labels = None

Save the tokenizer if a new one was fitted; otherwise recover its timestamp from the file name

In [11]:
if tokenizer_path is None:
    # A new tokenizer was fitted: persist it under a fresh timestamp.
    tokenizer_time = int(time.time())
    with open(f'../models/tokenizer_{tokenizer_time}.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Recover the timestamp from a file name like 'tokenizer_<timestamp>.<ext>'.
    # The extension is stripped with os.path.splitext instead of slicing off a
    # fixed 7 characters, which silently produced a wrong number for any
    # extension other than '.pickle'.
    tokenizer_stem = os.path.splitext(os.path.basename(tokenizer_path))[0]
    tokenizer_time = int(tokenizer_stem.split('tokenizer_')[-1])
# The tokenizer object itself is not needed for training; release it.
tokenizer = None

Print information about the datasets

In [12]:
# Report the number of samples (first array dimension) in each split.
print(f'Training samples:   {x_train.shape[0]}')
print(f'Testing samples:    {x_test.shape[0]}')
print(f'Validating samples: {x_valid.shape[0]}')

Training samples:   240823
Testing samples:    29732
Validating samples: 26759


### Train multi-label model v1

In [13]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition and
    layer normalization.

    Args:
        embed_dim: Size of the last input dimension; also used as ``key_dim``
            of the attention layer and as the output size of the feed-forward
            network.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        ff_act: Activation of the hidden feed-forward layer.
        ff_reg: Kernel regularizer of both feed-forward layers. Accepts
            anything ``regularizers.get`` accepts (``None``, identifier,
            config dict or regularizer instance).
        ff_d: Dropout rate applied after the attention output and after the
            feed-forward output.
        mh_reg: Kernel regularizer of the attention layer (same accepted
            values as ``ff_reg``).
        mh_d: Dropout rate inside the attention layer.
        norm_eps: Epsilon of both layer normalizations.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, ff_act='relu', ff_reg=None, ff_d=0.25, mh_reg=None, mh_d=0.1, norm_eps=1e-6, **kwargs):
        # initialize super class
        super(TransformerBlock, self).__init__(**kwargs)

        # Normalize the regularizers once so that both live objects and the
        # serialized form produced by get_config() are accepted.
        ff_reg = regularizers.get(ff_reg)
        mh_reg = regularizers.get(mh_reg)

        # multi head attention
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
            kernel_regularizer=mh_reg,
            dropout=mh_d
        )

        # feed forward network
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation=ff_act, kernel_regularizer=ff_reg),
            layers.Dense(embed_dim, kernel_regularizer=ff_reg)
        ])

        # layer normalizations
        self.layernorm1 = layers.LayerNormalization(epsilon=norm_eps)
        self.layernorm2 = layers.LayerNormalization(epsilon=norm_eps)

        # dropout layers
        self.dropout1 = layers.Dropout(ff_d)
        self.dropout2 = layers.Dropout(ff_d)

        # remember for serialization
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.ff_act = ff_act
        self.ff_reg = ff_reg
        self.ff_d = ff_d
        self.mh_reg = mh_reg
        self.mh_d = mh_d
        self.norm_eps = norm_eps

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "ff_act": self.ff_act,
            # Regularizers are stored in serialized form rather than as
            # objects; __init__ turns them back via regularizers.get().
            "ff_reg": regularizers.serialize(self.ff_reg),
            "ff_d": self.ff_d,
            "mh_reg": regularizers.serialize(self.mh_reg),
            "mh_d": self.mh_d,
            "norm_eps": self.norm_eps
        })
        return config

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Input tensor; used as both query and value of the
                attention layer.
            training: Training-mode flag forwarded to the attention layer and
                both dropout layers. ``None`` lets Keras decide.

        Returns:
            The normalized output of the second residual connection.
        """
        # self-attention sub-layer with residual connection
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # feed-forward sub-layer with residual connection
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [14]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position embedding table. Positions are
            generated from the input's last dimension, so inputs are expected
            to contain at most ``maxlen`` tokens.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Output size of both embeddings.
        embed_reg: Regularizer applied to both embedding matrices. Accepts
            anything ``regularizers.get`` accepts (``None``, identifier,
            config dict or regularizer instance).
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, embed_reg=None, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)

        # Normalize the regularizer once so that both a live object and the
        # serialized form produced by get_config() are accepted.
        embed_reg = regularizers.get(embed_reg)

        # embedding layers
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            embeddings_regularizer=embed_reg
        )
        self.pos_emb = layers.Embedding(
            input_dim=maxlen,
            output_dim=embed_dim,
            embeddings_regularizer=embed_reg
        )

        # save for serialization
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_reg = embed_reg

    def call(self, x):
        """Embed the token ids in ``x`` and add the position embeddings."""
        # Sequence length comes from the input itself, not from self.maxlen.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
            # Stored in serialized form rather than as an object; __init__
            # turns it back via regularizers.get().
            "embed_reg": regularizers.serialize(self.embed_reg)
        })
        return config

In [15]:
# Prefix of the checkpoint and saved-model file names.
model_name = 'tc_ml_model_1'
# Embedding size; also passed to the transformer block as embed_dim.
embedding_dim = 32
# Number of attention heads in the transformer block.
number_heads = 4
# Hidden size of the transformer block's feed-forward network.
feed_forward_dim = 32
# Batch size for training and evaluation.
batch_size = 128
# Number of training epochs.
epochs = 5

In [20]:
# input layer: one padded sequence of maxlen token ids per sample
inputs = layers.Input(shape=(maxlen,))

# position encoding & embedding
embedding_layer = TokenAndPositionEmbedding(
    maxlen = maxlen, 
    vocab_size = max_words, 
    embed_dim = embedding_dim, 
    embed_reg = None
)
x = embedding_layer(inputs)

# multi head attention & feed forward
transformer_block = TransformerBlock(
    embed_dim = embedding_dim, 
    num_heads = number_heads, 
    ff_dim = feed_forward_dim, 
    ff_act = 'relu', 
    ff_reg = None,
    ff_d = 0.25, 
    mh_reg = None, 
    mh_d = 0.25, 
    norm_eps = 1e-3
)
x = transformer_block(x)

# global average pooling over the sequence axis, then normalization & dropout
x = layers.GlobalAveragePooling1D()(x)
x = layers.LayerNormalization()(x)
x = layers.Dropout(0.25)(x)

# last layer: one sigmoid unit per dictionary entry, with L1+L2 kernel regularization
outputs = layers.Dense(len(dictionary), activation='sigmoid', kernel_regularizer=regularizers.L1L2(l1=1e-4, l2=1e-4))(x)

# creating model
model = keras.Model(inputs=inputs, outputs=outputs)

# compile model
# NOTE(review): sigmoid outputs are trained with 'mse'; binary cross-entropy is
# the more common loss for multi-label targets - confirm this is intentional.
# NOTE(review): both clipvalue and clipnorm are set; some Keras versions accept
# only one gradient-clipping option - confirm before upgrading.
model.compile(
    loss='mse',
    optimizer=optimizers.Nadam(learning_rate=0.001, clipvalue=1.0, clipnorm=1.0),
)

# print model summary
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 500)]             0         
                                                                 
 token_and_position_embeddin  (None, 500, 32)          656000    
 g_2 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_2 (Transf  (None, 500, 32)          19040     
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d_2   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 layer_normalization_8 (Laye  (None, 32)               64  

In [21]:
# Checkpoint file name: model name, run timestamp and tokenizer timestamp are
# fixed here; the {epoch} and {val_loss} placeholders are filled in by Keras.
checkpoint_path = (
    '../models/' + model_name + '_' + str(int(time.time()))
    + '_tt_' + str(tokenizer_time) + '_{epoch:02d}_{val_loss:.4f}.h5'
)

# Save the full model (not just weights) whenever val_loss improves, starting
# from an initial best value of 0.05.
best_checkpoint = callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    verbose=0,
    save_weights_only=False,
    save_best_only=True,
    mode='min',
    initial_value_threshold=0.05,
)

history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_valid, y_valid),
    callbacks=[best_checkpoint],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5

ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/model_2/transformer_block_2/multi_head_attention_2/einsum_1/Einsum' defined at (most recent call last):
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel\ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in run_cell
      result = self._run_cell(
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2995, in _run_cell
      return runner(coro)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 3194, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 3373, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 3433, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Kamil\AppData\Local\Temp\ipykernel_18296\1383914482.py", line 1, in <module>
      history = model.fit(
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py", line 997, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 576, in minimize
      grads_and_vars = self._compute_gradients(
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 634, in _compute_gradients
      grads_and_vars = self._get_gradients(
    File "C:\Users\Kamil\miniconda3\envs\tensorflow\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 510, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/model_2/transformer_block_2/multi_head_attention_2/einsum_1/Einsum'
OOM when allocating tensor with shape[128,4,500,500] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node gradient_tape/model_2/transformer_block_2/multi_head_attention_2/einsum_1/Einsum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_13754]

In [24]:
# Save the model in its current state to an .h5 file whose name carries the
# model name, the current time and the tokenizer timestamp.
model.save(f'../models/{model_name}_{int(time.time())}_tt_{tokenizer_time}.h5')

### Visualize model v1 training results

In [22]:
# Plot the loss curves. `history` is only defined when model.fit returned;
# if training raised, this cell fails with a NameError.
plot_history(history)

NameError: name 'history' is not defined

In [23]:
# Report the loss on the held-out test set, using the training batch size.
print_evaluation(model, x_test, y_test, batch_size=batch_size)

Test loss: 0.0178
