In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pickle
import keras
import time
import os

from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras_preprocessing import sequence
from keras.preprocessing import text
from keras import preprocessing
from keras import regularizers
from keras import activations
from keras import optimizers
from keras import callbacks
from keras import layers
from keras import losses
from keras import models

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

Parameters

In [2]:
# Pickled inputs read by load_data().
data_path = '../data/nn_data_1680619232.pickle'
dictionary_path = '../data/dictionary_1680619234.pickle'

# Tokenisation: sequence length used for padding and tokenizer vocabulary size.
maxlen = 100
max_words = 10_000

# Fraction kept as training data in the first (train/test) and
# second (train/validation) split.
first_split = 0.8
second_split = 0.8

General functions

In [3]:
def plot_history(history):
    """Plot every metric stored in a training history against the epoch number.

    Training metrics are drawn as solid lines. A validation metric (a key
    starting with ``val_``) is drawn as dots in the colour of the training
    metric it belongs to.

    Args:
        history: object with a ``history`` attribute: a dict mapping metric
            names to per-epoch value lists (e.g. the result of ``model.fit``).

    Returns:
        None. Prints a message and returns early when no ``loss`` series is
        present; otherwise shows the figure.
    """
    if "loss" not in history.history:
        print('Loss is missing in history')
        return

    # 'w' (white) is invisible on the default white background, so it is not
    # part of the palette; colours are cycled when metrics outnumber them.
    colors = ['b', 'r', 'g', 'y', 'm', 'c', 'k']
    colors_dict = {}

    def color_for(metric_name):
        # Assign colours lazily so a validation series that appears before
        # (or without) its training counterpart still gets one.
        if metric_name not in colors_dict:
            colors_dict[metric_name] = colors[len(colors_dict) % len(colors)]
        return colors_dict[metric_name]

    plt.figure(1)
    plt.title("History")
    plt.xlabel('epochs')
    plt.ylabel('metric values')

    epochs = range(1, len(history.history['loss']) + 1)
    for key_name, values in history.history.items():
        # Only a *prefix* marks a validation metric; a substring test would
        # misclassify names such as 'interval_acc'.
        if key_name.startswith('val_'):
            plt.plot(epochs, values, color_for(key_name[4:]) + '.', label=key_name)
        else:
            plt.plot(epochs, values, color_for(key_name), label=key_name)

    plt.legend()
    plt.show()

In [4]:
def print_evaluation(model, x_test, y_test, batch_size):
    """Evaluate ``model`` silently on the given data and print loss and accuracy.

    ``model.evaluate`` must return exactly two values (loss, accuracy); both
    are printed rounded to four decimal places.
    """
    test_loss, test_acc = model.evaluate(
        x_test,
        y_test,
        batch_size=batch_size,
        verbose=0,
    )

    rounded_loss = round(test_loss, 4)
    print(f"Test loss: {rounded_loss}")

    rounded_acc = round(test_acc, 4)
    print(f"Test acc: {rounded_acc}")

In [5]:
def load_data(data_path, dictionary_path):
    """Read the pickled samples and dictionary and one-hot encode the labels.

    Args:
        data_path: path to a pickle holding an iterable of
            ``(text, categories)`` pairs.
        dictionary_path: path to a pickle holding a dict; it is returned
            inverted (values become keys and vice versa).

    Returns:
        A ``(texts, labels, dictionary)`` tuple: the list of texts, a list of
        float32 vectors whose length is the size of the inverted dictionary,
        and the inverted dictionary itself.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(data_path, 'rb') as data_file:
        samples = pickle.load(data_file)

    with open(dictionary_path, 'rb') as dictionary_file:
        raw_dictionary = pickle.load(dictionary_file)
    dictionary = {value: key for key, value in raw_dictionary.items()}

    labels_count = len(dictionary)
    texts, labels = [], []
    for sample_text, categories in samples:
        # Single-label encoding: only the first category of a sample is
        # marked, any further categories are ignored.
        one_hot = np.zeros(labels_count, dtype='float32')
        one_hot[categories[0]] = 1.0

        texts.append(sample_text)
        labels.append(one_hot)

    return texts, labels, dictionary

In [6]:
def prepare_data(texts, labels, maxlen=None, max_words=10000):
    """Tokenise the texts and pad the resulting integer sequences.

    Args:
        texts: texts used both to fit the tokenizer and to be converted.
        labels: labels, returned converted with ``np.asarray``.
        maxlen: forwarded to ``pad_sequences`` when given; when ``None`` the
            argument is omitted.
        max_words: ``num_words`` of the tokenizer.

    Returns:
        ``(data, labels, tokenizer)``: padded sequences, label array and the
        fitted tokenizer.
    """
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    # Pass maxlen only when the caller supplied one.
    padding_options = {} if maxlen is None else {'maxlen': maxlen}
    data = pad_sequences(sequences, **padding_options)

    return data, np.asarray(labels), tokenizer

In [7]:
def split_data(data, labels, first_split=0.8, second_split=0.8, random_state=None):
    """Split samples into training, validation and test subsets.

    The data is first split into train/test, then the training part is split
    again into train/validation.

    Args:
        data: samples to split.
        labels: labels aligned with ``data``.
        first_split: ``train_size`` of the train/test split.
        second_split: ``train_size`` of the train/validation split.
        random_state: forwarded to both ``train_test_split`` calls. Pass an
            int to make the splits reproducible; ``None`` (default) keeps
            the previous non-deterministic behaviour.

    Returns:
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, train_size=first_split, random_state=random_state
    )
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_train, y_train, train_size=second_split, random_state=random_state
    )
    return x_train, y_train, x_valid, y_valid, x_test, y_test

### Load and preprocess dataset

Load the preprocessed Yelp dataset and its dictionary. The dictionary is then inverted (its keys and values are swapped).

In [8]:
# Read the pickled samples and the (inverted) dictionary from the configured paths.
texts, labels, dictionary = load_data(data_path, dictionary_path)

Tokenize the preprocessed dataset and pad the resulting sequences

In [9]:
# Turn the texts into padded integer sequences; `labels` is rebound to an array.
data, labels, tokenizer = prepare_data(texts, labels, maxlen=maxlen, max_words=max_words)
# Drop the reference to the raw texts, which are not used again below.
texts = None

Split the data into training, validation and test sets

In [10]:
# Two-stage split: train/test first, then train/validation.
x_train, y_train, x_valid, y_valid, x_test, y_test = split_data(data, labels, first_split=first_split, second_split=second_split)
# Drop the references to the unsplit arrays; only the subsets are used below.
data = None
labels = None

Save tokenizer

In [11]:
# Persist the fitted tokenizer; the Unix timestamp in the file name gives each
# run its own file.
with open(f'../data/tokenizer_{int(time.time())}.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# The tokenizer is not used again in this notebook, so drop the reference.
tokenizer = None

Print information about the datasets

In [12]:
# Report the number of samples (first array dimension) in each subset.
print(f'Training samples:   {x_train.shape[0]}')
print(f'Testing samples:    {x_test.shape[0]}')
print(f'Validating samples: {x_valid.shape[0]}')

Training samples:   1537487
Testing samples:    480465
Validating samples: 384372


### Create Neural Network Model

Model parameters

In [13]:
# Prefix of the file name the trained model is saved under.
model_name = 'tc_model_2'
# Size of each embedding vector produced by the Embedding layer.
embedding_dim = 25
# Batch size used for both training and evaluation.
batch_size = 1024
# Number of training epochs requested from model.fit.
epochs = 10

Create model architecture

In [14]:
model = models.Sequential([
    # embedding layer
    layers.Embedding(max_words, embedding_dim, input_length=len(x_train[0])),

    # 1st convolutional layer
    layers.Conv1D(64, 3, activation=layers.PReLU(), padding='same'),
    layers.AveragePooling1D(3),

    # GRU layers
    layers.GRU(64, return_sequences=True),
    layers.GRU(64),
#     , kernel_regularizer=regularizers.L1L2(l1=1e-4, l2=5e-4)

    # flatten (the last GRU already returns a single vector per sample)
    layers.Flatten(),

    # 1st dense layer
    layers.Dense(len(dictionary), activation=layers.PReLU()),

    # last layer: softmax turns the outputs into a probability distribution
    # over the classes, which is what CategoricalCrossentropy expects by
    # default (from_logits=False) for the one-hot, single-label targets.
    # The previous 'softplus' output was unbounded and did not sum to 1.
    layers.Dense(len(dictionary), activation='softmax')
])


# compile model
model.compile(
    loss=losses.CategoricalCrossentropy(),
    optimizer=optimizers.Nadam(),
    metrics=['acc']
)

# print model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 25)           250000    
                                                                 
 conv1d (Conv1D)             (None, 100, 64)           11264     
                                                                 
 average_pooling1d (AverageP  (None, 33, 64)           0         
 ooling1D)                                                       
                                                                 
 gru (GRU)                   (None, 33, 64)            24960     
                                                                 
 gru_1 (GRU)                 (None, 64)                24960     
                                                                 
 flatten (Flatten)           (None, 64)                0         
                                                        

Train the network

In [15]:
# Save the best model seen so far after every epoch, so that a failure in the
# middle of training (e.g. a GPU/CUDA error) does not discard all progress.
os.makedirs('../models', exist_ok=True)
checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=f'../models/{model_name}_checkpoint.h5',
    monitor='val_loss',
    save_best_only=True,
)

history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_valid, y_valid),
    callbacks=[checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

InternalError: Could not synchronize CUDA stream: CUDA_ERROR_LAUNCH_TIMEOUT: the launch timed out and was terminated

Save the trained model

In [None]:
# Save the model in HDF5 format; the Unix timestamp keeps earlier runs from
# being overwritten.
model.save(f'../models/{model_name}_{int(time.time())}.h5')

### Visualize training results

In [None]:
# Plot the per-epoch metrics recorded by model.fit.
plot_history(history)

In [None]:
# Report loss and accuracy on the held-out test set.
print_evaluation(model, x_test, y_test, batch_size=batch_size)