In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pickle
import keras
import time
import os

from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras_preprocessing import sequence
from keras.preprocessing import text
from keras import preprocessing
from keras import regularizers
from keras import activations
from keras import optimizers
from keras import callbacks
from keras import layers
from keras import losses
from keras import models

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

Parameters

In [2]:
# Pickled inputs: the dataset itself and the label dictionary that goes with it.
data_path = '../data/nn_data_1680619232.pickle'
dictionary_path = '../data/dictionary_1680619234.pickle'

# Tokenisation limits: sequence length given to pad_sequences and
# vocabulary size given to the Tokenizer.
maxlen = 200
max_words = 10000

# train_size fractions for the two consecutive splits performed by split_data
# (first: train+valid vs. test, second: train vs. valid).
first_split, second_split = 0.8, 0.8

General functions

In [None]:
def plot_history(history):
    """Plot the training and validation loss curves held in a fit history.

    Args:
        history: Object whose ``history`` attribute maps series names to
            per-epoch value lists (the object returned by ``model.fit``).

    Returns:
        None. When no training-loss series exists (a key containing 'loss'
        but not 'val'), a message is printed and nothing is plotted.
    """
    records = history.history
    train_keys = [name for name in records if 'loss' in name and 'val' not in name]
    valid_keys = [name for name in records if 'loss' in name and 'val' in name]

    if not train_keys:
        print('Loss is missing in history')
        return

    # 1-based epoch numbers, sized from the first training-loss series.
    epoch_axis = range(1, len(records[train_keys[0]]) + 1)

    plt.figure(1)
    curve_groups = (
        ('b', 'Training loss', train_keys),
        ('g', 'Validation loss', valid_keys),
    )
    for colour, caption, keys in curve_groups:
        for key in keys:
            series = records[key]
            # The legend entry carries the final-epoch value with five decimals.
            plt.plot(epoch_axis, series, colour, label=f'{caption} ({series[-1]:.5f})')

    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
def print_confusion_matrix(model, x_test, y_test, labels):
    """Placeholder for a confusion-matrix report; not implemented yet.

    All four arguments are accepted but ignored, and None is returned.
    """
    # TODO: implement
    return None

In [None]:
def print_evaluation(model, x_test, y_test, batch_size):
    """Evaluate ``model`` on the test data and print the loss rounded to 2 decimals.

    Args:
        model: Object exposing ``evaluate(x, y, batch_size=..., verbose=...)``.
        x_test: Test inputs, forwarded unchanged to ``model.evaluate``.
        y_test: Test targets, forwarded unchanged to ``model.evaluate``.
        batch_size: Batch size forwarded to ``model.evaluate``.

    Returns:
        None; the result is written to stdout as ``Test loss: <value>``.
    """
    result = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)

    # Keras returns a bare scalar when the model has a single output and no
    # metrics, but a list whose first entry is the loss otherwise. Accept
    # both so that adding metrics to compile() does not break this report.
    loss = result[0] if isinstance(result, (list, tuple)) else result

    print(f"Test loss: {round(loss, 2)}")

In [3]:
def load_data(data_path, dictionary_path):
    """Load the pickled dataset and label dictionary and multi-hot encode the labels.

    Args:
        data_path: Path to a pickle holding a sized sequence of
            ``(text, categories)`` pairs; every entry of ``categories`` is
            used as a column index into that sample's label row.
        dictionary_path: Path to a pickle holding a dict. It is inverted
            (values become keys) before being returned, and its size
            determines the number of label columns.

    Returns:
        A ``(texts, labels, dictionary)`` tuple where ``texts`` is a list with
        one entry per sample, ``labels`` is a float32 array of shape
        ``(len(data), len(dictionary))`` holding 1.0 in every listed category
        column and 0.0 elsewhere, and ``dictionary`` is the inverted dict.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point these paths at files you produced or otherwise trust.
    with open(data_path, 'rb') as file:
        data = pickle.load(file)

    with open(dictionary_path, 'rb') as file:
        dictionary = pickle.load(file)
    dictionary = {value: key for key, value in dictionary.items()}

    labels_count = len(dictionary)

    # One contiguous 2-D array instead of a Python list of per-sample vectors:
    # a list would stay a list through train_test_split, and Keras interprets
    # a list of arrays passed as `y` as one target per model output rather
    # than a single (samples, labels) target matrix.
    labels = np.zeros((len(data), labels_count), dtype='float32')
    texts = []
    for row, (sample_text, categories) in enumerate(data):
        texts.append(sample_text)
        for category in categories:
            labels[row, category] = 1.0

    return texts, labels, dictionary

In [4]:
def prepare_data(texts, labels, maxlen=None, max_words=10000):
    """Convert raw texts into a padded matrix of token ids.

    Args:
        texts: Texts used both to fit the tokenizer and to be encoded.
        labels: Returned unchanged alongside the encoded texts.
        maxlen: Sequence length forwarded to ``pad_sequences``; ``None``
            leaves the length choice to ``pad_sequences``.
        max_words: ``num_words`` limit given to the ``Tokenizer``.

    Returns:
        A ``(data, labels)`` tuple: the padded sequences and the untouched labels.
    """
    vocabulary = Tokenizer(num_words=max_words)
    vocabulary.fit_on_texts(texts)

    # maxlen=None is pad_sequences' own default, so forwarding the argument
    # directly covers both the "fixed length" and the "no limit" case.
    padded = pad_sequences(vocabulary.texts_to_sequences(texts), maxlen=maxlen)

    return padded, labels

In [5]:
def split_data(data, labels, first_split=0.8, second_split=0.8, random_state=None):
    """Partition the samples into training, validation and test subsets.

    The data is first split into (train + valid) and test, then the first
    part is split again into train and valid.

    Args:
        data: Feature samples.
        labels: Targets aligned with ``data``.
        first_split: ``train_size`` of the first split, i.e. the fraction kept
            for training plus validation; the remainder becomes the test set.
        second_split: ``train_size`` of the second split, i.e. the fraction of
            the kept part used for training; the remainder becomes validation.
        random_state: Seed forwarded to both ``train_test_split`` calls. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int to get the same partition on every run.

    Returns:
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, train_size=first_split, random_state=random_state
    )
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_train, y_train, train_size=second_split, random_state=random_state
    )
    return x_train, y_train, x_valid, y_valid, x_test, y_test

### Load and preprocess dataset

Load the preprocessed Yelp dataset and its label dictionary. The dictionary is inverted on load, so its original values become the keys.

In [6]:
# Read both pickles; `dictionary` comes back already inverted.
texts, labels, dictionary = load_data(
    data_path=data_path,
    dictionary_path=dictionary_path,
)

Tokenize the preprocessed texts and pad or truncate every sequence to `maxlen` tokens

In [7]:
# Encode the texts as padded token-id sequences; labels are passed through.
data, labels = prepare_data(
    texts=texts,
    labels=labels,
    maxlen=maxlen,
    max_words=max_words,
)

Split the data into training, validation and test sets

In [8]:
# Two-stage split using the fractions from the parameters cell.
x_train, y_train, x_valid, y_valid, x_test, y_test = split_data(
    data=data,
    labels=labels,
    first_split=first_split,
    second_split=second_split,
)

Print information about the datasets

In [9]:
# Report the number of rows (first axis) in each split.
for caption, split in (
    ('Training samples:   ', x_train),
    ('Testing samples:    ', x_test),
    ('Validating samples: ', x_valid),
):
    print(f'{caption}{split.shape[0]}')

Training samples:   1537487
Testing samples:    480465
Validating samples: 384372


### Create Neural Network Model

Model parameters

In [10]:
# Prefix of the file name the trained model is saved under.
model_name = 'tc_model_1'

# Output dimension of the Embedding layer.
embedding_dim = 25

# Training settings passed to model.fit (batch_size is reused for evaluation).
batch_size, epochs = 128, 20

Create model architecture

In [11]:
def _kernel_regularizer():
    """Return a fresh L1L2 regulariser with the settings used by every weighted layer."""
    return regularizers.L1L2(l1=1e-4, l2=5e-4)


def _conv_stage():
    """Return one stage: Conv1D(64, 3, PReLU, 'same') -> AveragePooling1D(3) -> Dropout(0.5)."""
    return [
        layers.Conv1D(
            64,
            3,
            activation=layers.PReLU(),
            padding='same',
            kernel_regularizer=_kernel_regularizer(),
        ),
        layers.AveragePooling1D(3),
        layers.Dropout(0.5),
    ]


model = models.Sequential([
    # Token ids -> dense vectors; the input length is taken from the first training sample.
    layers.Embedding(max_words, embedding_dim, input_length=len(x_train[0])),

    # Three identical convolutional stages.
    *_conv_stage(),
    *_conv_stage(),
    *_conv_stage(),

    # Collapse the sequence axis before the dense head.
    layers.Flatten(),

    # Hidden dense layer.
    layers.Dense(48, activation=layers.PReLU(), kernel_regularizer=_kernel_regularizer()),
    layers.Dropout(0.5),

    # Output layer: one sigmoid unit per dictionary entry.
    layers.Dense(len(dictionary), activation='sigmoid', kernel_regularizer=_kernel_regularizer()),
])

# NOTE(review): 'mae' with sigmoid outputs over multi-hot targets is an unusual
# pairing; binary cross-entropy is the conventional loss for this setup.
# Confirm that MAE is intentional.
model.compile(optimizer=optimizers.Nadam(), loss='mae')

# Show the layer-by-layer summary.
model.summary()

Metal device set to: Apple M1


2023-04-04 18:11:07.182375: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-04-04 18:11:07.182521: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 25)           250000    
                                                                 
 conv1d (Conv1D)             (None, 200, 64)           17664     
                                                                 
 average_pooling1d (AverageP  (None, 66, 64)           0         
 ooling1D)                                                       
                                                                 
 dropout (Dropout)           (None, 66, 64)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 64)            16576     
                                                                 
 average_pooling1d_1 (Averag  (None, 22, 64)           0         
 ePooling1D)                                            

Train the network

In [12]:
# Train on the training split; the validation split is supplied as validation_data.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_valid, y_valid),
)


KeyboardInterrupt



Save the trained model

In [None]:
# The file name carries the current Unix time in whole seconds to tell runs apart.
saved_at = int(time.time())
model.save(f'../data/{model_name}_{saved_at}.h5')

### Visualize training results

In [None]:
# Draw the loss curves recorded during model.fit.
plot_history(history=history)

In [None]:
# TODO: call print_confusion_matrix here once it is implemented

In [None]:
# Report the loss on the held-out test split, using the training batch size.
print_evaluation(model, x_test, y_test, batch_size)