https://www.tensorflow.org/tutorials/text/text_classification_rnn

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import losses
from keras import regularizers
from keras.models import load_model
import os
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split

In [2]:
print(tf.version.VERSION)

2.4.1


In [3]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])

In [4]:
import pandas as pd

In [5]:
test = pd.read_csv('test.csv')
df_test = pd.DataFrame(test)

train = pd.read_csv('train.csv')
df_train = pd.DataFrame(train)

val = pd.read_csv('val.csv')
df_val = pd.DataFrame(val)

In [6]:
df_test['sentiment'] = pd.Categorical(df_test['sentiment'])
df_test['sentiment'] = df_test.sentiment.cat.codes

df_train['sentiment'] = pd.Categorical(df_train['sentiment'])
df_train['sentiment'] = df_train.sentiment.cat.codes

df_val['sentiment'] = pd.Categorical(df_val['sentiment'])
df_val['sentiment'] = df_val.sentiment.cat.codes

In [7]:
a = tf.keras.utils.to_categorical(df_train['sentiment'], num_classes=3)
dataset = tf.data.Dataset.from_tensor_slices((df_train.check_stop_no_emoji.values, a))

In [8]:
a = tf.keras.utils.to_categorical(df_val['sentiment'], num_classes=3)
dataset_val = tf.data.Dataset.from_tensor_slices((df_val.check_stop_no_emoji.values, a))

In [9]:
a = tf.keras.utils.to_categorical(df_test['sentiment'], num_classes=3)
dataset_test = tf.data.Dataset.from_tensor_slices((df_test.check_stop_no_emoji.values, a))

In [10]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [11]:
train_ds_size = int(df_train.shape[0])
val_ds_size = int(df_val.shape[0])
test_ds_size = int(df_test.shape[0])

train_data = dataset.take(train_ds_size)
val_data = dataset_val.take(val_ds_size)
test_data = dataset_test.take(test_ds_size)

In [12]:
print(len(list(train_data.as_numpy_iterator())))
print(len(list(val_data.as_numpy_iterator())))
print(len(list(test_data.as_numpy_iterator())))

7114
197
594


In [13]:
#all_dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
train_dataset = train_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_dataset = val_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [14]:
for example, label in val_dataset.take(1):
    print('texts: ', example.numpy()[:3])
    print()
    print('labels: ', label.numpy()[:3])

texts:  [b'please allow dine restaurants food premises people take granted lepak'
 b'salahkan kerajaan tak pkp baik pndg diri bila buka sempadan negeri berjalan tidak covid kerajaan tak kena pkp salah tidak pkp salah larangan rentas negeri salah tak larangan rentas negeri salah'
 b'alhamduli semoga esok banyak berkurangan amin syabas']

labels:  [[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [15]:
for example, label in test_dataset.take(1):
    print('texts: ', example.numpy()[:3])
    print()
    print('labels: ', label.numpy()[:3])

texts:  [b'terima kasih rakyat malaysia membantu menjayakan penurunan angka kes baharu kekalkan patuhi sop'
 b'dear government beautician hairstylists industry already eat grass no money no income'
 b'data health ministry indicates number people getting tested covid dropped people tested feb people tested sunday daily testing dropped not sure comparing correctly']

labels:  [[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


### Create Text Encoder

In [16]:
VOCAB_SIZE=10155
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [17]:
len(encoder.get_vocabulary())

10155

The .adapt method sets the layer's vocabulary. Here are the first 20 tokens. After the padding and unknown tokens they're sorted by frequency:

In [18]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'kes', 'tidak', 'pkp', 'covid', 'boleh', 'tak',
       'turun', 'alhamdulillah', 'rakyat', 'tapi', 'sop', 'negeri',
       'baru', 'kena', 'semoga', 'jangan', 'bila', 'banyak'], dtype='<U32')

Once the vocabulary is set, the layer can encode text into indices. The tensors of indices are 0-padded to the longest sequence in the batch (unless you set a fixed output_sequence_length):

In [19]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[  75,   84,   10,   22,  411, 7926,  134,   32,    2,  135,  684,
         251,   12,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [1858,  243,    1,    1, 3260,  723, 4693,    1,   91, 2449,   91,
         924,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 307,  992, 2046, 9185,  343,  188,  656,  951,    5, 2617,  188,
         951,  625,  188,  951, 5979,  494,  464, 2617,   95, 1117,    1,
        4820,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0

With the default settings, the process is not completely reversible. There are three main reasons for that:

1. The default value for preprocessing.TextVectorization's standardize argument is "lower_and_strip_punctuation".
2. The limited vocabulary size and lack of character-based fallback results in some unknown tokens.

In [20]:
for n in range(3):
    print("Original: ", example[n].numpy())
    print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
    print()

Original:  b'terima kasih rakyat malaysia membantu menjayakan penurunan angka kes baharu kekalkan patuhi sop'
Round-trip:  terima kasih rakyat malaysia membantu menjayakan penurunan angka kes baharu kekalkan patuhi sop                                        

Original:  b'dear government beautician hairstylists industry already eat grass no money no income'
Round-trip:  dear government [UNK] [UNK] industry already eat [UNK] no money no income                                         

Original:  b'data health ministry indicates number people getting tested covid dropped people tested feb people tested sunday daily testing dropped not sure comparing correctly'
Round-trip:  data health ministry indicates number people getting tested covid dropped people tested feb people tested sunday daily testing dropped not sure [UNK] correctly                              



<img src="bidirectional.png">

In [21]:
bilstm = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [28]:
checkpoint_path = "training_1/one_bilstm_no_emoji.ckpt"
bilstm.load_weights(checkpoint_path)
loss,acc,micro_f1,macro_f1 = bilstm.evaluate(test_dataset, verbose = 2, batch_size = 32)
print("lss: %.4f" % (loss))
print("acc: %.4f" % (acc))
print("micro f1: %.4f" % (micro_f1))
print("macro f1: %.4f" % (macro_f1))

10/10 - 0s - loss: 0.7244 - accuracy: 0.7239 - micro_f1_score: 0.7239 - macro_f1_score: 0.6457
lss: 0.7244
acc: 0.7239
micro f1: 0.7239
macro f1: 0.6457


### Simple RNN

In [29]:
rnn = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [34]:
checkpoint_path = "training_1/simple_rnn_no_emoji.ckpt"
rnn.load_weights(checkpoint_path)
loss,acc,micro_f1,macro_f1 = rnn.evaluate(test_dataset, verbose = 2, batch_size = 32)
print("lss: %.4f" % (loss))
print("acc: %.4f" % (acc))
print("micro f1: %.4f" % (micro_f1))
print("macro f1: %.4f" % (macro_f1))

10/10 - 0s - loss: 0.8282 - accuracy: 0.6852 - micro_f1_score: 0.6852 - macro_f1_score: 0.6264
lss: 0.8282
acc: 0.6852
micro f1: 0.6852
macro f1: 0.6264


### Simple LSTM

In [37]:
lstm = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        #embeddings_regularizer = tf.keras.regularizers.L2(0.01),
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dense(3)
])

In [42]:
checkpoint_path = "training_1/simple_lstm_no_emoji.ckpt"
lstm.load_weights(checkpoint_path)
loss,acc,micro_f1,macro_f1 = lstm.evaluate(test_dataset, verbose = 2, batch_size = 32)
print("lss: %.4f" % (loss))
print("acc: %.4f" % (acc))
print("micro f1: %.4f" % (micro_f1))
print("macro f1: %.4f" % (macro_f1))

10/10 - 0s - loss: 0.8427 - accuracy: 0.7306 - micro_f1_score: 0.7306 - macro_f1_score: 0.6642
lss: 0.8427
acc: 0.7306
micro f1: 0.7306
macro f1: 0.6642


## Two Layer biLSTM

In [46]:
two_bilstm = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='softmax'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3)
])

In [51]:
checkpoint_path = "training_1/two_bilstm_emoji.ckpt"
two_bilstm.load_weights(checkpoint_path)
loss,acc,micro_f1,macro_f1 = two_bilstm.evaluate(test_dataset, verbose = 2, batch_size = 32)
print("lss: %.4f" % (loss))
print("acc: %.4f" % (acc))
print("micro f1: %.4f" % (micro_f1))
print("macro f1: %.4f" % (macro_f1))

10/10 - 0s - loss: 0.7902 - accuracy: 0.7071 - micro_f1_score: 0.7071 - macro_f1_score: 0.6215
lss: 0.7902
acc: 0.7071
micro f1: 0.7071
macro f1: 0.6215


In [54]:
sample_text = ('relax malaysia still million peoples not infected')

predictions = rnn.predict(np.array([sample_text]))
print("rnn: ")
print(predictions[0])

predictions = lstm.predict(np.array([sample_text]))
print("lstm: ")
print(predictions[0])

predictions = bilstm.predict(np.array([sample_text]))
print("one layer bilstm: ")
print(predictions[0])

predictions = two_bilstm.predict(np.array([sample_text]))
print("two layer bilstm: ")
print(predictions[0])

rnn: 
[ 1.6851637 -1.3144987 -1.4902252]
lstm: 
[ 1.8899117 -1.5894849 -1.7842244]
one layer bilstm: 
[ 1.3816768 -0.7655858 -1.315508 ]
two layer bilstm: 
[-0.574112   0.8526867 -1.0519863]
