In [1]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import json
import time
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

%load_ext autoreload
%autoreload 2

## Load Data


In [13]:
def load_tweets(path):
    """Read a preprocessed tweet file and split it into token ids and labels.

    Args:
        path: Path of a JSON file. Each row is read as ``row[0]`` (a sequence
            of token ids) and ``row[1]`` (the sentiment label).

    Returns:
        A ``(rows, tweets, labels)`` tuple where ``rows`` is a list of
        ``(int32 token-id array, label string)`` pairs, ``tweets`` is the
        array built from all token-id arrays and ``labels`` is the array of
        labels converted to ``int``.
    """
    # The context manager closes the file handle as soon as parsing is done
    # (a bare json.load(open(...)) leaves it open until garbage collection).
    with open(path, 'r') as f:
        raw_rows = json.load(f)
    rows = [(np.array(row[0], dtype=np.int32), str(row[1])) for row in raw_rows]
    tweets = np.array([tokens for tokens, _ in rows])
    labels = np.array([int(label) for _, label in rows])
    return rows, tweets, labels


# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a vocabulary file that comes from a trusted source.
with open("./tweets_data/vocabulary.pkl", "rb") as f:
    vocabulary = pickle.load(f)

# load our data and separate it into tweets and labels
train_data, train_tweets, train_labels = load_tweets('tweets_data/trainTweets_preprocessed.json')
test_data, test_tweets, test_labels = load_tweets('tweets_data/testTweets_preprocessed.json')

print("size of original train set: {}".format(len(train_tweets)))
print("size of original test set: {}".format(len(test_tweets)))

# only keep the first N_TEST_SAMPLES test samples for evaluation
N_TEST_SAMPLES = 1000
test_tweets = test_tweets[:N_TEST_SAMPLES]
test_labels = test_labels[:N_TEST_SAMPLES]

# Labels are 0/1 (they index sentiment_label below), so their sum is the
# number of positive samples.
n_train_pos = np.sum(train_labels)
n_test_pos = np.sum(test_labels)

print("*"*100)
print("size of train set: {}, #positive: {}, #negative: {}".format(len(train_tweets), n_train_pos, len(train_tweets)-n_train_pos))
print("size of test set: {}, #positive: {}, #negative: {}".format(len(test_tweets), n_test_pos, len(test_tweets)-n_test_pos))

# show text of the idx-th train tweet
# The 'padtoken' is used to ensure each tweet has the same length
idx = 100
train_text = [vocabulary[x] for x in train_tweets[idx]]
print(train_text)
sentiment_label = ["negative", "positive"]
print("sentiment: {}".format(sentiment_label[train_labels[idx]]))

size of original train set: 60000
size of original test set: 20000
****************************************************************************************************
size of train set: 60000, #positive: 30055, #negative: 29945
size of test set: 1000, #positive: 510, #negative: 490
['it', 'will', 'help', 'relieve', 'your', 'stress', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken', 'padtoken']
sentiment: positive


## **Part 1 LSTM Encoder**

The following model receives its inputs as one-hot encoded vectors. To avoid unnecessary memory usage, the encoding is done inside the model, so only the samples of the current batch are one-hot encoded during training.

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Lambda

# Derive the input dimensions from the loaded data instead of hard-coding
# them, so the layers stay consistent with each other if the vocabulary or
# the padding length changes.
seq_len = train_tweets.shape[1]   # number of token ids per (padded) tweet
vocab_size = len(vocabulary)      # depth of the one-hot encoding

model = tf.keras.Sequential()
# One-hot encode inside the model: the integer token ids are expanded to
# (seq_len, vocab_size) per sample only for the batch being processed.
model.add(Lambda(lambda x: tf.one_hot(tf.cast(x, dtype='int32'), vocab_size),
                 output_shape=(seq_len, vocab_size), input_shape=(seq_len,)))
# return_sequences=False: only the final LSTM output is passed on.
model.add(LSTM(64, input_shape=(seq_len, vocab_size), activation='relu', return_sequences=False))
model.add(Dense(32, activation='relu'))
# Two-way softmax over the sentiment classes; paired with the sparse
# categorical cross-entropy loss, which takes integer class labels.
model.add(Dense(2, activation='softmax'))
model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history_LSTM = model.fit(train_tweets, train_labels, epochs=20, batch_size=256,
                         validation_data=(test_tweets, test_labels))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lambda (Lambda)             (None, 20, 7597)          0         
                                                                 
 lstm (LSTM)                 (None, 64)                1961472   
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,963,618
Trainable params: 1,963,618
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 1

## **Part 2: Embedding Lookup layer**

Word embedding: Instead of using a one-hot vector to represent each word, we can add a word embedding matrix in which each word is represented as a low-dimensional vector. This representation is no longer sparse, because we are now working in a continuous vector space. Words that share similar or related semantic meaning should be close to each other in this vector space.

In [None]:
# Import every layer this cell uses so it does not rely on names leaked from
# another cell; `tf.keras` is used below because a bare `keras` name is never
# imported in this notebook.
from tensorflow.keras.layers import Dense, Embedding, LSTM

seq_len = train_tweets.shape[1]   # number of token ids per (padded) tweet
# The embedding table needs one row per vocabulary entry.
# NOTE(review): assumes every token id is < len(vocabulary) - confirm against
# the preprocessing step.
vocab_size = len(vocabulary)

inputs = tf.keras.Input(shape=(seq_len,), name="digits")
x1 = Embedding(vocab_size, 64, input_length=seq_len)(inputs)   # token ids -> 64-d vectors
x2 = LSTM(64, input_shape=(seq_len, 64), activation='relu', return_sequences=False)(x1)
x3 = Dense(32, activation='relu')(x2)
outputs = Dense(2, activation='softmax', name="predictions")(x3)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
# The model already ends in a softmax, hence from_logits=False.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

batch_size = 64

# Prepare the training dataset.
train_dataset = tf.data.Dataset.from_tensor_slices((train_tweets, train_labels))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

# Prepare the validation dataset.
val_dataset = tf.data.Dataset.from_tensor_slices((test_tweets, test_labels))
val_dataset = val_dataset.batch(batch_size)

epochs = 20
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # Training pass: one optimizer step per batch.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        # Record the forward pass so the loss can be differentiated with
        # respect to the trainable weights.
        with tf.GradientTape() as tape:
            probs = model(x_batch_train, training=True)   # softmax probabilities
            loss_value = loss_fn(y_batch_train, probs)
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))

        train_acc_metric.update_state(y_batch_train, probs)

        if step % 400 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )

    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))
    # Reset so the next epoch's accuracy does not include this epoch's batches.
    train_acc_metric.reset_states()

    # Validation pass: forward only, no weight updates.
    for x_batch_val, y_batch_val in val_dataset:
        val_probs = model(x_batch_val, training=False)
        # Update val metrics
        val_acc_metric.update_state(y_batch_val, val_probs)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    print("Time taken: %.2fs" % (time.time() - start_time))


Start of epoch 0
Training loss (for one batch) at step 0: 0.6931
Training loss (for one batch) at step 400: 0.5351
Training loss (for one batch) at step 800: 0.5044
Training acc over epoch: 0.7262
Validation acc: 0.7690
Time taken: 87.09s

Start of epoch 1
Training loss (for one batch) at step 0: 0.4460
Training loss (for one batch) at step 400: 0.4601
Training loss (for one batch) at step 800: 0.5267
Training acc over epoch: 0.7892
Validation acc: 0.7670
Time taken: 87.72s

Start of epoch 2
Training loss (for one batch) at step 0: 0.4015
Training loss (for one batch) at step 400: 0.3827
Training loss (for one batch) at step 800: 0.3712
Training acc over epoch: 0.8075
Validation acc: 0.7670
Time taken: 86.89s

Start of epoch 3
Training loss (for one batch) at step 0: 0.3565
Training loss (for one batch) at step 400: 0.3220
Training loss (for one batch) at step 800: 0.3188
Training acc over epoch: 0.8217
Validation acc: 0.7630
Time taken: 87.36s

Start of epoch 4
Training loss (for one