### Sentiment Analysis using TensofFlow

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_sa

In [None]:
# Configuration
config = {
    'smoke_test_size': 500,  # Length of training set. 0 for all reviews.
    'epochs': 10,             # Total number of epochs
    'batch_size': 100,        # Batch size for each epoch
    'training_dim': 200,     # Number of tokens (words) to put into each review.
    'vocab_size': 7000,      # Vocabulary size
    'output_size': 1,
    'embedding_dim': 400,
    'hidden_dim': 256,
    'n_layers': 2,
    'lr': 0.001
}

In [None]:
tfds.disable_progress_bar()

dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']



In [3]:
buffer_size = 10000
batch_size = config['batch_size']

train_dataset, valid_dataset = tensorflow_sa.get_train_valid_data(config)
test_dataset = tensorflow_sa.get_test_data(config)

#train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
#valid_dataset = valid_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
#test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

2022-03-14 14:42:53.763586: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
#print(train_dataset.shape())
#print(valid_dataset.shape())
print(type(train_dataset))
print(type(valid_dataset))

#info
#train_dataset.element_spec

In [None]:
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

In [None]:

for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

In [None]:
vocab_size = config['vocab_size']
encoder = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
encoder.adapt(train_dataset.map(lambda text, label: text))

vocab = np.array(encoder.get_vocabulary())
vocab[:20]

In [None]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

In [None]:
for n in range(3):
  print("Original: ", example[n].numpy())
  print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
  print()

In [4]:

embedding_dim = config['embedding_dim']
hidden_dim = config['hidden_dim']
lr = config['lr']
output_size = config['output_size']
vocab_size = config['vocab_size']

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_dim)),
    tf.keras.layers.Dense(hidden_dim, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(output_size)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(lr),
              metrics=['accuracy'])

In [6]:
epochs = config['epochs']

history = model.fit(train_dataset, epochs=epochs,
                    validation_data=valid_dataset, validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Loss and accuracy with two layers and using my encoding:
    Test Loss: 0.3576638996601105
    Test Accuracy: 0.8377599716186523

Loss and accuracy with two layers, using my parameters, my encoding, with better shuffling:
    Test Loss: 0.3856273293495178
    Test Accuracy: 0.834119975566864

In [None]:
today = datetime.now()
model_path = f'sa_lstm_local_{today.year}_{today.month:02}_{today.day:02}.tf'
tensorflow_sa.save_model(model, model_path)
model_path

In [None]:
model = tensorflow_sa.load_model(model_path)

In [None]:
def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])


plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

In [None]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))

predictions