In [2]:
# Source: https://www.tensorflow.org/text/tutorials/text_classification_rnn
# Code taken from the TensorFlow tutorials.
# TODO: adapt to use simple tweet data rather than pre-built TF datasets.

In [27]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers

In [28]:
# Turn off the tensorflow_datasets progress bars to keep cell output short.
tfds.disable_progress_bar()

In [29]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Draw a training metric and its validation counterpart against epochs.

  Plots onto the current matplotlib axes; creates no figure and returns nothing.

  Args:
    history: object whose `.history` attribute is indexable by metric name
      (presumably the value returned by `model.fit` -- confirm at call site).
    metric: name of the training metric, e.g. 'accuracy' or 'loss'. The
      validation series is looked up under 'val_' + metric, so both keys
      must be present in `history.history`.
  """
  val_metric = 'val_'+metric
  training_curve = history.history[metric]
  validation_curve = history.history[val_metric]

  plt.plot(training_curve)
  plt.plot(validation_curve, '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

In [30]:
# Load the IMDB reviews dataset together with its metadata. With
# as_supervised=True each element is a (text, label) pair.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

In [31]:
# Pull the 'train' and 'test' splits out of the loaded dataset.
train_dataset, test_dataset = dataset['train'], dataset['test']

In [32]:
# Inspect the structure (shape and dtype) of one training element.
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [33]:
# Peek at a single (text, label) pair from the training split.
# NOTE: `example` and `label` stay defined in the notebook after the loop.
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


2022-01-15 15:13:58.112917: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [34]:
BUFFER_SIZE = 10000  # passed to shuffle() when building the training pipeline
BATCH_SIZE = 64  # passed to batch() for both the training and test pipelines

In [35]:
# Build the batched pipelines from the raw splits in `dataset` rather than
# from `train_dataset`/`test_dataset` themselves. Re-assigning a name from
# itself would batch already-batched data if this cell were run twice;
# starting from the raw splits keeps the cell idempotent.
# Only the training split is shuffled; both are batched and prefetched.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [36]:
# Show the first three texts and labels from one training batch.
# NOTE: `example` keeps holding this batch after the loop; the encoding and
# round-trip cells further down reuse it.
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b'"La Lupa Mannara" aka. "Werewolf Woman" of 1976 is a film with a highly promising title, but, sadly, the film itself is pretty far away from being a must-see for my fellow Italian Horror buffs. You won\'t hear me say that Rino Di Silvestri\'s film is entirely bad - it has its stylish moments, and the first half is actually great fun to watch (though the fun is unintentional). The film also profits from an exceptionally exhibitionist leading actress, Annik Borel. However, the film, which has no real plot (at least no linear one) often makes no sense at all, and it drags incredibly throughout the mostly superfluous second half.<br /><br />Daniella (Annik Borel) has strange dreams about a dancing around naked in the night before turning into a Werewolf Woman. Since she was a raped as a girl, Daniella is afraid of men. Then, when her sister (cult siren Dagmar Lassander) comes to visit with her husband, Daniella suddenly feels attracted to the husband and subsequently turns into 

2022-01-15 15:13:58.257290: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [37]:
VOCAB_SIZE = 1000  # passed as max_tokens to the TextVectorization layer

In [38]:
# Text-to-token-id layer whose vocabulary is capped at VOCAB_SIZE entries.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

In [39]:
# Fit the encoder's vocabulary on the training texts only; the map() call
# drops the labels so adapt() sees just the text component.
encoder.adapt(train_dataset.map(lambda text, label: text))

2022-01-15 15:13:58.293966: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [40]:
# Keep the vocabulary as a NumPy array so it can be indexed with arrays of
# token ids (used for the round-trip decoding further down).
vocab = np.array(encoder.get_vocabulary())

In [41]:
# Display the first 20 vocabulary entries.
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [45]:
#typical text encoding via vocab (sorted by frequency) indexing
encoded_example = encoder(example)[:3].numpy(); encoded_example

array([[979,   1,   1, ...,   0,   0,   0],
       [ 10,   1,  90, ...,   0,   0,   0],
       [ 74, 685,  56, ...,   0,   0,   0]])

In [44]:
# The vectorization is a little lossy: decoding the token ids through
# `vocab` does not reproduce the original text exactly (ids 0 and 1 map to
# '' and '[UNK]' respectively).
for n in range(3):
  original_text = example[n].numpy()
  round_trip_text = " ".join(vocab[encoded_example[n]])
  print("Original: ", original_text)
  print("Round-trip: ", round_trip_text)
  print()

Original:  b'"La Lupa Mannara" aka. "Werewolf Woman" of 1976 is a film with a highly promising title, but, sadly, the film itself is pretty far away from being a must-see for my fellow Italian Horror buffs. You won\'t hear me say that Rino Di Silvestri\'s film is entirely bad - it has its stylish moments, and the first half is actually great fun to watch (though the fun is unintentional). The film also profits from an exceptionally exhibitionist leading actress, Annik Borel. However, the film, which has no real plot (at least no linear one) often makes no sense at all, and it drags incredibly throughout the mostly superfluous second half.<br /><br />Daniella (Annik Borel) has strange dreams about a dancing around naked in the night before turning into a Werewolf Woman. Since she was a raped as a girl, Daniella is afraid of men. Then, when her sister (cult siren Dagmar Lassander) comes to visit with her husband, Daniella suddenly feels attracted to the husband and subsequently turns int

In [51]:
# Assemble the classifier layer by layer:
#   raw string -> token ids -> embeddings -> BiLSTM -> dense -> single output.
model = tf.keras.Sequential()

# The adapted TextVectorization layer lets the model accept raw strings.
model.add(encoder)

# One 64-dimensional embedding per vocabulary entry. Masking is used to
# handle the variable sequence lengths.
model.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    mask_zero=True))

# Bidirectional LSTM wrapper around a 64-unit LSTM.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Classification head: hidden ReLU layer, then one output unit with no
# activation.
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [64]:
# A clearly negative review used to sanity-check the model's output.
sample_text = (
    'This movie was terrible. I hated it. I never want to see it again. '
    'It was the worst. A bad movie. Truly awful.'
)

In [65]:
# Run the model on a one-element batch holding the raw sample string; the
# encoder layer inside the model does the tokenization. This cell sits before
# the compile/fit cells, so in notebook order the model is still untrained.
predictions = model.predict(np.array([sample_text]))

In [66]:
# Output of the final Dense(1) layer for the sample. That layer has no
# activation, so this is a raw score (logit), not a probability.
print(predictions[0])

[-0.01128443]


In [67]:
# The final Dense(1) layer has no activation, so the model emits raw logits.
# The loss therefore uses from_logits=True, and accuracy must threshold the
# output at 0.0 (logit 0 corresponds to probability 0.5).
# The string 'accuracy' would resolve to binary accuracy with its default
# threshold of 0.5, which is meant for probabilities: logits between 0 and
# 0.5 would be counted as negative predictions and distort the metric.
# The metric is named 'accuracy' so the history keys remain 'accuracy' and
# 'val_accuracy', which the plotting cell relies on.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0,
                                                       name='accuracy')])

In [68]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, None, 64)          64000     
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_9 (Dense)             (None, 64)                8256      
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 138,369
Trainable params: 138,369
Non-tr

In [69]:
# Train for 10 epochs. validation_steps=30 caps each end-of-epoch validation
# pass at 30 batches of the test pipeline instead of the full test set.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10


2022-01-15 15:23:24.096271: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-15 15:23:24.559156: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-15 15:23:24.662755: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-15 15:23:27.900393: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-01-15 15:23:28.007434: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


 30/391 [=>............................] - ETA: 1:03:36 - loss: 0.6932 - accuracy: 0.4911

KeyboardInterrupt: 

In [None]:
# Side-by-side training curves; requires `history` from a completed fit().
plt.figure(figsize=(16, 8))

# Left panel: accuracy, with the y-axis capped at 1.
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(top=1)

# Right panel: loss, with the y-axis starting at 0.
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(bottom=0)