In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from datetime import datetime
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Report the TensorFlow release this notebook is running against.
print(tf.__version__)

2.12.0


In [3]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the review archive into the current working directory and unpack it.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [7]:
# Show the directory component of the path returned by get_file.
print(os.path.split(dataset)[0])

.


In [5]:
# Build the path to the `aclImdb` folder that sits next to the downloaded
# archive path, then list its contents.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdbEr.txt', 'train', 'imdb.vocab', 'README', 'test']

In [8]:
# Path of the `train` subfolder inside the dataset directory; list what it holds.
train_set = os.path.join(dataset_dir, 'train')
os.listdir(train_set)

['neg',
 'urls_neg.txt',
 'pos',
 'urls_pos.txt',
 'urls_unsup.txt',
 'labeledBow.feat',
 'unsupBow.feat',
 'unsup']

In [10]:
# Delete the `unsup` folder from the training directory so it is not picked up
# as a class subdirectory when the labeled dataset is built.
# The existence check makes the cell safe to re-run: a bare shutil.rmtree
# raises FileNotFoundError once the folder has already been removed.
remove_dir = os.path.join(train_set, 'unsup')
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [11]:
# List the training directory again to confirm `unsup` is gone.
os.listdir(train_set)

['neg',
 'urls_neg.txt',
 'pos',
 'urls_pos.txt',
 'urls_unsup.txt',
 'labeledBow.feat',
 'unsupBow.feat']

In [12]:
# Batch size and shuffle seed; the same seed and validation_split are passed to
# both calls below so the training and validation subsets come from one split.
batch_size = 1024
seed = 1234

# Read from `train_set` (derived from the download location) instead of a
# hard-coded relative path, so this cell keeps working if the cache directory
# or the working directory changes.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_set, batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_set, batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [17]:
# Peek at one batch: print the tensor shapes, then the first two
# (label, review) pairs.
for text_batch, label_batch in train_ds.take(1):
  print(text_batch.shape)
  print(label_batch.shape)
  # Convert each tensor to numpy once instead of on every iteration.
  labels = label_batch.numpy()
  texts = text_batch.numpy()
  for i in range(2):
    print(labels[i], texts[i])

(1024,)
(1024,)
0 b'This movie was extremely poorly conceived from every angle except technological. I stood and watched everyone waddle out of the theater, their faces drained like their lives flashed before their eyes -- eyes wandering at their neighbor, wondering if it was just them. I mean, how could the movie really be bad. Nobody\'ll admit it, it\'s a classic case of The Emperor Wears No Clothes. "Who am I to question a movie containing a guy who stops a jet liner?" But the fact remains, every member of the audience is thinking what I\'m writing right now. I actually plagiarized their faces.<br /><br />Obviously Lois is only aroused by power, she won\'t even have a cup of coffee with the Superman With Glasses who doesn\'t stop jet liners. It can\'t be the look in "his" eyes to the depths of his soul or anything like that. In the old Supermans, she had some level of connection with him, he wasn\'t priority number 1, obviously, but it strengthened her character that she was "torn".

In [18]:
# Use the stable tf.data.AUTOTUNE constant rather than the alias under
# tf.data.experimental.
AUTOTUNE = tf.data.AUTOTUNE

# Cache both datasets and prefetch with an automatically chosen buffer size.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [19]:
# Demo embedding table: 1000 integer ids, each mapped to a 5-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(input_dim=1000, output_dim=5)

In [32]:
# Three integer ids to look up; id 1 appears twice.
res = tf.constant([1, 1, 3])
res

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 1, 3], dtype=int32)>

In [27]:
# Display the layer's input_dim (the first argument given to Embedding above).
embedding_layer.input_dim


1000

In [33]:
# Pass the ids in `res` through the embedding layer and display the result;
# the output below has one 5-element row per id.
result = embedding_layer(res)
result

<tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[-0.04154678, -0.04529047,  0.00974279,  0.00978858,  0.04587581],
       [-0.04154678, -0.04529047,  0.00974279,  0.00978858,  0.04587581],
       [-0.00075334, -0.03950322, -0.00839678, -0.0259989 ,  0.02940358]],
      dtype=float32)>

In [34]:
# Repeat the lookup with a different last id (2 instead of 3).
res2 = tf.constant([1, 1, 2])
result2 = embedding_layer(res2)
result2

<tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[-0.04154678, -0.04529047,  0.00974279,  0.00978858,  0.04587581],
       [-0.04154678, -0.04529047,  0.00974279,  0.00978858,  0.04587581],
       [ 0.04009653,  0.03935075, -0.01837511,  0.02238211,  0.01861404]],
      dtype=float32)>

In [35]:
# A 2x3 matrix of ids gives a (2, 3, 5) result: one 5-element vector per id.
result3 = embedding_layer(
    tf.constant([[0, 1, 2],
                 [3, 4, 5]]))
result3.shape

TensorShape([2, 3, 5])

In [36]:
def custom_standardization(input_data):
  """Normalize raw review text before tokenization.

  Lowercases the text, replaces each `<br />` tag with a space, and removes
  every character listed in `string.punctuation`.

  Args:
    input_data: string tensor accepted by `tf.strings.lower`.

  Returns:
    The result of the final `tf.strings.regex_replace` call on the cleaned text.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [41]:
# Maximum vocabulary size and the fixed number of tokens kept per review.
vocab_size = 10000
sequence_length = 100

# Tokenization: the TextVectorization layer standardizes each string with
# custom_standardization, splits it into tokens, and maps tokens to integer
# ids. Reviews differ in length, so output_sequence_length fixes every output
# to `sequence_length` ids.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the review text only (labels dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [42]:
# Removed a stray `from keras.layers.attention.multi_head_attention import
# activation`: the name was never used, and the import reached into a private
# Keras module path that is not part of the public API.
embedding_dim = 16

# Pipeline: raw strings -> integer token ids -> per-token embeddings ->
# average over the sequence -> hidden Dense layer -> single output unit.
# The last Dense layer has no activation.
model = Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1),
])

In [43]:
# TensorBoard callback configured with the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [44]:
# The model's last layer is Dense(1) with no activation, so the loss is
# configured with from_logits=True.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [45]:
# Train for 10 epochs on train_ds, validating on val_ds after each epoch and
# passing the TensorBoard callback. The History object returned by fit is the
# cell's displayed value.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[tensorboard_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x79a5bac3b220>

In [46]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,289
Trainable params: 160,289
Non-trai

In [48]:
# Find the embedding layer by type rather than by the auto-generated name
# 'embedding_1': the numeric suffix depends on how many Embedding layers were
# created earlier in the kernel session, so a name lookup breaks when cells
# are re-run or reordered.
model_embedding = next(
    layer for layer in model.layers if isinstance(layer, Embedding))

# Embedding matrix (first weight array of the layer) and the vocabulary learned
# by the vectorization layer.
weights = model_embedding.get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [49]:
# Write one embedding vector per line to vectors.tsv (tab-separated values) and
# the matching word per line to metadata.tsv. The `with` block guarantees both
# files are closed even if an error occurs while writing.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

In [50]:
# Offer the exported files for download when running on Google Colab.
# The paths match where the export cell writes them: 'vectors.tsv' and
# 'metadata.tsv' in the working directory. Only the missing-Colab case is
# tolerated; any other failure (e.g. a missing file) is allowed to surface
# instead of being silently swallowed.
try:
  from google.colab import files
except ImportError:
  print('google.colab is not available; skipping download.')
else:
  files.download('vectors.tsv')
  files.download('metadata.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>