In [1]:
import io
import os
import re
import shutil
import string
from datetime import datetime

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Get data

In [2]:
# Download the IMDB review archive next to the notebook and extract it.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The extracted 'aclImdb' folder sits alongside the path returned by get_file.
download_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(download_parent, 'aclImdb')
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [3]:
# checking training folder: build the path to the 'train' subfolder of the
# extracted dataset and list its entries
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [4]:
# remove unnecessary folders
# 'unsup' is deleted so that only the 'neg' and 'pos' folders remain as
# subdirectories of the training folder.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the deletion so that re-running this cell (or Restart & Run All with the
# data already prepared) does not fail with FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [5]:
# create train and val dataset from the training folder
batch_size = 1024
seed = 123

# Both calls read from `train_dir` (the path derived from the download location)
# rather than a hard-coded relative path, so the notebook keeps working if the
# cache location or the working directory changes.
# The same validation_split and seed are passed to both calls; only `subset`
# differs, selecting which side of the 80/20 split is returned.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2, # hold out 20% of the files for validation
    subset='training', # this call returns the training portion
    seed=seed
)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation', # this call returns the validation portion
    seed=seed
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [6]:
# look at the first five (label, review) pairs of a single training batch
for text_batch, label_batch in train_ds.take(1):
    texts = text_batch.numpy()  # convert the batch of reviews once
    for i in range(5):
        print(label_batch[i].numpy(), texts[i])

0 b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"
1 b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (without

In [7]:
# configure dataset
AUTOTUNE = tf.data.experimental.AUTOTUNE

# apply the same cache + prefetch configuration to both datasets
train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

# Embedding layer

In [8]:
# embed a 1000 word vocab into 5 dimensions
embedding_layer = Embedding(input_dim=1000, output_dim=5)

In [9]:
# embedding is initialized with random weights
# an input integer is mapped to the embedding vector
token_ids = tf.constant([1, 2, 3])
result = embedding_layer(token_ids)
result.numpy()

array([[-0.03453887,  0.02624008,  0.0061922 , -0.01113962, -0.0266305 ],
       [-0.00427425, -0.00174845,  0.01750431, -0.01562049, -0.04429134],
       [-0.03524852,  0.01936065,  0.0251439 ,  0.01519961,  0.00486571]],
      dtype=float32)

In [10]:
# embedding layer takes 2D input tensor of integers (samples, sequence_length)
# outputs (samples, sequence_length, N) where N is the embedding dimension
batch_ids = tf.constant([[0, 1, 2], [3, 4, 5]])
result = embedding_layer(batch_ids)
result.shape

TensorShape([2, 3, 5])

# Text preprocessing

In [11]:
# custom standardization function to remove html tags
def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' with a space and strip punctuation.

    Args:
        input_data: string tensor to standardize.

    Returns:
        The string tensor after the three transformations, applied in that order.
    """
    # character class matching any single ASCII punctuation character
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_class, '')
    return text

# vocabulary size and fixed sequence length used by the vectorization layer
vocab_size = 10000
sequence_length = 100

# TextVectorization normalizes, splits and maps strings to integers,
# which can then be used to train the embedding layer
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_mode='int',
    # fixed output length, since the samples do not all have the same length
    output_sequence_length=sequence_length,
)

# drop the labels to get a text-only dataset, then build the vocabulary
# from the training set
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

# Build model

In [12]:
embedding_dim = 16

# layers in the order the data flows through them
model_layers = [
    # transforms strings to vocab indices
    vectorize_layer,
    # embedding vectors are trained along with the rest of the model
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embedding'),
    # averages over the sequence dimension to give one fixed-length vector per sample
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    # single output unit with no activation
    Dense(units=1),
]

model = Sequential(model_layers)

In [13]:
# Tensorboard callback: configured with log_dir='logs' and later passed to
# model.fit through its `callbacks` argument
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [14]:
# from_logits=True because the final Dense(1) layer has no activation
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

model.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy'],
)

In [15]:
# train for 15 epochs, validating on val_ds and logging through the TensorBoard callback
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x2825b19dd60>

In [16]:
# print the layer-by-layer summary of the model (output shapes and parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


# Save word embedding

In [18]:
# look up the layer named 'embedding' and take the first array from get_weights()
trained_embedding = model.get_layer(name='embedding')
weights = trained_embedding.get_weights()[0]

# vocabulary terms from the vectorization layer
vocab = vectorize_layer.get_vocabulary()

In [19]:
# save weights to disk
# vectors.tsv gets one tab-separated embedding vector per line and metadata.tsv
# gets the matching word on the same line number.
# Opening both files in a `with` block guarantees they are flushed and closed
# even if a write raises part-way through the loop.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue # skip 0, it is padding
        vec = weights[index]
        out_v.write('\t'.join(str(x) for x in vec) + '\n')
        out_m.write(word + '\n')