# Word Embeddings

## Representing text as numbers

In [1]:
# The first thing you must do is come up with a strategy
# to convert strings to numbers (or to 'vectorize' the text).
# There are three strategies for doing so.

## One-hot encoding

In [2]:
# This approach is inefficient: a one-hot vector is sparse (every entry
# but one is zero) and is as long as the vocabulary.

## Encode each word with a unique number

In [3]:
# There are two downsides to this approach:
# 1. The integer encoding is arbitrary (it does not capture any
#    relationship between words).
# 2. Because there is no relationship between the similarity of any
#    two words and the similarity of their encodings, the feature-weight
#    combination a model learns from them is not meaningful.

## Word embeddings

In [5]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from datetime import datetime
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [6]:
# Root of the aclImdb dataset, given relative to the notebook's working directory.
dataset_dir = '../../data/aclImdb'
train_dir = os.path.join(dataset_dir, 'train')
# Last expression of the cell: display the entries of the training directory.
os.listdir(train_dir)


['urls_unsup.txt',
 'neg',
 'urls_pos.txt',
 'urls_neg.txt',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat']

In [7]:
batch_size = 1024
seed = 123

# Arguments shared by both splits. The training and validation subsets are
# carved out of the same directory, so they must be built with the same
# validation_split and seed.
split_kwargs = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, subset='training', **split_kwargs)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, subset='validation', **split_kwargs)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [9]:
# Peek at the first five (label, review) pairs of the first training batch.
for text_batch, label_batch in train_ds.take(1):
    texts = text_batch.numpy()  # convert the batch once instead of per sample
    for i in range(5):
        print(label_batch[i].numpy(), texts[i])

0 b"Wow. Some movies just leave me speechless. This was undeniably one of those movies. When I left the theatre, not a single word came to my mouth. All I had was an incredible urge to slam my head against the theatre wall to help me forget about the last hour and a half. Unfortunately, it didn't work. Honestly, this movie has nothing to recommend. The humor was at the first grade level, at best, the acting was overly silly, and the plot was astronomically far-fetched. I hearby pledge never to see an other movie starring Chris Kattan or any other cast-member of SNL."
1 b'If any show in the last ten years deserves a 10, it is this rare gem. It allows us to escape back to a time when things were simpler and more fun. Filled with heart and laughs, this show keeps you laughing through the three decades of difference. The furniture was ugly, the clothes were colorful, and the even the drugs were tolerable. The hair was feathered, the music was accompanied by roller-skates, and in the words 

In [10]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Apply the same cache + prefetch pipeline to both datasets.
train_ds, val_ds = [
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
]

In [11]:
# Demo layer: a lookup table for 1000 integer ids, each mapped to a 5-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(input_dim=1000, output_dim=5)

In [13]:
# Id 1 appears twice, so the first and last rows of the result are identical.
token_ids = tf.constant([1, 2, 3, 1])
result = embedding_layer(token_ids)
result.numpy()

array([[ 0.02982171,  0.0093401 ,  0.00635153, -0.01102989, -0.01340748],
       [ 0.04729998, -0.01149049, -0.04771422,  0.02998041,  0.01231867],
       [ 0.01844174,  0.02626921,  0.02673053,  0.02100826, -0.04627996],
       [ 0.02982171,  0.0093401 ,  0.00635153, -0.01102989, -0.01340748]],
      dtype=float32)

In [14]:
# One 5-dimensional embedding vector per input id.
result.shape

TensorShape([4, 5])

In [15]:
# A 2-D batch of ids (2 sequences of 3 ids each): the output gains a
# trailing axis holding the embedding vector of every id.
id_batch = tf.constant([[0, 1, 2], [3, 4, 5]])
result = embedding_layer(id_batch)
result.shape

TensorShape([2, 3, 5])

In [16]:
# Display the looked-up embedding vectors for the 2-D batch of ids.
result.numpy()

array([[[ 0.02983781, -0.0420953 , -0.03969896, -0.04420767,
         -0.02993977],
        [ 0.02982171,  0.0093401 ,  0.00635153, -0.01102989,
         -0.01340748],
        [ 0.04729998, -0.01149049, -0.04771422,  0.02998041,
          0.01231867]],

       [[ 0.01844174,  0.02626921,  0.02673053,  0.02100826,
         -0.04627996],
        [-0.0037847 ,  0.02906689, -0.01886067,  0.0402845 ,
          0.0196395 ],
        [-0.02685142,  0.02357025,  0.04514344,  0.01985129,
          0.01879085]]], dtype=float32)

In [18]:
def custom_standardization(input_data):
    """Normalise raw review text before tokenisation.

    Applies three steps in order: lowercase the text, replace every
    '<br />' tag with a single space, and delete all characters listed in
    ``string.punctuation``.

    Args:
        input_data: string tensor accepted by ``tf.strings`` ops.

    Returns:
        The standardised string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

vocab_size = 10000
sequence_length = 100

# Maps each review to a sequence of integer token ids. The vocabulary is
# capped at vocab_size entries and output_sequence_length fixes the length
# of every encoded review.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# adapt() only needs the raw text, so drop the labels before building the vocabulary.
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [19]:
embedding_dim = 16

# vectorize_layer is the first layer, so the model is fed raw strings.
model = Sequential([
    vectorize_layer,
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embedding'),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=1),
])

# The last Dense layer has no activation, so the loss is configured with
# from_logits=True.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Training logs for TensorBoard are written under ./logs.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [20]:
# Train for 15 epochs, evaluating on val_ds after each epoch and logging to TensorBoard.
model.fit(train_ds,
          validation_data=val_ds,
          epochs=15,
          callbacks=[tensorboard_callback])

Epoch 1/15
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fd32129ee10>

In [21]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_1 (TextVe (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Vocabulary learned by adapt(); list position i is the token with id i.
vocab = vectorize_layer.get_vocabulary()
print(vocab[:10])

# Fetch the trained embedding matrix from the layer named 'embedding'
# (the first entry of the layer's weight list).
embedding = model.get_layer('embedding')
weights = embedding.get_weights()[0]
print(weights.shape)

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']
(10000, 16)


In [24]:
# Export the trained embeddings as two tab-separated files:
#   vecs.tsv - one embedding vector per line, components separated by tabs
#   meta.tsv - the matching vocabulary word on the same line number
# The file names are defined once so that the files written and the files
# offered for download always match.
vecs_path = 'vecs.tsv'
meta_path = 'meta.tsv'

# `with` guarantees both files are flushed and closed even if the loop raises.
with io.open(vecs_path, 'w', encoding='utf-8') as out_v, \
        io.open(meta_path, 'w', encoding='utf-8') as out_m:
    for num, word in enumerate(vocab):
        if num == 0:
            continue  # index 0 is the empty padding entry of the vocabulary
        vec = weights[num]
        out_m.write(word + '\n')
        out_v.write('\t'.join([str(x) for x in vec]) + '\n')

# When running inside Google Colab, offer both files for download;
# elsewhere the import fails and the files simply stay on local disk.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download(vecs_path)
    files.download(meta_path)