In [None]:
import io
import os
import re
import shutil
import string

import tensorflow as tf

In [None]:
# Location of the aclImdb_v1 sentiment dataset archive (tar.gz) to download.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

In [None]:
# Fetch the archive into the current directory (cache_dir='.', no sub-folder)
# and unpack it there (untar=True).
dataset = tf.keras.utils.get_file(
    fname='aclImdb_v1.tar.gz',
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

In [None]:
# Show what sits in the directory containing the path returned by get_file.
os.listdir(os.path.dirname(dataset))

In [None]:
# The rest of the notebook reads the data from an 'aclImdb' folder located
# next to the downloaded path.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [None]:
# Peek at the top-level contents of the dataset folder.
os.listdir(dataset_dir)

In [None]:
# Path of the 'train' sub-folder; list it to see which sub-folders it holds.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

In [None]:
# text_dataset_from_directory derives one class per sub-folder, so the extra
# 'unsup' folder has to be removed before the training data is loaded.
remove_dir = os.path.join(train_dir, 'unsup')

# Guard the delete so that re-running this cell (or the whole notebook) after
# the folder is already gone does not raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [None]:
# Re-list the training folder to confirm 'unsup' is no longer there.
os.listdir(train_dir)

In [None]:
batch_size = 1024
seed = 123

# Load from train_dir (the folder the cleanup cell above operated on) instead
# of a separately hard-coded relative path, so both cells always agree on
# where the data lives.
# Both calls share the same validation_split and seed and differ only in
# `subset`, which yields the two complementary parts of one 80/20 split.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

In [None]:
# Sanity check: take a single batch and print the first five (label, text) pairs.
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        # .numpy() converts the tensors to NumPy values for printing.
        print(label_batch[i].numpy(), text_batch.numpy()[i])

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Cache each dataset and prefetch upcoming batches while the current one is
# being consumed.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Stand-alone demo layer: a lookup table for 1000 integer ids, each mapped to
# a 5-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(input_dim=1000, output_dim=5)

In [None]:
# Look up three ids in the demo layer: the result holds one vector per id.
result = embedding_layer(tf.constant([1, 2, 3]))
result

In [None]:
# Same lookup with five ids: the number of output vectors follows the input length.
result = embedding_layer(tf.constant([1, 2, 3, 4, 5]))
result

In [None]:
# 2-D input (2 rows of 3 ids): the output keeps the input's shape and gains a
# trailing axis holding each id's vector.
result = embedding_layer(tf.constant([[1, 2, 3], [4, 5, 6]]))
result

In [None]:
def custom_standardization(input_data):
    """Normalize raw text before tokenization.

    Applies three steps, in order: lowercase the text, replace every
    literal '<br />' with a single space, and delete all characters found in
    ``string.punctuation``.

    Args:
        input_data: String tensor to clean (passed straight to
            ``tf.strings.lower``).

    Returns:
        The cleaned string tensor produced by ``tf.strings.regex_replace``.
    """
    # Character class matching any single punctuation mark; re.escape keeps
    # characters such as ']' or '\\' from being read as regex syntax.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [None]:
# Passed as max_tokens to the TextVectorization layer and as the input size of
# the model's Embedding layer.
vocab_size = 10000
# Passed as output_sequence_length to the TextVectorization layer.
sequence_length = 100

In [None]:
# Text -> integer-id layer: cleans text with custom_standardization, is capped
# at vocab_size tokens, and is configured for an output length of
# sequence_length.
vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)

In [None]:
# NOTE: despite its name, test_ds is the *training* data with the labels
# dropped (only the text element x is kept).
test_ds = train_ds.map(lambda x, y: x)
# Fit the vectorization layer on that text-only dataset.
vectorize_layer.adapt(test_ds)

In [None]:
# Size of each learned word vector.
embedding_dim = 16

# Raw strings -> token ids -> embeddings -> average over the sequence ->
# small dense head ending in a single output unit with no activation.
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='embedding',  # looked up by this name later via get_layer
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=1),
])

In [None]:
# Callback that writes training logs to the 'logs' directory, the same
# directory TensorBoard is pointed at further down.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [None]:
# from_logits=True because the final Dense layer has no activation, so the
# model outputs raw scores rather than probabilities.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for 15 epochs with val_ds as validation data, logging through the
# TensorBoard callback.
model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback],
)

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

In [None]:
!load_ext tensorboard
!tensorboard --logdir logs

In [None]:
# Fetch the layer named 'embedding' from the model; get_weights() returns a
# list, and its first element is used below as the per-token vector table.
weights = model.get_layer('embedding').get_weights()[0]
weights

In [None]:
# Vocabulary held by the vectorization layer; each token's position in this
# list is used below as its row index into `weights`.
vocab = vectorize_layer.get_vocabulary()
# vocab  (uncomment to display the full list)

In [None]:
# Export the embeddings as two parallel TSV files: vectors.tsv holds one
# tab-separated vector per line, metadata.tsv holds the matching token.
# The `with` block guarantees both files are closed even if the loop raises.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        # Index 0 is skipped (presumably the padding token; confirm against
        # the TextVectorization vocabulary layout).
        if index == 0:
            continue

        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + '\n')
        out_m.write(word + '\n')

In [None]:
# Offer the exported files for download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the files are already on local disk, nothing to do.
    pass
else:
    # Kept outside the `try` so real failures are visible. The previous
    # blanket `except Exception` hid a NameError from a misspelled `files`,
    # which meant metadata.tsv was never downloaded.
    files.download('vectors.tsv')
    files.download('metadata.tsv')