In [None]:
import os
import shutil
import re
import string


import tensorflow as tf
from tensorflow.keras import utils, layers, Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, TextVectorization




# Load dataset

In [None]:
data_url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Download and unpack the archive, requesting the current working directory
# as the cache location (no extra sub-folder).
dataset = utils.get_file(origin=data_url, untar=True, cache_dir='.', cache_subdir='')

# The reviews are expected in an 'aclImdb' folder next to the returned path.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 0us/step


['imdbEr.txt', 'test', 'train', 'imdb.vocab', 'README']

In [None]:
# Location of the training split inside the extracted dataset.
train_dir = os.path.join(dataset_dir, 'train')

# Show what the training folder contains.
os.listdir(train_dir)

['urls_pos.txt',
 'urls_neg.txt',
 'pos',
 'urls_unsup.txt',
 'unsup',
 'unsupBow.feat',
 'labeledBow.feat',
 'neg']

In [None]:
# text_dataset_from_directory treats every sub-folder as a class, so the extra
# 'unsup' folder has to go to leave only 'pos' and 'neg'.
remov = os.path.join(train_dir, 'unsup')
# Guard the removal so re-running this cell (after the folder is already
# gone) does not raise FileNotFoundError.
if os.path.isdir(remov):
  shutil.rmtree(remov)

In [None]:
# Confirm the training directory and its remaining entries, one per line.
print(train_dir, os.listdir(train_dir), sep='\n')


/tmp/.keras/aclImdb/train
['urls_pos.txt', 'urls_neg.txt', 'pos', 'urls_unsup.txt', 'unsupBow.feat', 'labeledBow.feat', 'neg']


In [None]:
batch_size = 1024
seed = 123

# Read from train_dir (derived from where the archive was actually extracted)
# instead of a hard-coded relative path, which only works when the download
# landed in the current working directory.
# Both calls must share the same directory, validation_split and seed so the
# 'training' and 'validation' subsets do not overlap.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
for text_batch, label_batch in raw_train_ds.take(1):  # take(1) yields a single batch
  # Convert each tensor to NumPy once rather than on every loop iteration.
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for i in range(3):
    print("Review:", reviews[i])
    print("Label:", labels[i])

Review: b"(aka: BLOOD CASTLE or SCREAM OF THE DEMON)<br /><br />*spoiler*<br /><br />This was a drive-in feature, co-billed with THE VELVET VAMPIRE. A Spanish-Italian co-production where a series of women in a village are being murdered around the same time a local count named Yanos Dalmar is seen on horseback, riding off with his 'man-eating' dog behind him.<br /><br />The townsfolk already suspect he is the one behind it all and want his castle burned down. The murders first began around the time Count Yanos' older brother, Count Igor Dalmar was horribly burned and killed in a lab accident.<br /><br />Then a woman Ivanna (Erna Schuer) that Igor hired before his death to assist him in his experiments shows up. Yanos agrees to hire her in place of his brother and together they seek the formulae for the regeneration of dead cells. Yanos wants to bring Igor's charred corpse back to life.<br /><br />But of course Igor is still alive (although horribly burned) and stalking and killing the 

In [None]:
# Show which class name each of the two integer labels maps to.
for label in (0, 1):
  print("Label", label, "corresponds to", raw_train_ds.class_names[label])

Label 0 corresponds to neg
Label 1 corresponds to pos


In [None]:
AUTOTUNE = tf.data.AUTOTUNE


def _cache_and_prefetch(ds):
  """Return `ds` with caching and prefetching applied.

  cache()    -> keeps the data after the first pass so later epochs do not
                re-read it from disk.
  prefetch() -> prepares the next batch while the current one is being
                processed; AUTOTUNE lets tf.data choose the buffer size.
  """
  return ds.cache().prefetch(buffer_size=AUTOTUNE)


train_ds = _cache_and_prefetch(raw_train_ds)
val_ds = _cache_and_prefetch(raw_val_ds)

# Embedding layer

In [None]:
# Demo layer: a lookup table for 1000 distinct token ids, each mapped to a 5-dimensional vector.
embedding_layer = layers.Embedding(input_dim=1000, output_dim=5)

In [None]:
# Look up three token ids; the result has one 5-dimensional row per id.
token_ids = tf.constant([1, 2, 3])
result = embedding_layer(token_ids)
result.numpy()

array([[-0.00329782,  0.01354257, -0.00340179, -0.04097612,  0.04126373],
       [-0.04239122, -0.01558629,  0.04484452,  0.02874379, -0.01611396],
       [-0.00583623,  0.02574802,  0.02937335,  0.03130514, -0.04689578]],
      dtype=float32)

# Text preprocessing

In [None]:
def custom_standardization(input_data):
  """Normalize raw review text before tokenization.

  Steps, in order: lowercase the text, replace each '<br />' tag with a
  space, then delete every character listed in `string.punctuation`.

  Args:
    input_data: string tensor of raw text.

  Returns:
    String tensor with the cleaned text.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

# Upper bound on the vocabulary size kept by the vectorizer.
max_features = 10000
# Every review is padded or truncated to this many tokens.
sequence_length = 250

# Turns raw strings into fixed-length sequences of integer token ids.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int', # emit one integer index per token
    output_sequence_length=sequence_length
    ) # output_sequence_length -> ensure all text sequences are exactly sequence_length (250) tokens long (pad shorter texts or truncate longer ones).

In [None]:
def _text_only(text, label):
  """Drop the label and keep only the review text."""
  return text


# adapt() builds the vocabulary, so feed it the training text without labels.
train_text = raw_train_ds.map(_text_only)
vectorize_layer.adapt(train_text)

# Create model

In [None]:
embedding_dim = 16  # size of the vector space in which words are embedded

# Pipeline: raw strings -> token ids -> word vectors -> average over the
# sequence -> small hidden layer -> one output value per review.
model_layers = [
  vectorize_layer,
  Embedding(input_dim=max_features, output_dim=embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(units=16, activation='relu'),
  Dense(units=1),
]
model = Sequential(model_layers)

model.summary()

In [None]:
# The final Dense(1) layer has no activation, so the model emits raw logits
# (hence from_logits=True) and the decision boundary is logit 0, i.e.
# probability 0.5. The string 'accuracy' would resolve to binary accuracy
# with its default 0.5 threshold applied directly to the logits, which
# miscounts predictions with logits between 0 and 0.5. Use an explicit
# threshold of 0.0 instead; name='accuracy' keeps the History keys
# ('accuracy', 'val_accuracy') unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
epochs = 10  # number of full passes over the training split

# Train on the cached training set and evaluate on the validation set after
# every epoch; the returned History object records the per-epoch metrics.
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)