In [None]:
!pip install -q tensorflow-text

import collections
import pathlib
import re
import os
import string

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import tensorflow_datasets as tfds
import tensorflow_text as tf_text


tfds.disable_progress_bar()

[K     |████████████████████████████████| 4.3MB 28.4MB/s 
[?25h

In [None]:
# Pipeline settings shared by the cells below.
BUFFER_SIZE = 40000  # shuffle buffer size handed to Dataset.shuffle
BATCH_SIZE = 128  # examples per batch for padded_batch / batch
VALIDATION_SIZE = 2000  # leading examples set aside as the validation split
VOCAB_SIZE = 30000  # keep only this many of the most frequent tokens
MAX_SEQUENCE_LENGTH = 250  # output_sequence_length of the TextVectorization layer

# NOTE(review): neither of the next two names is referenced by the visible
# cells (model.fit is called with a literal epochs=100) - confirm whether they
# are still needed.
EPOCHS = 10
VALIDATION_STEP = 50

# Load text

In [None]:
# Directory that holds one plain-text file per genre.
parent_dir = '/content/'

# Genre names in label order: a genre's position here is its class index.
_GENRES = (
    'Animation', 'Adventure', 'Romance', 'Comedy', 'Action',
    'Family', 'History', 'Drama', 'Crime', 'Fantasy',
    'Science Fiction', 'Thriller', 'Music', 'Horror', 'Documentary',
    'Mystery', 'Western', 'TV Movie', 'War', 'Foreign',
)

# Each genre is stored as "<genre>.txt" under parent_dir.
FILE_NAMES = [genre + '.txt' for genre in _GENRES]
CLASSES = len(FILE_NAMES)

Convert the text files into labeled datasets

In [None]:
def labeler(example, index):
  """Attach a class label to one line of text.

  Args:
    example: one line read from a genre file.
    index: position of that file in FILE_NAMES, used as the class id.

  Returns:
    The pair (example, index cast to tf.int64).
  """
  label = tf.cast(index, tf.int64)
  return example, label


# One labeled dataset per genre file, in FILE_NAMES order.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  print(i, file_name)
  file_path = os.path.join(parent_dir, file_name)
  labeled_data_sets.append(
      tf.data.TextLineDataset(file_path).map(lambda ex: labeler(ex, i)))

0 Animation.txt
1 Adventure.txt
2 Romance.txt
3 Comedy.txt
4 Action.txt
5 Family.txt
6 History.txt
7 Drama.txt
8 Crime.txt
9 Fantasy.txt
10 Science Fiction.txt
11 Thriller.txt
12 Music.txt
13 Horror.txt
14 Documentary.txt
15 Mystery.txt
16 Western.txt
17 TV Movie.txt
18 War.txt
19 Foreign.txt


Shuffle the loaded data set

In [None]:
# Join the per-genre datasets end to end, in list order.
combined = labeled_data_sets[0]
for genre_dataset in labeled_data_sets[1:]:
  combined = combined.concatenate(genre_dataset)

# reshuffle_each_iteration=False asks for one fixed order that is reused on
# every pass over the dataset, so later take()/skip() calls see the same
# examples each time.
all_labeled_data = combined.shuffle(
    BUFFER_SIZE,
    reshuffle_each_iteration=False)

Tokenize the labeled data set

In [None]:
tokenizer = tf_text.UnicodeScriptTokenizer()


def tokenize(text, unused_label):
  """Case-fold `text` and split it into tokens.

  Args:
    text: a string tensor holding one example.
    unused_label: the example's label; ignored, present only so the function
      can be mapped over (text, label) pairs.

  Returns:
    The tokens produced by the module-level `tokenizer`.
  """
  return tokenizer.tokenize(tf_text.case_fold_utf8(text))


tokenized_ds = all_labeled_data.map(tokenize)

# Peek at the first tokenized example as a sanity check.
for first_tokens in tokenized_ds.take(1):
  print("Tokens: ", first_tokens.numpy())

Instructions for updating:
`tf.batch_gather` is deprecated, please use `tf.gather` with `batch_dims=-1` instead.
Tokens:  [b'prairie' b'home' b'true' b'canadian' b'iconoclast' b'acclaimed'
 b'transgender' b'countryelectropop' b'artist' b'rae' b'spoon' b'revisits'
 b'stretches' b'rural' b'alberta' b'constituted' b'home' b'confronts'
 b'memories' b'growing' b'queer' b'abusive' b'evangelical' b'household']


In [None]:
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
  """Return `dataset` with caching and autotuned prefetching appended.

  Args:
    dataset: any tf.data.Dataset.

  Returns:
    dataset.cache() followed by prefetch(buffer_size=AUTOTUNE).
  """
  cached = dataset.cache()
  return cached.prefetch(buffer_size=AUTOTUNE)

Build a vocabulary by sorting tokens by frequency and keeping the top VOCAB_SIZE tokens

In [None]:
# Append cache()/prefetch() so the tokenized examples are not recomputed on
# later passes.
tokenized_ds = configure_dataset(tokenized_ds)

# Count how often every token occurs across all examples.
vocab_dict = collections.Counter()
for toks in tokenized_ds.as_numpy_iterator():
  vocab_dict.update(toks)

# most_common() orders tokens by descending count; tokens with equal counts
# keep the order in which they were first seen.
vocab = [token for token, _ in vocab_dict.most_common()]
print(len(vocab))  # number of distinct tokens before truncation

# Keep only the VOCAB_SIZE most frequent tokens.
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)

# The label is built from the preview length so the two cannot drift apart.
NUM_PREVIEW_TOKENS = 30
print("First {} vocab entries:".format(NUM_PREVIEW_TOKENS),
      vocab[:NUM_PREVIEW_TOKENS])

84795
Vocab size:  30000
First five vocab entries: [b'life', b'young', b'man', b'new', b'love', b'film', b'world', b'story', b'family', b'woman', b'find', b'time', b'father', b'years', b'girl', b'finds', b'war', b'wife', b'lives', b'home', b'friends', b'town', b'old', b'way', b'day', b'mother', b'school', b'people', b'son', b'help']


Encode each example as integer token IDs using the vocabulary

In [None]:
# Integer id layout: 0 is left free for padding, 1 marks out-of-vocabulary
# tokens, and vocabulary tokens take 2 .. len(vocab) + 1 in frequency order.
OOV_ID = 1

keys = vocab
values = range(2, len(vocab) + 2)

init = tf.lookup.KeyValueTensorInitializer(
    keys, values, key_dtype=tf.string, value_dtype=tf.int64)

# A StaticVocabularyTable numbers its OOV buckets starting at the table size
# (len(vocab)), which is also the id of a real vocabulary token here. A plain
# hash table with an explicit default keeps unknown tokens on the reserved id.
vocab_table = tf.lookup.StaticHashTable(init, default_value=OOV_ID)


def preprocess_text(text, label):
  """Turn one raw example into a vector of token ids.

  Args:
    text: a string tensor holding one example.
    label: the example's class label, passed through unchanged.

  Returns:
    (vectorized, label), where `vectorized` holds one int64 id per token and
    tokens missing from the vocabulary map to OOV_ID.
  """
  standardized = tf_text.case_fold_utf8(text)
  tokenized = tokenizer.tokenize(standardized)
  vectorized = vocab_table.lookup(tokenized)
  return vectorized, label


# Show the encoding of the first example as a sanity check.
example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text.numpy())
vectorized_text, example_label = preprocess_text(example_text, example_label)
print("Vectorized sentence: ", vectorized_text.numpy())

all_encoded_data = all_labeled_data.map(preprocess_text)

Sentence:  b'prairie home true canadian iconoclast acclaimed transgender countryelectropop artist rae spoon revisits stretches rural alberta constituted home confronts memories growing queer abusive evangelical household'
Vectorized sentence:  [10111    21    72  1905 16868  1824 13576 30000   313 12125 18392  7405
 13577   829 14476 26914    21  2490   807   645 11029  1992 11030  2132]


Split the data for training and validation

In [None]:
# The first VALIDATION_SIZE encoded examples form the validation split; the
# remainder is the training split, shuffled through a BUFFER_SIZE buffer.
validation_examples = all_encoded_data.take(VALIDATION_SIZE)
train_examples = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)

# Batch both splits with padded_batch, since examples differ in token count.
train_data = train_examples.padded_batch(BATCH_SIZE)
validation_data = validation_examples.padded_batch(BATCH_SIZE)

In [None]:
sample_text, sample_labels = next(iter(validation_data))

# Print the shapes and the first row of one validation batch.
for caption, shown in (
    ("Text batch shape: ", sample_text.shape),
    ("Label batch shape: ", sample_labels.shape),
    ("First text example: ", sample_text[0]),
    ("First label example: ", sample_labels[0]),
):
  print(caption, shown)

Text batch shape:  (128, 98)
Label batch shape:  (128,)
First text example:  tf.Tensor(
[10111    21    72  1905 16868  1824 13576 30000   313 12125 18392  7405
 13577   829 14476 26914    21  2490   807   645 11029  1992 11030  2132
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0], shape=(98,), dtype=int64)
First label example:  tf.Tensor(14, shape=(), dtype=int64)


In [None]:
# Embedding rows needed: every vocabulary token plus the two reserved ids
# (0 and 1). Derived from len(vocab) rather than incremented in place, so
# re-running this cell cannot keep growing the value.
vocab_size = len(vocab) + 2

In [None]:
# Append cache()/prefetch() to both batched splits.
train_data, validation_data = map(
    configure_dataset, (train_data, validation_data))

Create the model to train

In [None]:
# Embedding -> bidirectional LSTM -> small regularized dense head.
model = tf.keras.Sequential()
# mask_zero=True makes id 0 (the padding value) a masked position.
model.add(layers.Embedding(vocab_size, 32, mask_zero=True))
model.add(layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(layers.Dense(
    32,
    activation='relu',
    kernel_regularizer=tf.keras.regularizers.l2(0.0001)))
model.add(layers.Dropout(0.1))
# No activation on the output layer: it emits one raw logit per class, which
# is why the loss below is built with from_logits=True.
model.add(layers.Dense(CLASSES))

model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-5),
    metrics=['accuracy'])

In [None]:
# Saves the full model to 'best_model' whenever training accuracy improves.
mc = tf.keras.callbacks.ModelCheckpoint(
    'best_model',
    monitor='accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1)

# Saves the full model to 'best_model_val' whenever validation accuracy
# improves.
mcv = tf.keras.callbacks.ModelCheckpoint(
    'best_model_val',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1)

# Writes TensorBoard event files under ./logs.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
# Early-stopping callback watching val_accuracy with a patience of 5 epochs.
# NOTE(review): this object is not included in the callbacks list given to
# model.fit in the training cell, so it currently has no effect - confirm
# whether that is intended.
early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5) 

Train the model

In [None]:
# Callbacks active during training: best-val_accuracy checkpointing and
# TensorBoard logging.
# NOTE(review): the epoch count is a literal here rather than the EPOCHS
# constant, and early_stop_callback is not in this list - confirm both are
# deliberate.
training_callbacks = [mcv, tensorboard_callback]

history = model.fit(
    train_data,
    validation_data=validation_data,
    epochs=100,
    callbacks=training_callbacks,
)

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.28900, saving model to best_model_val




INFO:tensorflow:Assets written to: best_model_val/assets


INFO:tensorflow:Assets written to: best_model_val/assets


Epoch 2/100

Epoch 00002: val_accuracy did not improve from 0.28900
Epoch 3/100

Epoch 00003: val_accuracy did not improve from 0.28900
Epoch 4/100

Epoch 00004: val_accuracy did not improve from 0.28900
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.28900
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.28900
Epoch 7/100

Epoch 00007: val_accuracy did not improve from 0.28900
Epoch 8/100

Epoch 00008: val_accuracy improved from 0.28900 to 0.30550, saving model to best_model_val




INFO:tensorflow:Assets written to: best_model_val/assets


INFO:tensorflow:Assets written to: best_model_val/assets


Epoch 9/100

Epoch 00009: val_accuracy did not improve from 0.30550
Epoch 10/100

Epoch 00010: val_accuracy did not improve from 0.30550
Epoch 11/100

Epoch 00011: val_accuracy improved from 0.30550 to 0.30850, saving model to best_model_val




INFO:tensorflow:Assets written to: best_model_val/assets


INFO:tensorflow:Assets written to: best_model_val/assets


Epoch 12/100

Epoch 00012: val_accuracy improved from 0.30850 to 0.31350, saving model to best_model_val




INFO:tensorflow:Assets written to: best_model_val/assets


INFO:tensorflow:Assets written to: best_model_val/assets


Epoch 13/100

Epoch 00013: val_accuracy improved from 0.31350 to 0.31900, saving model to best_model_val




INFO:tensorflow:Assets written to: best_model_val/assets


INFO:tensorflow:Assets written to: best_model_val/assets


Epoch 14/100

Epoch 00014: val_accuracy did not improve from 0.31900
Epoch 15/100

Epoch 00015: val_accuracy did not improve from 0.31900
Epoch 16/100

Epoch 00016: val_accuracy did not improve from 0.31900
Epoch 17/100

Epoch 00017: val_accuracy improved from 0.31900 to 0.31950, saving model to best_model_val




INFO:tensorflow:Assets written to: best_model_val/assets


INFO:tensorflow:Assets written to: best_model_val/assets


Epoch 18/100

Epoch 00018: val_accuracy did not improve from 0.31950
Epoch 19/100

Epoch 00019: val_accuracy did not improve from 0.31950
Epoch 20/100

Epoch 00020: val_accuracy did not improve from 0.31950
Epoch 21/100

Epoch 00021: val_accuracy improved from 0.31950 to 0.32050, saving model to best_model_val




INFO:tensorflow:Assets written to: best_model_val/assets


INFO:tensorflow:Assets written to: best_model_val/assets


Epoch 22/100

Epoch 00022: val_accuracy did not improve from 0.32050
Epoch 23/100

Epoch 00023: val_accuracy did not improve from 0.32050
Epoch 24/100

Epoch 00024: val_accuracy did not improve from 0.32050
Epoch 25/100

Epoch 00025: val_accuracy did not improve from 0.32050
Epoch 26/100

Epoch 00026: val_accuracy did not improve from 0.32050
Epoch 27/100

Epoch 00027: val_accuracy did not improve from 0.32050
Epoch 28/100

Epoch 00028: val_accuracy did not improve from 0.32050
Epoch 29/100

Epoch 00029: val_accuracy did not improve from 0.32050
Epoch 30/100

Epoch 00030: val_accuracy did not improve from 0.32050
Epoch 31/100

Epoch 00031: val_accuracy did not improve from 0.32050
Epoch 32/100

Epoch 00032: val_accuracy did not improve from 0.32050
Epoch 33/100

Epoch 00033: val_accuracy did not improve from 0.32050
Epoch 34/100

Epoch 00034: val_accuracy did not improve from 0.32050
Epoch 35/100

Epoch 00035: val_accuracy did not improve from 0.32050
Epoch 36/100

Epoch 00036: val_acc

Check the accuracy of the model

In [None]:
# Score `model` as it stands after training on the validation batches.
# evaluate() returns the loss followed by the compiled metrics (accuracy).
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Add a preprocessing layer to tokenize and encode text for prediction

In [None]:
# Layer that maps raw strings to fixed-length int sequences, so the exported
# model can be fed plain text.
# NOTE(review): no `split` argument is given, so the layer's default splitting
# is used, whereas the training pipeline tokenized with
# tf_text.UnicodeScriptTokenizer - confirm both produce the same tokens for
# the expected inputs.
preprocess_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize='lower_and_strip_punctuation',
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

# Reuse the vocabulary built from the training corpus.
preprocess_layer.set_vocabulary(vocab)

Add the preprocess layer to the model and test it

In [None]:
# Reload the checkpoint that scored the best validation accuracy and export
# that one, rather than whatever weights `model` holds after its final epoch.
best_model = tf.keras.models.load_model('best_model_val')

# Raw text -> token ids -> logits -> class probabilities. The classes are
# mutually exclusive, so softmax (not sigmoid) turns the logits into the
# probability distribution that a from_logits=False loss expects.
export_model = tf.keras.Sequential(
    [preprocess_layer, best_model,
     layers.Activation('softmax')])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-5),
    metrics=['accuracy'])

In [None]:
# Raw (un-encoded) strings for the leading VALIDATION_SIZE examples, batched
# and then cached/prefetched, to exercise the text-in model end to end.
test_ds = configure_dataset(
    all_labeled_data.take(VALIDATION_SIZE).batch(BATCH_SIZE))

loss, accuracy = export_model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

In [None]:
# Hand-written movie overviews used as a smoke test for the exported model.
inputs = [
    "During a dangerous mission to stop a drug cartel operating between the US and Mexico, Kate Macer, an FBI agent, is exposed to some harsh realities.",
    "Tony Montana and his close friend Manny, build a strong drug empire in Miami. However as his power begins to grow, so does his ego and his enemies, and his own paranoia begins to plague his empire",
    "Cady joins a new public school and befriends Janis and Damian. They warn her to avoid the Plastics, a group led by Regina, but things get worse when she falls in love with Aaron, Regina's ex-lover",
]
predicted_scores = export_model.predict(inputs)
# The predicted class is the index of the highest score in each row.
predicted_labels = tf.argmax(predicted_scores, axis=1)
# The loop variable is deliberately not called `input`: at cell level that
# would rebind the builtin input() for the rest of the kernel session.
for overview, label in zip(inputs, predicted_labels):
  print("Question: ", overview)
  print("Predicted label: ", label.numpy())

In [None]:
# Write the text-in export model to the 'mcg' path; it is reloaded from there
# and zipped for download further down.
export_model.save('mcg')

## plot

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  """Draw the training and validation curves of one metric on the current axes.

  Args:
    history: object whose `.history` dict holds per-epoch values under
      `metric` and 'val_' + `metric` (as returned by model.fit).
    metric: name of the metric to plot, e.g. 'accuracy' or 'loss'.
  """
  val_metric = 'val_' + metric
  curves = history.history
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

In [None]:
# Accuracy and loss curves side by side in one wide figure.
plt.figure(figsize=(16, 6))
for position, metric_name in enumerate(('accuracy', 'loss'), start=1):
  plt.subplot(1, 2, position)
  plot_graphs(history, metric_name)

## Load models

In [None]:
# Reload the exported model from the 'mcg' path it was saved to earlier and
# print its layer summary.
new_model = tf.keras.models.load_model('mcg')
new_model.summary()

In [None]:
# Repeat the smoke test with the reloaded model.
predicted_scores = new_model.predict(inputs)
# The predicted class is the index of the highest score in each row.
predicted_labels = tf.argmax(predicted_scores, axis=1)
# The loop variable is deliberately not called `input`: at cell level that
# would rebind the builtin input() for the rest of the kernel session.
for overview, label in zip(inputs, predicted_labels):
  print("Question: ", overview)
  print("Predicted label: ", label.numpy())

In [None]:
# Raw strings for the leading VALIDATION_SIZE examples, batched and then
# cached/prefetched, to score the reloaded model.
test_ds = configure_dataset(
    all_labeled_data.take(VALIDATION_SIZE).batch(BATCH_SIZE))

loss, accuracy = new_model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Download the model

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# Learned embedding matrix: the first weight array of the layer named
# 'embedding'.
weights = model.get_layer('embedding').get_weights()[0]
# Vocabulary as held by the text-vectorization layer of this notebook
# (`preprocess_layer`), one entry per token id.
vocab = preprocess_layer.get_vocabulary()

In [None]:
# Write one embedding vector per line to vectors.tsv and the matching token to
# metadata.tsv. The builtin open() needs no extra import, and the `with` block
# closes both files even if a write fails part-way.
with open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

In [None]:
import shutil

# Zip the saved 'mcg' model directory. base_name is the archive path without
# its extension and must not end in a path separator: with a trailing slash
# the archive can end up as '/content/mcg/.zip' instead of the
# '/content/mcg.zip' that is downloaded below.
shutil.make_archive('/content/mcg', 'zip', 'mcg')

try:
  from google.colab import files
except ImportError:
  # Not running in Colab: there is no browser download helper, so the files
  # simply stay on disk.
  pass
else:
  files.download('vectors.tsv')
  files.download('metadata.tsv')
  files.download('/content/mcg.zip')