# Text Classification
- Full tutorial reference: https://www.tensorflow.org/tutorials/keras/text_classification
- This tutorial demonstrates text classification and will train a binary classifier to perform sentiment analysis.

### Import libraries

In [2]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import re
import shutil
import string

# Record the TensorFlow version this notebook was executed with.
print("Tensorflow version:", tf.__version__)

Tensorflow version: 2.9.1


### Sentiment analysis
- This tutorial trains a sentiment analysis model to classify movie reviews as *positive* or *negative*, based on the review.
- Binary (two-class) classification is a critical and widely applicable kind of machine learning problem.

### Download and explore IMDB dataset

In [3]:
# Download and extract the IMDB review archive into the current directory.
# This cell must run: `dataset_dir` is used by the cells below, and leaving the
# code commented out makes a fresh "Restart & Run All" fail with NameError.
# Per the Keras get_file documentation, an archive already present in
# cache_dir is reused rather than downloaded again.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1", url, untar=True, cache_dir='.', cache_subdir='')
# The archive unpacks to an 'aclImdb' folder next to the downloaded file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [4]:
# Show the top-level contents of the extracted dataset directory.
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [5]:
# Path of the 'train' folder inside the extracted dataset, followed by a listing of its contents.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

### Load the dataset 
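- The reviews are stored as individual text files, so the data must be loaded from disk.
- The *text_dataset_from_directory* utility from the *tf.keras.utils* module does this: it builds a labeled dataset from a directory that contains one sub-folder per class.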

In [7]:
# remove 'unsup' folder for preparing binary classification 'pos' and 'neg'
unsup_dir = os.path.join(train_dir, 'unsup')
# Only delete when the folder is still there, so re-running this cell after the
# folder has been removed does not raise FileNotFoundError.
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

In [24]:
batch_size = 32
seed = 42

# Training portion of an 80:20 train/validation split of 'aclImdb/train'.
# The validation cell passes the same validation_split and seed values.
train_split_options = dict(
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)
raw_train_ds = tf.keras.utils.text_dataset_from_directory('aclImdb/train', **train_split_options)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [25]:
# Validation portion (the remaining 20%) of the same 80:20 split of 'aclImdb/train';
# validation_split and seed match the values used for the training subset.
val_split_options = dict(
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory('aclImdb/train', **val_split_options)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [26]:
# The test folder is loaded in full: no validation split is carved out of it.
test_data_path = 'aclImdb/test'
raw_test_ds = tf.keras.utils.text_dataset_from_directory(test_data_path, batch_size=batch_size)

Found 25000 files belonging to 2 classes.


### Prepare the dataset for training
- Before training, standardization, tokenization and vectorization are required:
    - Standardization refers to preprocessing the text (remove punctuation or HTML elements to simplify the dataset).
    - Tokenization refers to splitting strings into tokens.
    - Vectorization refers to converting tokens into numbers so they can be fed into a neural network.
- All the above steps can be done by using the helpful *tf.keras.layers.TextVectorization* layer. 

In [17]:
# custom standardization function to remove the HTML elements
def custom_standardization(data):
    """Lowercase review text, replace '<br />' tags with a space, and drop punctuation.

    Args:
        data: string tensor holding the raw text to clean.

    Returns:
        A string tensor with the same layout as `data` after the lowercase
        pass and both regex replacements have been applied.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    cleaned = tf.strings.lower(data)
    cleaned = tf.strings.regex_replace(cleaned, '<br />', ' ')
    cleaned = tf.strings.regex_replace(cleaned, punctuation_pattern, '')
    return cleaned

In [18]:
# Vocabulary cap and fixed output length for the vectorization layer.
max_features = 10000   # maximum token number
sequence_length = 250  # every review is padded or truncated to exactly this many token ids

# output_mode='int' makes the layer emit one integer index per token.
vectorize_options = {
    'standardize': custom_standardization,
    'max_tokens': max_features,
    'output_mode': 'int',
    'output_sequence_length': sequence_length,
}
text_vectorize_layer = tf.keras.layers.TextVectorization(**vectorize_options)

In [19]:
# Fit the preprocessing layer's state to the training text only.
# Each dataset element is a (text, label) pair; the label is dropped before adapt.
train_text = raw_train_ds.map(lambda review_text, _label: review_text)
text_vectorize_layer.adapt(train_text)

In [22]:
def vectorize_text(text, label):
    """Turn raw review text into integer token ids, passing the label through unchanged.

    Args:
        text: string tensor of reviews (presumably one batch from the raw dataset).
        label: label tensor paired with `text`; returned as-is.

    Returns:
        A `(token_ids, label)` tuple, where `token_ids` is the output of
        `text_vectorize_layer` applied to `text`.
    """
    # Append a trailing axis so each review sits in its own innermost dimension.
    expanded_text = tf.expand_dims(text, axis=-1)
    token_ids = text_vectorize_layer(expanded_text)
    return token_ids, label

In [27]:
# Apply the same text -> token-id mapping to all three raw splits.
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

### Configure the dataset for performance
- *.cache()* keeps data in memory after it is loaded off disk.
- *.prefetch()* overlaps data preprocessing and model execution while training.

In [28]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each vectorized split, then prefetch with an automatically tuned buffer size.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

### Create the model
- The *tf.keras.layers* are stacked sequentially to build the model:
    - Embedding layer (first layer) takes the integer-encoded reviews and looks up an embedding vector for each word-index.
    - GlobalAveragePooling1D layer returns a fixed-length output vector for each example by averaging over the sequence dimension.
- After the GlobalAveragePooling1D layer, the fixed-length output vector (16 values, one per embedding dimension) passes through dropout; the model defined below has no intermediate hidden dense layer.
- Final layer is densely connected with a single output node.

In [29]:
embedding_dim = 16

# Build the classifier layer by layer: embedding lookup, dropout, average over
# the sequence axis, dropout again, then a single-unit dense output (raw logit,
# since no activation is applied).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=max_features+1, output_dim=embedding_dim))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
__________________________________________________

In [30]:
# The model's last layer is Dense(1) with no activation, so it outputs logits:
# the loss is told so via from_logits=True, and accuracy thresholds at 0.0
# (the logit value that corresponds to a probability of 0.5).
logit_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
logit_accuracy = tf.metrics.BinaryAccuracy(threshold=0.0)

model.compile(optimizer='adam', loss=logit_loss, metrics=logit_accuracy)

In [31]:
# Train on the training split and evaluate on the validation split after each epoch.
epochs = 10
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluate the model

In [32]:
# evaluate returns the loss followed by the compiled metric (binary accuracy).
result = model.evaluate(test_ds)
test_loss, test_accuracy = result
print('Loss, Accuracy:', test_loss, test_accuracy)

Loss, Accuracy: 0.3102016746997833 0.8737999796867371


### Export the model
- It is possible to include *TextVectorization* layer inside the model.

In [34]:
# Wrap the trained model so it accepts raw strings and outputs probabilities:
# vectorization first, then the trained classifier, then a sigmoid on its logit.
model_export = tf.keras.Sequential()
model_export.add(text_vectorize_layer)
model_export.add(model)
model_export.add(tf.keras.layers.Activation('sigmoid'))

# The sigmoid is now the last layer, so the loss receives probabilities (from_logits=False).
probability_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model_export.compile(optimizer='adam', loss=probability_loss, metrics=['accuracy'])

# The raw (un-vectorized) test split is used because vectorization now happens inside the model.
result = model_export.evaluate(raw_test_ds)
export_loss, export_accuracy = result
print('Loss, Accuracy:', export_loss, export_accuracy)

Loss, Accuracy: 0.310201495885849 0.8737999796867371


### Prediction using new data

In [38]:
# New, unseen reviews to score; each output row is the sigmoid probability from model_export.
examples = ['The movie was fun!', 'The movie was okay.', 'The movie was boring.', 'The movie was terrible.']
# Pass a string tensor rather than a plain Python list: this matches the current
# upstream tutorial and keeps the cell working on Keras releases that do not
# accept a list of str in predict (NOTE(review): confirm against the target version).
model_export.predict(tf.constant(examples))



array([[0.61574024],
       [0.44878286],
       [0.3458126 ],
       [0.36650437]], dtype=float32)

In [39]:
# MIT License
#
# Copyright (c) 2017 François Chollet
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.