# Basic text classification

This tutorial performs text classification starting from plain text files stored on disk (as opposed to the previous notebook where we used TensorFlow Hub).

In [1]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf
from tqdm.auto import tqdm

from os.path import join
import numpy as np
import string

In [2]:
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Display the installed TensorFlow version; this notebook was written against 2.2.
tf.__version__

'2.2.0'

## Sentiment analysis

Binary classification of text based on the IMDB movie review dataset (similar to the previous notebook).

In [4]:
# Download the IMDB review archive into the current directory and unpack it.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [5]:
# The reviews live in an 'aclImdb' folder located next to the downloaded file.
dataset_dir = join(os.path.dirname(dataset), 'aclImdb')

In [6]:
# Show the top-level contents of the extracted dataset folder.
os.listdir(dataset_dir)

['test', 'README', 'train', 'imdb.vocab', 'imdbEr.txt']

In [7]:
# Folder holding the training split; list what it contains.
train_dir = join(dataset_dir, 'train')
os.listdir(train_dir)

['unsupBow.feat',
 'urls_neg.txt',
 'neg',
 'pos',
 'labeledBow.feat',
 'unsup',
 'urls_unsup.txt',
 'urls_pos.txt']

The `aclImdb/train/pos` and `aclImdb/train/neg` directories contain positive and negative examples of movie reviews. Let's take a look at one of them:

In [8]:
# Print one positive training review as a sanity check.
# Path components are joined individually (no hard-coded '/'), and the file is
# decoded explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
sample_file = os.path.join(train_dir, 'pos', '1181_9.txt')
with open(sample_file, encoding='utf-8') as f:
    print(f.read())

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.


_I am going to go off-piste here, as `text_dataset_from_directory` cannot currently be imported (see [this GitHub issue]). I am also sticking with TensorFlow 2.2 here, as the pre-processing APIs used are subject to change._

## Load text into datasets

In [9]:
def labeler(example, index):
    """Pair an example with its label cast to a 64-bit integer tensor.

    NOTE(review): no other cell visible in this notebook calls this helper.

    Args:
        example: The example to label; returned unchanged.
        index: The label value; passed through ``tf.cast(..., tf.int64)``.

    Returns:
        A ``(example, label)`` tuple.
    """
    label = tf.cast(index, tf.int64)
    return example, label

In [10]:
# File names of every review, indexed as file_names[split][sentiment].
file_names = {
    split: {
        sentiment: os.listdir(os.path.join(dataset_dir, split, sentiment))
        for sentiment in ('pos', 'neg')
    }
    for split in ('train', 'test')
}

# Integer class assigned to each sentiment folder.
label_mapping = dict(pos=1, neg=0)

In [11]:
# Two independent, initially empty lists.
train_labelled, test_labelled = [], []

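Datasets must be labelled, so use `tf.data.Dataset.map` to apply a labeler function to each one. (The cells below take a different route: they read the reviews into Python lists and collect the matching labels in NumPy arrays.)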

In [12]:
def read_datasets(corpus: dict, corpus_labels: dict, split_type: str, labels=('pos', 'neg')):
    """Load every review of one dataset split into memory, stripped of punctuation.

    For each entry of ``labels`` the files listed in the module-level
    ``file_names[split_type][label]`` are read from
    ``dataset_dir/split_type/label``. Results are written into the caller's
    dictionaries, and only once the whole split has been read successfully.

    Args:
        corpus: Dict updated in place; ``corpus[split_type]`` becomes the list
            of review strings with all ``string.punctuation`` characters removed.
        corpus_labels: Dict updated in place; ``corpus_labels[split_type]``
            becomes a NumPy array holding ``label_mapping[label]`` per review,
            in the same order as ``corpus[split_type]``.
        split_type: Name of the split folder, e.g. ``'train'`` or ``'test'``.
        labels: Sentiment folder names to read, in order.

    Raises:
        AssertionError: If a review file does not consist of exactly one line.
    """
    # Build the punctuation-stripping table once rather than once per file.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    reviews = []
    review_labels = []
    for label in labels:
        print(split_type, label)
        for file_name in tqdm(file_names[split_type][label]):
            path = join(dataset_dir, split_type, label, file_name)
            # Decode explicitly as UTF-8 so the text does not depend on the
            # platform's default locale encoding.
            with open(path, encoding='utf-8') as f:
                text = f.readlines()
            assert len(text) == 1, 'expected a single-line review in {}'.format(path)

            reviews.append(text[0].translate(strip_punctuation))
            review_labels.append(label_mapping[label])

    corpus[split_type] = reviews
    corpus_labels[split_type] = np.array(review_labels)

In [13]:
# Read both splits into memory: corpus[split] holds the review texts,
# corpus_labels[split] the matching integer labels.
corpus, corpus_labels = {}, {}
read_datasets(corpus, corpus_labels, split_type='train')
read_datasets(corpus, corpus_labels, split_type='test')

train pos


HBox(children=(FloatProgress(value=0.0, max=12500.0), HTML(value='')))


train neg


HBox(children=(FloatProgress(value=0.0, max=12500.0), HTML(value='')))


test pos


HBox(children=(FloatProgress(value=0.0, max=12500.0), HTML(value='')))


test neg


HBox(children=(FloatProgress(value=0.0, max=12500.0), HTML(value='')))




In [14]:
from shutil import rmtree

In [15]:
# Delete the extracted dataset folder from disk; the reviews were already read
# into `corpus` by the preceding cell.
rmtree('./aclImdb')

In [16]:
# Delete the downloaded archive as well.
# NOTE(review): the doubled '.tar.gz' suffix is presumably how get_file(untar=True)
# names the download in this TensorFlow version — confirm before changing the path.
os.remove('./aclImdb_v1.tar.gz.tar.gz')

In [17]:
# Peek at the training labels (1 = 'pos', 0 = 'neg', as defined by label_mapping).
corpus_labels['train']

array([1, 1, 1, ..., 0, 0, 0])

In [18]:
# NOTE(review): `vocabulary` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
vocabulary = set()


In [19]:
# Tokenisation and padding settings shared by the cells below.
VOCAB_SIZE = 10000              # Tokenizer `num_words` and Embedding input size
OUT_OF_VOCAB_TOKEN = '<OOV>'    # Tokenizer `oov_token`
TRUNC_TYPE = 'post'             # default `truncating` mode for pad_sequences
PADDING_TYPE = TRUNC_TYPE       # default `padding` mode, same value as truncation
MAX_LENGTH_REVIEW = 120         # length of every padded sequence

In [20]:
# Word-level tokenizer configured with the vocabulary size and OOV token above.
tokenizer = Tokenizer(oov_token=OUT_OF_VOCAB_TOKEN, num_words=VOCAB_SIZE)

In [21]:
# Fit the tokenizer on the training reviews only; the test split is not passed in.
tokenizer.fit_on_texts(corpus['train'])

In [22]:
# Word -> integer index mapping held by the fitted tokenizer.
word_index = tokenizer.word_index

In [23]:
# Check that the lower-case word 'the' is a key of the word index.
'the' in word_index

True

In [24]:
# Check the capitalised form too; together with the previous cell this shows
# whether the index keys are case-sensitive.
'The' in word_index

False

In [25]:
def form_padded_sequences_from_sentences(sequences,
                                         corpus,
                                         trunc_type = TRUNC_TYPE,
                                         padding_type = PADDING_TYPE,
                                         **kwargs
                                        ):
    """Tokenize and pad every split of ``corpus``, storing the result in ``sequences``.

    Uses the module-level ``tokenizer`` to turn each split's texts into integer
    sequences, then pads/truncates them with ``pad_sequences``. The name of
    each split is printed as it is processed.

    Args:
        sequences: Dict updated in place; ``sequences[split]`` receives the
            padded sequences for that split.
        corpus: Dict mapping split name to a list of texts.
        trunc_type: Forwarded to ``pad_sequences`` as ``truncating``.
        padding_type: Forwarded to ``pad_sequences`` as ``padding``.
        **kwargs: Extra keyword arguments forwarded to ``pad_sequences``
            (e.g. ``maxlen``).
    """
    for split_name, texts in corpus.items():
        print(split_name)
        token_ids = tokenizer.texts_to_sequences(texts)
        sequences[split_name] = pad_sequences(token_ids,
                                              truncating=trunc_type,
                                              padding=padding_type,
                                              **kwargs)

Limit each review to `MAX_LENGTH_REVIEW` tokens: longer reviews are truncated and shorter ones are padded to that length.

In [26]:
# Placeholders for both splits, filled in place by the helper below.
sequences = dict.fromkeys(('train', 'test'))
form_padded_sequences_from_sentences(sequences, corpus, maxlen=MAX_LENGTH_REVIEW)

train
test


In [27]:
# Length of one padded training sequence; it should equal MAX_LENGTH_REVIEW.
len(sequences['train'][1])

120

In [28]:
# Invert the tokenizer's mapping: integer index -> word, used for decoding.
reverse_word_index = {index: word for word, index in word_index.items()}

In [29]:
def decode_review(text):
    """Turn a sequence of token indices back into a space-separated string.

    Each index is looked up in the module-level ``reverse_word_index``; an
    index with no entry there is rendered as ``'_'``.

    Args:
        text: Iterable of integer token indices.

    Returns:
        The decoded words joined by single spaces.
    """
    words = (reverse_word_index.get(token_id, '_') for token_id in text)
    return ' '.join(words)

In [30]:
# First training review as stored, i.e. with punctuation already stripped by read_datasets.
corpus['train'][0]

'This movie makes me want to fall in love all over againI am naming my next daughter Adelaide Just so that someone who sings like Ol Blue eyes can swoon her one day and feel the butterflies I felt hearing it sung and it wasnt even to me I give it a 910'

In [31]:
# The same review after tokenizing, padding and decoding. Indices without an
# entry in reverse_word_index are shown as '_' (presumably the padding value).
decode_review(sequences['train'][0])

'this movie makes me want to fall in love all over <OOV> am <OOV> my next daughter <OOV> just so that someone who sings like ol blue eyes can <OOV> her one day and feel the <OOV> i felt hearing it sung and it wasnt even to me i give it a 910 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _'

## Build model

In [32]:
# Size of each embedding vector; passed as the second argument of the Embedding layer.
EMBEDDING_DIM = 16

In [33]:
# Small classifier: embed each token, flatten the whole review, then two dense layers.
model = tf.keras.Sequential([
    layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH_REVIEW),
    layers.Flatten(),
    layers.Dense(6, activation='relu'),
    # No activation on the output unit: the loss below is given from_logits=True.
    layers.Dense(1),
])

model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [34]:
def shuffle_two_arrays(a, b, seed=None):
    """Shuffle two arrays along their first axis using one shared permutation.

    Row ``i`` of the first result and element ``i`` of the second result always
    come from the same original position, so examples stay paired with their
    labels. Each array keeps its own dtype and shape, and the inputs are not
    modified.

    Args:
        a: Array-like whose first axis indexes the examples.
        b: Array-like with the same length as ``a`` along the first axis.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            permutation is drawn from NumPy's global random state, so an
            earlier ``np.random.seed(...)`` call still applies.

    Returns:
        A tuple ``(a_shuffled, b_shuffled)`` of new arrays.

    Raises:
        ValueError: If ``a`` and ``b`` differ in length.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if len(a) != len(b):
        raise ValueError(
            'a and b must have the same length, got {} and {}'.format(len(a), len(b)))

    rng = np.random if seed is None else np.random.RandomState(seed)
    # Index both arrays with the same permutation rather than concatenating
    # them into a temporary array: no dtype coercion, and empty inputs work.
    order = rng.permutation(len(a))
    return a[order], b[order]

This is a little ugly -- I tried to do it in TensorFlow but had some trouble...

In [35]:
# Shuffle the training sequences together with their labels.
train_data, train_data_labels =  shuffle_two_arrays(sequences['train'], corpus_labels['train'])

In [36]:
# Train on the shuffled training data.
# NOTE(review): the test split is passed as validation_data here and is also
# the data evaluated at the end of the notebook.
num_epochs = 10
model.fit(train_data, 
          train_data_labels, 
          epochs=num_epochs, 
          validation_data=(sequences['test'], corpus_labels['test']))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9039e63a20>

In [37]:
# Loss and accuracy on the test split (the same data used as validation_data during fit).
loss, accuracy = model.evaluate(sequences['test'], corpus_labels['test'])



In [38]:
# Display the test accuracy returned by model.evaluate.
accuracy

0.8059599995613098