# Classifying text with BERT and SVM

In this approach, we'll use BERT embeddings as input features to a SVM classifier.

In [1]:
import os
import shutil
import re
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

from transformers import AutoTokenizer, TFAutoModel

tf.get_logger().setLevel('ERROR')

## Dataset

The dataset used is the IMDb reviews dataset (available at [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/)).

In [2]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file(
        'aclImdb_v1.tar.gz', url,
        untar=True, cache_dir='../../data/aclImdb',
        cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

The raw dataset has train and test sets, but lacks a validation set. 20% of train set will be used to validation.

In [3]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 12
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'train'),
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


2021-12-19 17:46:40.246716: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-19 17:46:40.247211: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-19 17:46:40.247329: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-19 17:46:40.247606: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [4]:
val_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'train'),
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [5]:
test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size)

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.


Analyze some of the reviews to ensure everything is working so far:

In [6]:
for text_batch, label_batch in train_ds.take(1):
    # we'll print 3 reviews from the batch
    for i in range(3):
        print(f'Review: {text_batch.numpy()[i]}')
        label = label_batch.numpy()[i]
        print(f'Label : {label} ({class_names[label]})')
        print()

Review: b'Mr Perlman gives a standout performance (as usual). Sadly, he has to struggle with an underwritten script and some nonsensical set pieces.<br /><br />Larsen is in "Die Hard" mode complete with singlet and bulging muscles, I\'m sure he could do better but seems satisfied to grimace and snarl through his part.<br /><br />The lovely Erika is very decorative (even though fully clothed!) and shows some signs of "getting" acting at last.<br /><br />SFX are mainly poor CGI and steals from other movies.<br /><br />The shootouts are pitiful - worthy of the A-Team<br /><br />Not even worth seeing for Perlman - AVOID'
Label : 0 (neg)

Review: b"I pity people calling kamal hassan 'ulaganaayakan' maybe for them ulagam is tollywood ! comeon guys..this movie is a thriller without thrill..<br /><br />come out of your ulagam and just watch some high class thrillers like The Usual Suspects or even The Silence of the Lambs.<br /><br />technically good but style over substance kamal doesn't look

2021-12-19 17:46:42.813082: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Loading model from TensorFlow HUB

In [7]:
def get_tfhub_model(use_sequences=False):
    model_size = [
        (2, 128, 2),
        (6, 256, 4),
        (10, 256, 4),
        (2, 768, 12),
        (12, 768, 12),
    ][3]

    # Number of layers (i.e., residual blocks)
    L = model_size[0]

    # Size of hidden layers
    H = model_size[1]

    # Number of attention heads
    A = model_size[2]

    tfhub_handle_encoder = f"https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-{L}_H-{H}_A-{A}/2"
    tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
    
    input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    
    encoder_inputs = preprocessing_layer(input_layer)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    
    if use_sequences:
        return tf.keras.Model(input_layer, outputs['sequence_output'])
    
    return tf.keras.Model(input_layer, outputs['pooled_output'])

## Preparing the feature extractor

In [8]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model_transformers = TFAutoModel.from_pretrained(checkpoint)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [9]:
def get_features_transformers(model, tokenizer, X_batch, use_sequence=False):
    inputs = tokenizer([str(x) for x in X_batch], padding=True, truncation=True, return_tensors="tf")
    
    model_output = model(inputs)
    
    if use_sequence:
        new_output = tf.math.reduce_mean(model_output.last_hidden_state, 1)
        
        return new_output
    
    return [x.numpy() for x in model_output.pooler_output]

The feature extractor simply returns the output from the model.

In [10]:
def get_features_tf_hub(model, X, use_sequence=True):
    model_output = model(X)

    # wheter of not use sequence_output instead of pooled_output
    if use_sequence:
        new_output = tf.math.reduce_mean(model_output, 1)

        return new_output
    
    return model_output

In [11]:
def get_preprocessed_text(text):
    """
    Function used to tokenize, clean and stem a sentence.
    Used for TfIdfTokenizer.
    """
    text = text.lower()
    
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    
    text = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', ' ', text)
    
    tokens = word_tokenize(text)
    
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(token) for token in tokens if token not in ENGLISH_STOP_WORDS]
    
    return ' '.join(stemmed)

## Preparing the classifiers

For the classifier, a simple SVM Classifier will be used.

In [12]:
classifier_tfhub = svm.SVC()
classifier_tfidf = svm.SVC()
classifier_transformers = svm.SVC()

## 🤗 Transformers Pipeline

Feature acquisition.

In [26]:
X_transformers = []
y_transformers = []
t = 0
for text_batch, label_batch in tqdm(train_ds):
    features_transformers = get_features_transformers(model_transformers, tokenizer, text_batch)
    t += len(features_transformers)
    
    [X_transformers.append(f) for f in features_transformers]
    [y_transformers.append(l) for l in label_batch]
    
print(len(X_transformers))

  0%|          | 0/1667 [00:00<?, ?it/s]

20000


Training.

In [14]:
print(type(X_transformers[0]))

<class 'numpy.ndarray'>


In [15]:
classifier_transformers.fit(X=X_transformers, y=y_transformers)

SVC()

Making predictions.

In [19]:
y_pred_transformers = []
y_true_transformers = []
for text_batch, label_batch in tqdm(test_ds):
    features_tranformers = get_features_transformers(model_transformers, tokenizer, text_batch)
    
    [y_pred_transformers.append(prediction)
         for prediction in classifier_transformers.predict(features_transformers)]
    [y_true_transformers.append(label_list)
         for label_list in label_batch]

  0%|          | 0/2084 [00:00<?, ?it/s]

In [25]:
len(y_pred_transformers)

16672

## TF Hub Model Pipeline

Feature acquisition.

In [None]:
# The model will be used for feature acquisition in training and prediction.
tfhub_model = get_tfhub_model(True)

In [None]:
X_tfhub = []
y_tfhub = []
for text_batch, label_batch in tqdm(train_ds):
    features_tfhub = get_features(tfhub_model, text_batch, use_sequence=True)
    
    [X_tfhub.append(f) for f in features_tfhub]
    [y_tfhub.append(l) for l in label_batch]
    
print(len(X_tfhub))

Training.

In [None]:
classifier_tfhub.fit(X=X_tfhub, y=y_tfhub)

Making predictions.

In [None]:
y_pred_tfhub = []
y_true_tfhub = []
for text_batch, label_batch in tqdm(test_ds):
    features = get_features(tfhub_model, text_batch)
    
    [y_pred_tfhub.append(prediction) for prediction in classifier_tfhub.predict(features)]
    [y_true_tfhub.append(label_list) for label_list in label_batch]

## `TfIdfVectorizer` Pipeline

Feature acquisition.

In [None]:
X_tfidf = []
y_tfidf = []
for text_batch, label_batch in tqdm(train_ds):
    [X_tfidf.append(get_preprocessed_text(doc.numpy())) for doc in text_batch]
    [y_tfidf.append(l) for l in label_batch]

The `tfidf_vectorizer` will be used in both training and prediction.

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X_tfidf)

features_tfidf = tfidf_vectorizer.transform(X_tfidf)

Training.

In [None]:
X_tfidf = []
y_tfidf = []
for text_batch, label_batch in tqdm(train_ds):
    [X_tfidf.append(get_preprocessed_text(doc.numpy())) for doc in text_batch]
    [y_tfidf.append(l) for l in label_batch]

Making predictions.

In [None]:
y_pred_tfidf = []
y_true_tfidf = []
for text_batch, label_batch in tqdm(test_ds):
    for doc in text_batch:
        features = tfidf_vectorizer.transform([get_preprocessed_text(doc.numpy())])
        y_pred_tfidf.append(classifier_tfidf.predict(features))
        
    [y_true_tfidf.append(label_list) for label_list in label_batch]

## Measuring Accuracy of All Approaches

In [None]:
print(f"🤗 Transformers: {accuracy_score(y_true_tfhub, y_pred_tfhub)}")
print(f"TF Hub: {accuracy_score(y_true_tfhub, y_pred_tfhub)}")
print(f"TF-IDF: {accuracy_score(y_true_tfhub, y_pred_tfhub)}")

In [21]:
print(f"🤗 Transformers: {accuracy_score(y_true_transformers, y_pred_transformers)}")

ValueError: Found input variables with inconsistent numbers of samples: [25000, 16672]