# Classifying text with BERT and SVM

In this approach, we'll use BERT embeddings as input features to a SVM classifier.

In [46]:
import os
import shutil
import re
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

from bs4 import BeautifulSoup

from tqdm.notebook import tqdm

tf.get_logger().setLevel('ERROR')

## Dataset

The dataset used is the IMDb reviews dataset (available at [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/)).

In [47]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file(
        'aclImdb_v1.tar.gz', url,
        untar=True, cache_dir='../../data/aclImdb',
        cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

The raw dataset has train and test sets, but lacks a validation set. 20% of train set will be used to validation.

In [48]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 16
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'train'),
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [49]:
val_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'train'),
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [50]:
test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size)

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.


Analyze some of the reviews to ensure everything is working so far:

In [51]:
for text_batch, label_batch in train_ds.take(1):
    # we'll print 3 reviews from the batch
    for i in range(3):
        print(f'Review: {text_batch.numpy()[i]}')
        label = label_batch.numpy()[i]
        print(f'Label : {label} ({class_names[label]})')
        print()

Review: b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label : 0 (neg)

Review: b"I pity people calling kamal hassan 'ulaganaayakan' maybe for them ulagam is tollywood ! comeon guys..this movie is a thriller without thrill..<br /><br />come out of your ulagam and just watch some high class

2021-12-18 17:58:31.556509: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## Loading model from TensorFlow HUB

In [52]:
def get_tfhub_model(use_sequences=False):
    model_size = [
        (2, 128, 2),
        (6, 256, 4),
        (10, 256, 4),
        (2, 768, 12),
        (12, 768, 12),
    ][3]

    # Number of layers (i.e., residual blocks)
    L = model_size[0]

    # Size of hidden layers
    H = model_size[1]

    # Number of attention heads
    A = model_size[2]

    tfhub_handle_encoder = f"https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-{L}_H-{H}_A-{A}/2"
    tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
    
    input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    
    encoder_inputs = preprocessing_layer(input_layer)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    
    if use_sequences:
        return tf.keras.Model(input_layer, outputs['sequence_output'])
    
    return tf.keras.Model(input_layer, outputs['pooled_output'])

## Preparing the feature extractor

The feature extractor simply returns the output from the model.

In [58]:
def get_features(model, X, use_sequence=True):
    model_output = model(X)

    if use_sequence:
        new_output = tf.math.reduce_mean(model_output, 1)

        return new_output
    
    return model_output

In [54]:
def get_preprocessed_text(text):
    text = text.lower()
    
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    
    text = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', ' ', text)
    
    tokens = word_tokenize(text)
    
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(token) for token in tokens if token not in ENGLISH_STOP_WORDS]
    
    return ' '.join(stemmed)

## Preparing the classifier

For the classifier, a simple SVM Classifier will be used.

In [55]:
classifier_tfhub = svm.SVC()
classifier_tfidf = svm.SVC()
classifier_transformers = svm.SVC()

## Making predictions

In [56]:
tfhub_model = get_tfhub_model(True)

Training with TF Hub model.

In [59]:
X_tfhub = []
y_tfhub = []
for text_batch, label_batch in tqdm(train_ds):
    features_tfhub = get_features(tfhub_model, text_batch, use_sequence=True)
    
    [X_tfhub.append(f) for f in features_tfhub]
    [y_tfhub.append(l) for l in label_batch]
    
print(len(X_tfhub))

classifier_tfhub.fit(X=X_tfhub, y=y_tfhub)

  0%|          | 0/1250 [00:00<?, ?it/s]

20000


SVC()

Predicting values using tfhub's BERT features.

In [60]:
y_pred_tfhub = []
y_true_tfhub = []
for text_batch, label_batch in tqdm(test_ds):
    features = get_features(tfhub_model, text_batch)
    
    [y_pred_tfhub.append(prediction) for prediction in classifier_tfhub.predict(features)]
    [y_true_tfhub.append(label_list) for label_list in label_batch]

  0%|          | 0/1563 [00:00<?, ?it/s]

Training With TF-IDF features.

In [61]:
X_tfidf = []
y_tfidf = []
for text_batch, label_batch in tqdm(train_ds):
    [X_tfidf.append(get_preprocessed_text(doc.numpy())) for doc in text_batch]
    [y_tfidf.append(l) for l in label_batch]

  0%|          | 0/1250 [00:00<?, ?it/s]

In [62]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X_tfidf)

features_tfidf = tfidf_vectorizer.transform(X_tfidf)
classifier_tfidf.fit(X=features_tfidf, y=y_tfidf)

SVC()

Predic values using TF-IDF features.

In [None]:
y_pred_tfidf = []
y_true_tfidf = []
for text_batch, label_batch in tqdm(test_ds):
    for doc in text_batch:
        features = tfidf_vectorizer.transform([get_preprocessed_text(doc.numpy())])
        y_pred_tfidf.append(classifier_tfidf.predict(features))
        
    [y_true_tfidf.append(label_list) for label_list in label_batch]

  0%|          | 0/1563 [00:00<?, ?it/s]

Acquiring accuracy.

In [None]:
accuracy_score(y_true_tfhub, y_pred_tfhub)

In [None]:
accuracy_score(y_true_tfidf, y_pred_tfidf)