# Benchmarking and optimization: BERT

This notebook performs benchmarking using the BERT model.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer
from tensorflow.keras import backend as K
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import email
import nltk
from nltk.corpus import stopwords

In [None]:
# Single session shared by the whole notebook: the tokenizer helper evaluates
# the Hub module's tokenization info through it, and `initialize_vars` runs the
# variable/table initializers in it.
sess = tf.Session()

In [None]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a TF-Hub BERT module and pools its output.

    Args:
        n_fine_tune_layers: Number of encoder layers, counted downwards from
            ``encoder/layer_11``, whose variables are registered as trainable.
        pooling: ``"first"`` returns the module's ``pooled_output``;
            ``"mean"`` returns the mask-weighted mean of ``sequence_output``.
        bert_path: TF-Hub handle of the BERT module to load.
        **kwargs: Forwarded to ``tf.keras.layers.Layer.__init__``.

    Raises:
        NameError: Raised by ``build`` and ``call`` when ``pooling`` is
            neither ``"first"`` nor ``"mean"``.
    """

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="mean",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        # Presumably the hidden size of the default "H-768" module -- TODO
        # confirm before pointing `bert_path` at a differently sized module.
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Load the Hub module and split its variables into trainable/frozen."""
        self.bert = hub.Module(
            self.bert_path,
            trainable=self.trainable,
            name=f"{self.name}_module"
        )

        # First drop the variables of heads the selected pooling never reads,
        # then keep only those that belong to a layer chosen for fine-tuning.
        candidate_vars = self.bert.variables
        if self.pooling == "first":
            candidate_vars = [
                var for var in candidate_vars if "/cls/" not in var.name
            ]
            trainable_layers = ["pooler/dense"]
        elif self.pooling == "mean":
            # Fix: the original condition was `not "/pooler/ in var.name"`,
            # i.e. `not <non-empty string>`, which is always False and so
            # filtered out every variable.
            candidate_vars = [
                var for var in candidate_vars
                if "/cls/" not in var.name and "/pooler/" not in var.name
            ]
            trainable_layers = []
        else:
            raise NameError("Undefined pooling type")

        # Layers are selected from the top of the encoder stack downwards.
        # NOTE(review): index 11 assumes a 12-layer encoder ("L-12" in the
        # default handle) -- confirm for other modules.
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{11 - i}")

        trainable_vars = [
            var for var in candidate_vars
            if any(layer in var.name for layer in trainable_layers)
        ]
        for var in trainable_vars:
            self._trainable_weights.append(var)
        # Everything that was not selected above is tracked as frozen.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]`` and pool.

        Returns:
            The module's ``pooled_output`` for ``"first"`` pooling, or the
            masked mean over axis 1 of ``sequence_output`` for ``"mean"``.
        """
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(
                inputs=bert_inputs,
                signature="tokens",
                as_dict=True
            )["pooled_output"]
        elif self.pooling == "mean":
            sequence_output = self.bert(
                inputs=bert_inputs,
                signature="tokens",
                as_dict=True
            )["sequence_output"]
            mask = tf.cast(input_mask, tf.float32)
            # Zero out the masked positions, sum over axis 1 and divide by
            # the number of unmasked positions; the epsilon guards against a
            # division by zero when a row's mask is all zeros.
            masked_sum = tf.reduce_sum(
                sequence_output * tf.expand_dims(mask, axis=-1),
                axis=1
            )
            token_count = tf.reduce_sum(mask, axis=1, keepdims=True) + 1e-10
            pooled = masked_sum / token_count
        else:
            raise NameError("Undefined pooling type")
        return pooled

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], self.output_size)``."""
        return (input_shape[0], self.output_size)

In [None]:
def build_model(max_seq_length):
    """Assemble, compile and summarise the BERT-based binary classifier.

    The network takes three inputs of length ``max_seq_length`` (named
    ``input_ids``, ``input_masks`` and ``segment_ids``), feeds them through a
    ``BertLayer`` created with ``n_fine_tune_layers=0``, then a 256-unit ReLU
    layer and a single sigmoid output unit.

    Args:
        max_seq_length: Length of each of the three input sequences.

    Returns:
        The compiled ``tf.keras`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric). Its summary is printed as a side effect.
    """
    layers = tf.keras.layers

    def sequence_input(name):
        return layers.Input(shape=(max_seq_length,), name=name)

    bert_inputs = [
        sequence_input("input_ids"),
        sequence_input("input_masks"),
        sequence_input("segment_ids"),
    ]
    features = BertLayer(n_fine_tune_layers=0)(bert_inputs)
    hidden = layers.Dense(256, activation="relu")(features)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=probability)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    model.summary()
    return model

def initialize_vars(sess):
    """Run the local, global and table initializers, then hand `sess` to Keras.

    Args:
        sess: Session in which the initializer ops are run and which is
            registered with the Keras backend afterwards.
    """
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    # Each initializer op is created and run in turn, in the listed order.
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)

In [None]:
def convert_data(raw_data, header):
    """Join each row's tokens into one string and pair it with its label.

    Args:
        raw_data: Array-like with a ``shape`` attribute; every ``raw_data[i]``
            is an iterable of strings.
        header: Indexable collection of labels, read at the same positions.

    Returns:
        A tuple ``(texts, labels)``: ``texts`` is an object array with one
        space-joined string per row and a trailing axis of length 1,
        ``labels`` is ``np.array`` of the corresponding ``header`` entries.
    """
    row_indices = range(raw_data.shape[0])
    joined = [" ".join(raw_data[idx]) for idx in row_indices]
    picked_labels = [header[idx] for idx in row_indices]
    text_column = np.array(joined, dtype=object)[:, np.newaxis]
    return text_column, np.array(picked_labels)

In [None]:
class InputExample(object):
    """Plain container for one raw example of a sequence-classification task."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the given fields unchanged on the instance.

        Args:
            guid: Unique id for the example.
            text_a: string. Untokenized text of the first sequence; the only
                text needed for single-sequence tasks.
            text_b: (Optional) string. Untokenized text of the second
                sequence, for sequence-pair tasks.
            label: (Optional) Label of the example; expected for training
                examples, may be omitted for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label


def create_tokenizer_from_hub_module(bert_path):
    """Build a `FullTokenizer` from the vocab and casing stored in a Hub module.

    The module's ``tokenization_info`` signature is evaluated in the
    notebook-level session ``sess``.

    Args:
        bert_path: TF-Hub handle of the BERT module.

    Returns:
        A ``FullTokenizer`` configured with the module's ``vocab_file`` and
        ``do_lower_case`` values.
    """
    module = hub.Module(bert_path)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    vocab_path, lower_case = sess.run(fetches)
    return FullTokenizer(vocab_file=vocab_path, do_lower_case=lower_case)


def convert_single_example(tokenizer, example, max_seq_length=256):
    """Encode one example as fixed-length id, mask and segment lists.

    Only ``example.text_a`` is tokenized. The tokens are truncated so that,
    together with the surrounding ``[CLS]`` and ``[SEP]`` markers, they fit
    into ``max_seq_length`` positions; shorter sequences are zero-padded.

    Args:
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        example: Object with ``text_a`` and ``label`` attributes.
        max_seq_length: Length of each returned list.

    Returns:
        ``(input_ids, input_mask, segment_ids, label)``. The mask holds 1 for
        real tokens and 0 for padding; every segment id is 0.
    """
    # Two positions are reserved for the [CLS] and [SEP] markers.
    text_budget = max_seq_length - 2
    body_tokens = tokenizer.tokenize(example.text_a)
    if len(body_tokens) > text_budget:
        body_tokens = body_tokens[:text_budget]

    tokens = ["[CLS]", *body_tokens, "[SEP]"]
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length; a non-positive
    # shortfall yields empty padding.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label


def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Encode a collection of examples into stacked NumPy arrays.

    Each example is passed to ``convert_single_example``; progress is shown
    with ``tqdm``.

    Args:
        tokenizer: Tokenizer forwarded to ``convert_single_example``.
        examples: Iterable of example objects.
        max_seq_length: Sequence length forwarded to ``convert_single_example``.

    Returns:
        ``(input_ids, input_masks, segment_ids, labels)`` as NumPy arrays,
        with ``labels`` reshaped to a single column.
    """
    encoded = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm(examples, desc="Converting examples to features")
    ]
    id_rows = [row[0] for row in encoded]
    mask_rows = [row[1] for row in encoded]
    segment_rows = [row[2] for row in encoded]
    label_rows = [row[3] for row in encoded]
    return (
        np.array(id_rows),
        np.array(mask_rows),
        np.array(segment_rows),
        np.array(label_rows).reshape(-1, 1),
    )


def convert_text_to_examples(texts, labels):
    """Wrap paired texts and labels in `InputExample` objects.

    Each element of ``texts`` is space-joined to form ``text_a``; ``guid``
    and ``text_b`` are left as ``None``.

    Args:
        texts: Iterable whose elements are iterables of strings.
        labels: Iterable of labels, paired with ``texts`` by position.

    Returns:
        A list with one ``InputExample`` per (text, label) pair.
    """
    return [
        InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        for text, label in zip(texts, labels)
    ]

In [None]:
# Hub handle of the BERT module (identical to BertLayer's default `bert_path`);
# the tokenizer is built from that module's own vocabulary and casing info.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
tokenizer = create_tokenizer_from_hub_module(bert_path)

## Enron and fraudulent emails datasets

In [None]:
# Relative paths to the two e-mail corpora used in this section.
enron_filepath = "../data/enron-email-dataset/emails.csv"
# We will preserve the typo in the filename as that is how it appears on Kaggle.
fraud_filepath = "../data/fraudulent-email-corpus/fradulent_emails.txt"

In [None]:
# Load the Enron CSV; its "message" column is parsed by extract_messages below.
emails = pd.read_csv(enron_filepath)
emails.shape

In [None]:
# Preview the first rows of the raw Enron table.
emails.head()

In [None]:
def extract_messages(df):
    """Parse every raw e-mail in ``df["message"]`` and return its payload.

    Args:
        df: Mapping-like object (e.g. a DataFrame) whose ``"message"`` entry
            iterates over raw e-mail strings.

    Returns:
        A list with the result of ``get_payload()`` for each parsed message,
        in input order.
    """
    return [
        email.message_from_string(raw_message).get_payload()
        for raw_message in df["message"]
    ]

In [None]:
# Payload of every Enron message, in row order.
bodies = extract_messages(emails)

In [None]:
# Frame built from the list of bodies; its first column is what the cleaning
# pipeline reads later on.
bodies_df = pd.DataFrame(bodies)
bodies_df.head()

In [None]:
# Read the whole fraud corpus as one string and cut it into per-message chunks
# on the literal separator "From r".
# NOTE(review): presumably every message starts with an mbox-style "From r..."
# line -- confirm against the file, since message text containing "From r"
# would be split as well.
with open(fraud_filepath, "r", encoding="latin1") as infile:
    data = infile.read()
fraud_emails = data.split("From r")
len(fraud_emails)

In [None]:
# Reuse extract_messages by wrapping the chunks in a frame with the "message"
# column it reads; [1:] skips the chunk that precedes the first separator.
fraud_bodies = extract_messages(
    pd.DataFrame(fraud_emails, columns=["message"], dtype=str)
)
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])
fraud_bodies_df.head()

In [None]:
# Show one parsed fraud-email body (column 0, row 0) as a sanity check.
print(fraud_bodies_df[0][0])

In [None]:
# Hyper-parameters shared by both experiments.
Nsamp = 1000  # rows sampled per class (and half the size of the IMDB subsample)
maxtokens = 50  # tokens kept per document; also passed as BERT's max_seq_length
maxtokenlen = 20  # characters kept per token

In [None]:
def tokenize(row, max_tokens=None):
    """Split text on single spaces and keep only the leading tokens.

    Args:
        row: Text to split; any other object is converted with ``str`` first.
            ``None`` and the empty string produce no tokens.
        max_tokens: Maximum number of tokens to keep. When omitted, the
            notebook-level ``maxtokens`` setting is used.

    Returns:
        A list of at most ``max_tokens`` strings. Empty input yields ``[]``
        (previously ``""``), so the return type is always a list.
    """
    if row is None or row == "":
        return []
    limit = maxtokens if max_tokens is None else max_tokens
    return str(row).split(" ")[:limit]

In [None]:
def reg_expressions(row, max_token_len=None):
    """Lower-case tokens, strip non-word characters and digits, and truncate.

    Args:
        row: Iterable of string tokens.
        max_token_len: Maximum number of characters kept per token. When
            omitted, the notebook-level ``maxtokenlen`` setting is used.

    Returns:
        A list of cleaned tokens. If ``row`` is not iterable, or a token is
        not a string, cleaning stops there and a single ``""`` is appended
        after the tokens cleaned so far.
    """
    limit = maxtokenlen if max_token_len is None else max_token_len
    tokens = []
    try:
        for token in row:
            cleaned = re.sub(r"[\W\d]", "", token.lower())
            tokens.append(cleaned[:limit])
    except (TypeError, AttributeError):
        # Best-effort handling of malformed rows (e.g. None, or a non-string
        # token). Only these two errors are swallowed, so genuine problems
        # such as a missing setting or a keyboard interrupt still surface.
        tokens.append("")
    return tokens

In [None]:
# Fetch the NLTK stop-word corpus that the next cell reads.
nltk.download("stopwords")

In [None]:
# English stop words consulted by stop_word_removal.
english_stopwords = stopwords.words("english")

In [None]:
def stop_word_removal(row, stop_words=None):
    """Drop stop words and empty tokens from a token sequence.

    Args:
        row: Iterable of tokens.
        stop_words: Container of words to remove. When omitted, the
            notebook-level ``english_stopwords`` list is used.

    Returns:
        A list of the remaining tokens in their original order. A list is
        returned (rather than a one-shot ``filter`` iterator) so the result
        can be iterated, indexed or printed more than once.
    """
    if stop_words is None:
        stop_words = english_stopwords
    return [token for token in row if token and token not in stop_words]

In [None]:
# Draw the Nsamp-row sample first: the three cleaning steps work row by row,
# so sampling up front yields the same kind of result while avoiding a pass
# over every message in the corpus that would be thrown away afterwards.
EnronEmails = bodies_df.iloc[:, 0].sample(Nsamp)
EnronEmails = EnronEmails.apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
SpamEmails = fraud_bodies_df.iloc[:, 0].sample(Nsamp)
SpamEmails = SpamEmails.apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
# Spam rows come first, then Enron rows; the label vector relies on this order.
raw_data = pd.concat((SpamEmails, EnronEmails), axis=0).values

In [None]:
# Sanity check: number of rows and the first few cleaned token lists.
print(raw_data.shape)
print(raw_data[:5])

In [None]:
# Labels laid out to match the concatenation order of raw_data: the first
# Nsamp rows (spam) get 1, the following Nsamp rows (Enron) get 0.
Categories = ["spam", "notspam"]
header = [1] * Nsamp + [0] * Nsamp

In [None]:
def unison_shuffle_data(data, header):
    """Shuffle `data` and `header` with one shared random permutation.

    Args:
        data: Array supporting integer-array indexing.
        header: Sequence of labels; its length defines the permutation size.

    Returns:
        ``(data, header)`` reordered identically, with ``header`` converted
        to a NumPy array.
    """
    order = np.random.permutation(len(header))
    shuffled_header = np.asarray(header)[order]
    return data[order], shuffled_header

### BERT training

In [None]:
# Shuffle rows and labels together, then use the first 70 % for training and
# the remainder for testing.
raw_data, header = unison_shuffle_data(raw_data, header)
idx = int(0.7 * raw_data.shape[0])
train_x, train_y = convert_data(raw_data[:idx], header[:idx])
test_x, test_y = convert_data(raw_data[idx:], header[idx:])

In [None]:
# Preview the first few joined training texts.
train_x[:5]

In [None]:
# Wrap texts and labels in InputExample objects for feature conversion.
train_examples = convert_text_to_examples(train_x, train_y)
test_examples = convert_text_to_examples(test_x, test_y)

In [None]:
# Encode both splits as BERT inputs; `maxtokens` doubles as the sequence
# length, which includes the [CLS] and [SEP] positions.
(
    train_input_ids,
    train_input_masks,
    train_segment_ids,
    train_labels
) = convert_examples_to_features(
    tokenizer,
    train_examples,
    max_seq_length=maxtokens
)
(
    test_input_ids,
    test_input_masks,
    test_segment_ids,
    test_labels
) = convert_examples_to_features(
    tokenizer,
    test_examples,
    max_seq_length=maxtokens
)

In [None]:
# Build the classifier with inputs of the same length used for the features.
model = build_model(maxtokens)

In [None]:
# Run the initializers in the shared session and register it with Keras.
initialize_vars(sess)

In [None]:
# Train for 5 epochs; the held-out test split is used as validation data.
history = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=(
        [test_input_ids, test_input_masks, test_segment_ids],
        test_labels
    ),
    epochs=5,
    batch_size=32
)

## IMDB movie reviews dataset

In [None]:
def load_data(path):
    """Load, clean and shuffle the reviews under ``path/neg`` and ``path/pos``.

    Every file is read, tokenized, stripped of stop words and normalised with
    the same helpers used for the e-mail corpora.

    Args:
        path: Directory containing the ``neg`` and ``pos`` sub-folders.

    Returns:
        ``(data, sentiments)`` shuffled in unison: ``data`` is a 1-D object
        array holding one token list per review, ``sentiments`` holds 0 for
        files from ``neg`` and 1 for files from ``pos``.
    """
    data, sentiments = [], []
    for folder, sentiment in (("neg", 0), ("pos", 1)):
        folder_path = os.path.join(path, folder)
        for name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, name)
            # Explicit encoding so the result does not depend on the platform
            # default. NOTE(review): assumes the review files are UTF-8 --
            # confirm for the dataset in use.
            with open(file_path, "r", encoding="utf-8") as reader:
                text = reader.read()
            tokens = reg_expressions(stop_word_removal(tokenize(text)))
            data.append(tokens)
            sentiments.append(sentiment)

    # Token lists differ in length. Fill a 1-D object array element by element
    # instead of calling np.array(data), which rejects ragged input on recent
    # NumPy versions and would produce a 2-D array when all lengths coincide.
    data_np = np.empty(len(data), dtype=object)
    for position, tokens in enumerate(data):
        data_np[position] = tokens
    return unison_shuffle_data(data_np, sentiments)

In [None]:
# Load the IMDB training split; this rebinds `raw_data` from the e-mail section.
train_path = os.path.join("..", "data", "aclImdb", "train")
raw_data, raw_header = load_data(train_path)

In [None]:
# Sanity check: the data and label counts should agree.
print(raw_data.shape)
print(len(raw_header))

In [None]:
# Draw 2 * Nsamp distinct row indices (replace=False) and subsample data and
# labels with the same indices so the two stay aligned.
random_indices = np.random.choice(
    range(len(raw_header)), size=(Nsamp * 2,),
    replace=False
)
raw_data = raw_data[random_indices]
raw_header = raw_header[random_indices]

In [None]:
# Check that the random subsample is roughly class-balanced by printing the
# distinct labels and how often each occurs (nothing is rebalanced here).
unique_elements, counts_elements = np.unique(raw_header, return_counts=True)
print(unique_elements)
print(counts_elements)

### BERT training

In [None]:
# Shuffle rows and labels together, then use the first 70 % for training and
# the remainder for testing.
raw_data, raw_header = unison_shuffle_data(raw_data, raw_header)
idx = int(0.7 * raw_data.shape[0])
train_x, train_y = convert_data(raw_data[:idx], raw_header[:idx])
test_x, test_y = convert_data(raw_data[idx:], raw_header[idx:])

In [None]:
# Wrap texts and labels in InputExample objects for feature conversion.
train_examples = convert_text_to_examples(train_x, train_y)
test_examples = convert_text_to_examples(test_x, test_y)

In [None]:
# Encode both IMDB splits as BERT inputs, again with `maxtokens` as the
# sequence length; this rebinds the feature arrays of the e-mail experiment.
(
    train_input_ids,
    train_input_masks,
    train_segment_ids,
    train_labels
) = convert_examples_to_features(
    tokenizer,
    train_examples,
    max_seq_length=maxtokens
)
(
    test_input_ids,
    test_input_masks,
    test_segment_ids,
    test_labels
) = convert_examples_to_features(
    tokenizer,
    test_examples,
    max_seq_length=maxtokens
)

In [None]:
# Build a fresh classifier for the IMDB run; rebinds `model`.
model = build_model(maxtokens)

In [None]:
# Run the initializers again in the shared session.
# NOTE(review): this presumably also re-initialises the variables of the
# model trained in the e-mail section -- confirm if that model is still needed.
initialize_vars(sess)

In [None]:
# Train for 5 epochs; the held-out test split is used as validation data.
history = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=(
        [test_input_ids, test_input_masks, test_segment_ids],
        test_labels
    ),
    epochs=5,
    batch_size=32
)