This notebook contains the code to use BERT with an extra layer of nodes added at the output to predict whether a movie review is positive or negative. Hopefully it will show the improved performance due to transfer learning, at the cost of increased training time and model size compared to the models in the other notebook in this repo.

I hope to demonstrate some of the pros and cons of using BERT compared to simpler models and vectorisation techniques.
I have also commented the code myself to demonstrate my understanding of what is required to fine-tune BERT for a specific task.

Training takes approx. 1 hour with a GPU on Google Colab.
Unfortunately the model file is too large to send via email, so you can't skip the training.

## MAKE SURE YOU ENABLE GPU FOR GOOGLE COLAB!!! 

In [None]:
!pip install bert-for-tf2

In [None]:

# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split




from tensorflow import keras
import os
import re

# Print the TensorFlow and TensorFlow Hub versions used for this run.
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
# Let pandas show up to 1000 characters per column so long reviews are not cut off.
pd.set_option('display.max_colwidth',1000)

## Load Data
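Download the aclImdb movie review dataset with `tf.keras.utils.get_file` and load the train and test splits into pandas DataFrames.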

In [None]:

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  """Read every review file in `directory` into a DataFrame.

  File names are expected to look like ``<id>_<rating>.txt``. For each file
  the text goes into the ``sentence`` column and the rating digits (kept as
  a string) into the ``sentiment`` column.

  Args:
    directory: path of the folder holding the review files.

  Returns:
    pandas.DataFrame with columns ``sentence`` and ``sentiment``, one row
    per file, ordered by file name.

  Raises:
    AttributeError: if a file name does not match the expected pattern.
  """
  # Raw string: "\d" inside a normal string literal is an invalid escape
  # sequence and triggers a warning on recent Python versions.
  name_pattern = re.compile(r"\d+_(\d+)\.txt")
  data = {"sentence": [], "sentiment": []}
  # os.listdir returns entries in arbitrary order; sorting makes the row
  # order (and anything seeded downstream) repeatable between runs.
  for file_path in sorted(os.listdir(directory)):
    with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(name_pattern.match(file_path).group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples and add a polarity column.
def load_dataset(directory):
  """Load the ``pos`` and ``neg`` sub-folders of `directory` into one DataFrame.

  Positive reviews get ``polarity`` 1 and negative reviews ``polarity`` 0.
  The rows are NOT shuffled here: all positive rows come first, followed by
  all negative rows.

  Args:
    directory: folder that contains the ``pos`` and ``neg`` sub-folders.

  Returns:
    pandas.DataFrame with the columns produced by ``load_directory_data``
    plus ``polarity``, indexed 0..n-1.
  """
  pos_df = load_directory_data(os.path.join(directory, "pos")).assign(polarity=1)
  neg_df = load_directory_data(os.path.join(directory, "neg")).assign(polarity=0)
  # ignore_index avoids duplicate index labels (both frames start at 0),
  # which would make label-based lookups return two rows.
  return pd.concat([pos_df, neg_df], ignore_index=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Fetch the aclImdb archive and return ``(train_df, test_df)``.

  The archive is fetched and extracted via ``tf.keras.utils.get_file``. The
  ``train`` and ``test`` splits are then read from the ``aclImdb`` folder
  located next to the path that call returns, and the ``sentiment`` column
  is dropped from both frames.

  NOTE(review): `force_download` is accepted but never read by this function.
  """
  archive_path = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz",
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)
  imdb_root = os.path.join(os.path.dirname(archive_path), "aclImdb")

  # Load train first, then test, then strip the rating column from each.
  frames = [load_dataset(os.path.join(imdb_root, split))
            for split in ("train", "test")]
  train_df, test_df = [frame.drop(columns=['sentiment']) for frame in frames]
  return train_df, test_df

In [None]:
# Download the dataset and load the train and test DataFrames.
train, test = download_and_load_datasets()


In [None]:
# Show the first two training rows as a quick sanity check of the loaded data.
train.head(2)

## Preprocessing
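BERT builds its input representation from three embeddings: token, segment and position. The TF Hub BERT layer used below takes three integer arrays as input (token ids, an input mask and segment ids), so here we define the functions that create them.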

In [None]:
# Functions for constructing the BERT inputs: input_ids, input_masks and input_segments

# Every input array is padded to this many positions.
MAX_SEQ_LEN = 500  # max sequence length


def get_masks(tokens, max_seq_len=None):
    """Build the input mask: 1 for each real token, 0 for each padding slot.

    Args:
        tokens: sequence of tokens; only its length is used.
        max_seq_len: total length of the returned list. Defaults to the
            module-level ``MAX_SEQ_LEN``.

    Returns:
        list[int] of length ``max_seq_len``.

    Raises:
        ValueError: if there are more tokens than ``max_seq_len``. Without
            this check the padding count would be negative and the result
            would silently be longer than ``max_seq_len``.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    n_tokens = len(tokens)
    if n_tokens > max_seq_len:
        raise ValueError(
            "got %d tokens but max_seq_len is %d" % (n_tokens, max_seq_len))
    return [1] * n_tokens + [0] * (max_seq_len - n_tokens)
 

In [None]:
# This function creates the segment ids. BERT is pre-trained on pairs of
# sentences (to predict masked words and the next sentence), so its input can
# hold two sequences: 0 marks the first sequence and 1 marks the second.

def get_segments(tokens, max_seq_len=None):
    """Segments: 0 for the first sequence, 1 for the second, 0 for padding.

    Tokens up to and including the first "[SEP]" get segment id 0; every
    token after it gets segment id 1.

    Args:
        tokens: sequence of token strings, including any "[SEP]" markers.
        max_seq_len: total length of the returned list. Defaults to the
            module-level ``MAX_SEQ_LEN``.

    Returns:
        list[int] of length ``max_seq_len``.

    Raises:
        ValueError: if there are more tokens than ``max_seq_len``. Without
            this check the result would silently be longer than
            ``max_seq_len``.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    if len(tokens) > max_seq_len:
        raise ValueError(
            "got %d tokens but max_seq_len is %d" % (len(tokens), max_seq_len))

    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        # The separator itself still belongs to the sequence it closes.
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_len - len(tokens))

In [None]:
## gets token ids from BERT's vocabulary
def get_ids(tokens, tokenizer, max_seq_len=None):
    """Map tokens to vocabulary ids and pad the result with 0.

    Args:
        tokens: sequence of token strings.
        tokenizer: object providing ``convert_tokens_to_ids(tokens)``.
        max_seq_len: total length of the returned list. Defaults to the
            module-level ``MAX_SEQ_LEN``.

    Returns:
        list[int] of length ``max_seq_len``.

    Raises:
        ValueError: if there are more ids than ``max_seq_len``. Without this
            check the result would silently be longer than ``max_seq_len``.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    if len(token_ids) > max_seq_len:
        raise ValueError(
            "got %d token ids but max_seq_len is %d" % (len(token_ids), max_seq_len))
    return token_ids + [0] * (max_seq_len - len(token_ids))

In [None]:
## tokenize the input, cut it to the max length, then build the ids, mask and segment ids
def create_single_input(sentence, tokenizer, max_len):
    """Turn one piece of text into ``(ids, masks, segments)``.

    The text is tokenized, truncated to at most `max_len` tokens and then
    wrapped in "[CLS]" ... "[SEP]", so the framed sequence has up to
    ``max_len + 2`` tokens before padding.
    """
    word_pieces = tokenizer.tokenize(sentence)[:max_len]
    framed = ["[CLS]"] + word_pieces + ["[SEP]"]

    return (
        get_ids(framed, tokenizer),
        get_masks(framed),
        get_segments(framed),
    )

In [None]:
 ## create features out of whole movie review, NOT JUST FIRST 2 SENTENCES!!
def convert_sentences_to_features(sentences, tokenizer):
    """Convert sentences to features: input_ids, input_masks and input_segments"""
    input_ids, input_masks, input_segments = [], [], []
 
    for sentence in tqdm(sentences,position=0, leave=True):
        ids,masks,segments=create_single_input(sentence,tokenizer,MAX_SEQ_LEN-2)
        assert len(ids) == MAX_SEQ_LEN
        assert len(masks) == MAX_SEQ_LEN
        assert len(segments) == MAX_SEQ_LEN
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32), 
          np.asarray(input_masks, dtype=np.int32), 
          np.asarray(input_segments, dtype=np.int32)]

In [None]:
## use the BERT tokenizer by loading the vocabulary and casing flag shipped with the hub layer
def create_tonkenizer(bert_layer):
    """Build a ``FullTokenizer`` from the hub layer's vocab file and lower-casing flag."""
    bert_assets = bert_layer.resolved_object
    return bert.bert_tokenization.FullTokenizer(
        bert_assets.vocab_file.asset_path.numpy(),
        bert_assets.do_lower_case.numpy(),
    )

## Create an instance of the BERT model
- On top of BERT, add a hidden layer of 768 ReLU nodes (with dropout) and an output layer of 2 softmax nodes

In [None]:
def nlp_model(callable_object):
    """Build the classifier: a TF Hub BERT layer followed by a small dense head.

    Args:
        callable_object: handle passed to ``hub.KerasLayer`` (e.g. a TF Hub URL).

    Returns:
        An uncompiled Keras ``Model`` that takes
        [input_ids, input_masks, segment_ids], each of length MAX_SEQ_LEN,
        and outputs a 2-way softmax.
    """
    # Pre-trained encoder; trainable=True so its weights are updated during training too.
    encoder = hub.KerasLayer(handle=callable_object, trainable=True)

    # One int32 input per BERT feature, in the order the encoder is called with.
    bert_inputs = [
        Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=feature_name)
        for feature_name in ("input_ids", "input_masks", "segment_ids")
    ]

    # Only the first (pooled) output feeds the head; the second output is unused.
    pooled_output, _ = encoder(bert_inputs)

    # Hidden layer with light dropout, then the 2-class softmax output.
    hidden = Dense(units=768, activation='relu')(pooled_output)
    hidden = Dropout(0.1)(hidden)
    probabilities = Dense(2, activation="softmax")(hidden)

    return Model(inputs=bert_inputs, outputs=probabilities)

# Build the classifier on top of the uncased BERT-base (L-12, H-768, A-12) TF Hub module.
model = nlp_model("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1")
# Print the layer-by-layer summary of the assembled model.
model.summary()

Let's check that we are creating the features correctly for a review.

In [None]:
# Encode the first training review and print the three arrays to eyeball them.
review = train['sentence'].head(1)
# NOTE(review): model.layers[3] is presumably the hub.KerasLayer created in nlp_model
# (after the three Input layers) - confirm against the model.summary() output.
tokenizer = create_tonkenizer(model.layers[3])
features = convert_sentences_to_features(review, tokenizer)
                    
print(review)
print('token_ids : ',features[0])
print('mask embeddings : ', features[1])
print('segment ids : ', features[2])

## Model Training
The features look fine, so let's train the model.


In [None]:




# Hold out 25% of the labelled training data for validation.
# - The fixed seed makes the shuffle repeatable for a given input frame.
# - Stratifying keeps the positive/negative ratio the same in both parts.
# - `train` itself is left untouched, so re-running this cell gives the same split.
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    train,
    test_size=0.25,
    shuffle=True,
    random_state=SPLIT_SEED,
    stratify=train['polarity'],
)

# Find the BERT hub layer by type instead of relying on its position in model.layers.
bert_layer = next(layer for layer in model.layers
                  if isinstance(layer, hub.KerasLayer))
tokenizer = create_tonkenizer(bert_layer)

# Encode each split as [input_ids, input_masks, input_segments].
X_train = convert_sentences_to_features(train_df['sentence'], tokenizer)
X_val = convert_sentences_to_features(val_df['sentence'], tokenizer)
X_test = convert_sentences_to_features(test['sentence'], tokenizer)

# One-hot encode the 0/1 polarity labels to match the model's 2-unit softmax output.
y_train = to_categorical(train_df['polarity'].values)
y_val = to_categorical(val_df['polarity'].values)
y_test = to_categorical(test['polarity'].values)





In [None]:
# Train the model
# Training hyper-parameters: examples per gradient step and passes over the training set.
BATCH_SIZE = 8
EPOCHS = 2

# Use Adam optimizer to minimize the categorical_crossentropy loss
# (the labels are one-hot encoded and the model ends in a 2-unit softmax).
opt = Adam(learning_rate=2e-5)
model.compile(optimizer=opt, 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

# Fit the data to the model; the validation split is evaluated alongside training.
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    verbose = 1)

# Save the trained model to 'nlp_model.h5' in the current working directory.
model.save('nlp_model.h5')

## Evaluate Model Performance

In [None]:

# Load the trained nlp_model back from the file written by the training cell.
# custom_objects maps the 'KerasLayer' name to hub.KerasLayer for the loader.
from tensorflow.keras.models import load_model
new_model = load_model('nlp_model.h5',custom_objects={'KerasLayer':hub.KerasLayer})

In [None]:
# Predict on test dataset
from sklearn.metrics import classification_report
# argmax over axis 1 turns each row of class probabilities into a class index.
pred_test = np.argmax(new_model.predict(X_test), axis=1)

In [None]:
# Convert the one-hot test labels back to class indices and print the classification report.
print(classification_report(np.argmax(y_test,axis=1), pred_test))
