# Stage 1: Importing dependencies

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-params>=0.9.6 (from bert-for-tf2)
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting params-flow>=0.8.0 (from bert-for-tf2)
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30515 sha256=a814f1ccad8fe64c7f4b786f2da6a8a9406c2646550148557a271ef0ca9a7c58
  Stored in directory: /root/.cache/pip/wheel

In [None]:
# Legacy Colab line magic that requests TensorFlow 2.x. The recorded cell
# output reports that it has no effect on current Colab runtimes. Any error
# raised by the magic is deliberately ignored so the imports below still run.
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [None]:
# Mount Google Drive under /content/drive so the Drive paths used later in the
# notebook resolve.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# The CSV has no header row, so the column names are supplied explicitly.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Read the training set from the mounted Drive folder.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/Learn BERT - most powerful NLP algorithm by Google/training.csv",
    names=cols,
    header=None,
    encoding="latin1",
    engine="python",
)

In [None]:
# Keep only the label and the tweet text.
# Selecting the wanted columns into a new frame (rather than dropping the
# others in place) makes this cell safe to re-run: the in-place drop raised a
# KeyError on a second execution because the columns were already gone.
data = data[["sentiment", "text"]]

In [None]:
# Preview the first five rows of the reduced frame.
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Preprocessing

### Cleaning

In [None]:
def clean_tweet(tweet):
    """Normalize a raw tweet into plain text for tokenization.

    Steps, in order: strip HTML markup, remove @mentions, remove URLs, replace
    every character that is not a letter or one of ``. ! ? '`` with a space,
    and collapse runs of spaces.

    Args:
        tweet: Raw tweet text (may contain HTML markup, mentions and links).

    Returns:
        The cleaned text as a ``str``.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Remove @mentions. Handles may contain underscores; without `_` in the
    # class, "@some_user" would leave "user" behind in the text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Remove URLs up to the next whitespace, so that characters such as
    # "-", "_", "?" or "=" inside a link do not leave fragments behind.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keep only letters and the punctuation that carries sentiment.
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Collapse repeated spaces introduced by the substitutions above.
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Apply the cleaning function to every tweet, keeping the original row order.
data_clean = list(map(clean_tweet, data.text))

  tweet = BeautifulSoup(tweet, "lxml").get_text()


In [None]:
# Build binary labels: the raw value 4 becomes 1, every other value is kept.
# np.where creates a new array, so `data` itself is left untouched. The
# previous in-place assignment wrote through `.values` into the DataFrame,
# which silently changes `data.sentiment` and is expected to fail once pandas
# returns a read-only array there (Copy-on-Write).
raw_sentiment = data.sentiment.values
data_labels = np.where(raw_sentiment == 4, 1, raw_sentiment)

### Tokenization

We need to create a BERT layer to get access to the metadata the tokenizer needs (such as the vocabulary file and the lower-casing flag).

In [None]:
# Tokenizer class provided by the bert-for-tf2 package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Load the BERT layer from TF Hub. It is only used here as a source of
# tokenizer assets, so it is created as non-trainable.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Read the vocabulary file path and the lower-casing flag from the loaded object.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Build the tokenizer from those two settings.
tokenizer = FullTokenizer(vocab_file, do_lower_case)

### Bert models
https://tfhub.dev/google/collections/bert/1

In [None]:
# Sanity check: show the tokens produced for a sample sentence.
tokenizer.tokenize('My dog loves strawberries')

['my', 'dog', 'loves', 'straw', '##berries']

In [None]:
# Same sample sentence, converted from tokens to vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('My dog loves strawberries'))

[2026, 3899, 7459, 13137, 20968]

In [None]:
def encode_sentence(sent):
    """Turn a sentence into a list of vocabulary ids.

    The sentence is split into tokens by the global ``tokenizer`` and each
    token is mapped to its id; nothing else is added to the sequence here.
    """
    word_pieces = tokenizer.tokenize(sent)
    return tokenizer.convert_tokens_to_ids(word_pieces)

In [None]:
# Encode every cleaned tweet into its list of token ids (order preserved).
data_inputs = list(map(encode_sentence, data_clean))

### Dataset creation

We will create padded batches (padding the sentences of each batch independently) so that we add as few padding tokens as possible. To do that, we sort the sentences by length, apply `padded_batch`, and then shuffle the batches.

In [None]:
# Pair every encoded sentence with its label and its token count.
data_with_len = [[token_ids, label, len(token_ids)]
                 for token_ids, label in zip(data_inputs, data_labels)]
# Shuffle first so that, after the (stable) sort by length, sentences of equal
# length appear in random order.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda row: row[2])
# Keep only sentences longer than 7 tokens, as (token ids, label) pairs.
sorted_all = [(token_ids, label)
              for token_ids, label, n_tokens in data_with_len
              if n_tokens > 7]

In [None]:
# A list is an iterable (not an iterator), so a lambda that returns it can be
# used as the generator callable: the dataset yields the (token ids, label)
# pairs of `sorted_all`, both as int32.
# NOTE(review): `output_types` may be deprecated in favour of
# `output_signature` in recent TensorFlow releases — confirm against the
# installed version.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [None]:
# Inspect the first (token ids, label) element of the dataset.
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([2035, 2253, 2092, 4067, 2643,  999,  999,  999], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [None]:
# Number of sentences per batch.
BATCH_SIZE = 32
# Batch the sentences, padding the token-id sequences within each batch;
# labels are scalars and need no padding.
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [None]:
# Inspect the first padded batch.
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2035,  2253,  2092,  4067,  2643,   999,   999,   999],
        [ 2082,  2153,  2044,  5353,  3084,  2033,  2025,  3407],
        [ 2129,  1005,  1055,  1996,  1038, 10259,   999,  1029],
        [ 2025,  2000,  5254,  1996,  2489,  8974,   999,   999],
        [ 2339,  2024,  1057,  2061, 10140,  1029,  5292,  3270],
        [ 3870,  1045,  2031,  2216,   999,  1999,  3897,  2205],
        [ 1045,  2941,  5223,  2033,  2166,  1012,  1012,  1012],
        [ 2188,  8434,  7975,  4372,  5428, 27266,  3022,   999],
        [ 2074,  2318,  1996,  2279, 13132,  2338,  1012,  5305],
        [ 1045,  1005,  1049,  2183,  2000,  3637, 22708,  9119],
        [ 2097,  3113,  2039,  2007,   999,  9061,  2869,   999],
        [ 2851, 10474,   999, 15315, 13669,  2651,  2025,  4826],
        [ 8038,  2100,  2005, 26587,  1998,  2924,  6209, 13499],
        [ 8038,  3363, 10047, 17111,  9541, 11471,  2157,  2085],
        [ 4390,  2003,  1045

In [None]:
# Total number of batches, and the share (10%) held out for testing.
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10

# Fixed seed so the train/test split is reproducible.
SHUFFLE_SEED = 42

# Dataset.shuffle returns a NEW dataset; the previous code discarded that
# result, so the batches stayed sorted by length and the test set was simply
# the shortest sentences.
# reshuffle_each_iteration=False keeps the same order on every pass over the
# data. Without it, take()/skip() would select different batches each epoch
# and test batches would leak into training.
all_shuffled = all_batched.shuffle(NB_BATCHES,
                                   seed=SHUFFLE_SEED,
                                   reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

# Stage 3: Model building

In [None]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier over sequences of token ids.

    Token ids are embedded, passed through three parallel 1-D convolutions
    (kernel sizes 2, 3 and 4), each max-pooled over the sequence axis, then
    concatenated and fed to a dense layer, dropout and an output layer.

    Args:
        vocab_size: Number of distinct token ids (rows of the embedding table).
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters for EACH of the three kernel sizes.
        FFN_units: Number of units in the hidden dense layer.
        nb_classes: Number of target classes. With 2 classes the model ends in
            a single sigmoid unit; otherwise in a softmax over `nb_classes`.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        training: Unused; kept only so existing callers that pass it still work.
        name: Name of the Keras model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super().__init__(name=name)

        # Token id -> dense vector of size emb_dim.
        self.embedding = layers.Embedding(vocab_size, emb_dim)

        # Three feature extractors looking at 2, 3 and 4 consecutive tokens.
        # padding="valid" means the input must be at least kernel_size long.
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # Shared pooling layer (it has no weights): max over the sequence axis.
        self.pool = layers.GlobalMaxPool1D()

        # Classification head: hidden dense layer, dropout, output layer.
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            # Binary case: a single probability.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # Multi-class case: one probability per class.
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Integer tensor of token ids, shape (batch_size, seq_len).
            training: Whether dropout is active. Defaults to None so the model
                can also be called without the flag, letting Keras decide.

        Returns:
            Tensor of shape (batch_size, 1) for the binary model, or
            (batch_size, nb_classes) otherwise.
        """
        x = self.embedding(inputs)       # (batch_size, seq_len, emb_dim)
        x_1 = self.bigram(x)             # (batch_size, seq_len - 1, nb_filters)
        x_1 = self.pool(x_1)             # (batch_size, nb_filters)
        x_2 = self.trigram(x)            # (batch_size, seq_len - 2, nb_filters)
        x_2 = self.pool(x_2)             # (batch_size, nb_filters)
        x_3 = self.fourgram(x)           # (batch_size, seq_len - 3, nb_filters)
        x_3 = self.pool(x_3)             # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        # Pass the flag by keyword so it cannot be mistaken for another argument.
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

# Stage 4: Training

In [None]:
# Hyper-parameters for the model and the training run.
VOCAB_SIZE = len(tokenizer.vocab)  # number of entries in the tokenizer vocabulary
EMB_DIM = 200                      # passed as emb_dim
NB_FILTERS = 100                   # passed as nb_filters
FFN_UNITS = 256                    # passed as FFN_units
NB_CLASSES = 2                     # passed as nb_classes; also selects the loss at compile time

DROPOUT_RATE = 0.2                 # passed as dropout_rate

NB_EPOCHS = 5                      # number of epochs passed to fit()

In [None]:
# Instantiate the model with the hyper-parameters defined above.
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
# Choose the loss and metric that match the output layer: a single sigmoid
# unit for two classes, a softmax over the classes otherwise.
# NOTE(review): the recorded training log reports `accuracy`, so the saved
# outputs were presumably produced with a different metrics setting — confirm.
if NB_CLASSES == 2:
    loss_name, metric_names = "binary_crossentropy", ["AUC"]
else:
    loss_name, metric_names = ("sparse_categorical_crossentropy",
                               ["sparse_categorical_accuracy"])

Dcnn.compile(loss=loss_name,
             optimizer="adam",
             metrics=metric_names)


### Metrics for training
https://www.tensorflow.org/api_docs/python/tf/keras/metrics

### Optimisers
https://github.com/ryanxjhan/TensorFlow-2.x-Cheat-Sheet#optimizers

In [None]:
# Show the current working directory; the relative checkpoint path used in
# the next cell is resolved against it.
pwd

'/content'

In [None]:
# Directory (relative to the working directory) where checkpoints are kept.
# Alternative location used previously: "./drive/MyDrive/projects/BERT/ckpt_bert_tok/"
checkpoint_path = "./drive/MyDrive/Colab Notebooks/Learn BERT - most powerful NLP algorithm by Google"

# Track the model, and let the manager keep only the most recent checkpoint.
ckpt = tf.train.Checkpoint(model=Dcnn)
ckpt_manager = tf.train.CheckpointManager(checkpoint=ckpt,
                                          directory=checkpoint_path,
                                          max_to_keep=1)

# Resume from the newest checkpoint when one exists in that directory.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print("Latest Checkpoint restored!")

TypeError: ignored

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that writes a checkpoint whenever an epoch finishes."""

    def on_epoch_end(self, epoch, logs=None):
        """Save through the global ``ckpt_manager`` and report the target path."""
        ckpt_manager.save()
        notice = "Checkpoint saved at {}.".format(checkpoint_path)
        print(notice)

In [None]:
# Train on the training batches; the callback saves a checkpoint after every epoch.
# NOTE(review): the recorded output ends with a NameError after the first
# epoch; presumably `ckpt_manager` was undefined because the checkpoint cell
# above had failed — confirm by re-running from a fresh kernel.
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
  37196/Unknown - 541s 14ms/step - loss: 0.4296 - accuracy: 0.8024

NameError: ignored

# Stage 5: Evaluation

In [None]:
# Evaluate on the held-out batches and print the returned values
# (the recorded output shows two numbers; presumably loss followed by the metric).
results = Dcnn.evaluate(test_dataset)
print(results)

[0.3739871382713318, 0.917824923992157]


In [None]:
def get_prediction(sentence):
    """Print the model output and the predicted sentiment for one sentence.

    Only the binary model (single sigmoid output) is supported: the score is
    compared against 0.5.

    Args:
        sentence: Raw text to classify. It is tokenized with
            ``encode_sentence`` as-is (no tweet cleaning is applied here).

    Returns:
        None. The result is printed.
    """
    tokens = encode_sentence(sentence)

    # The model's widest convolution uses kernel_size=4 with padding="valid",
    # so shorter inputs (including an empty sentence) would make it fail.
    # Pad with id 0, the value padded_batch uses by default for training batches.
    min_len = 4
    tokens = tokens + [0] * max(0, min_len - len(tokens))
    inputs = tf.expand_dims(tokens, 0)

    output = Dcnn(inputs, training=False)

    # Reduce the (1, 1) output to a plain float. For a multi-class output this
    # conversion raises, as the previous implementation also did.
    score = float(tf.squeeze(output))

    # Threshold at 0.5. The previous floor(output * 2) returned 2 when the
    # sigmoid saturated at exactly 1.0, so nothing at all was printed.
    sentiment = 1 if score >= 0.5 else 0

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative.".format(
            output))
    else:
        print("Output of the model: {}\nPredicted sentiment: positive.".format(
            output))

In [None]:
# Try the classifier on a sample sentence.
get_prediction("This movie was pretty interesting.")

Output of the model: [[0.9032021]]
Predicted sentiment: positive.


In [None]:
# Try the classifier on a second sample sentence.
get_prediction("I'd rather not do that again.")

Output of the model: [[0.35526103]]
Predicted sentiment: negative.
