Goal: Fine-tune [BERT](https://arxiv.org/pdf/1810.04805.pdf) on the Kaggle `nlp-getting-started` disaster tweet dataset, following the [TensorFlow Hub docs](https://www.tensorflow.org/official_models/fine_tuning_bert) and [this Kaggle notebook](https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub).

Following Géron's 2019 text, we also analyze the model in detail using confusion matrices, precision–recall (PR) curves, and the ROC curve.

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split

import tensorflow_hub as hub

# Load the required submodules
from  official.nlp.bert import tokenization

In [2]:
from pathlib import Path

# The project root is taken to be the parent of the kernel's working directory
# (all data and model paths below are built from it).
working_dir = Path("").resolve()
project_dir = working_dir.parents[0]

## Download data

The `!`-prefixed shell commands below were written for Ubuntu and assume the `kaggle` command-line tool is installed.

In [6]:
# Create <project root>/data/raw, including any missing parent directories.
# Uses pathlib instead of a chained shell command so the cell does not depend on
# a POSIX shell; it is a no-op when the directory already exists.
(project_dir / "data/raw").mkdir(parents=True, exist_ok=True)

In [None]:
# Change into ../data/raw and download the `nlp-getting-started` competition
# files there with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!cd ../data/raw  && kaggle competitions download -c nlp-getting-started

In [10]:
import zipfile

# Unpack the downloaded competition archive next to itself in data/raw.
raw_data_dir = project_dir / "data/raw"
with zipfile.ZipFile(raw_data_dir / "nlp-getting-started.zip") as archive:
    archive.extractall(raw_data_dir)

In [11]:
# List the raw data directory to check which files are present after extraction.
!ls "../data/raw"

nlp-getting-started.zip  sample_submission.csv	test.csv  train.csv


In [3]:
# Read the three competition CSV files from data/raw into data frames.
raw_csv_dir = project_dir / "data/raw"
submission = pd.read_csv(raw_csv_dir / "sample_submission.csv")
train = pd.read_csv(raw_csv_dir / "train.csv")
test = pd.read_csv(raw_csv_dir / "test.csv")

In [4]:
# Report (rows, columns) for each loaded data frame.
for label, frame in (("Train", train), ("Test", test), ("Submission", submission)):
    print(f"{label} shape: {frame.shape}")

Train shape: (7613, 5)
Test shape: (3263, 4)
Submission shape: (3263, 2)


## Helper Functions

In [5]:
def preprocess(texts):
    """
    Clean raw tweet texts before tokenization.

    Krishan added this on 2019,09,27 based on "https://www.kaggle.com/gunesevitan/nlp-with-disaster-tweets-eda-cleaning-and-bert".

    For every text, in this order:
      1. each shortened ``t.co`` link (``http`` or ``https``) is replaced by one space,
      2. each newline is replaced by one space,
      3. every character other than an ASCII letter or an apostrophe is replaced
         by one space.

    Replacements are one-for-one, so runs of spaces are NOT collapsed.

    Args:
        texts: iterable of tweets, each a ``str`` or UTF-8 encoded ``bytes``.

    Returns:
        list[bytes]: the cleaned texts, UTF-8 encoded, in input order.
    """
    # Local import keeps this helper self-contained within its notebook cell.
    import re

    # Raw strings avoid invalid escape sequences; the dot in "t.co" is escaped so
    # that only literal t.co links are treated as URLs.
    url_pattern = re.compile(r"https?://t\.co/[A-Za-z0-9]+")
    disallowed_pattern = re.compile(r"[^a-zA-Z']")

    processed_texts = []
    for text in texts:
        if isinstance(text, bytes):
            text = text.decode("utf-8")

        # First remove urls:
        processed_text = url_pattern.sub(" ", text)

        # Remove new lines:
        processed_text = processed_text.replace("\n", " ")

        # Finally get rid of anything that is not a letter or an apostrophe:
        processed_text = disallowed_pattern.sub(" ", processed_text)

        # Return bytes, which is what the rest of the notebook receives from this helper.
        processed_texts.append(processed_text.encode("utf-8"))

    return processed_texts

In [6]:
def bert_encode(texts, tokenizer, max_len=512):
    """
    Turn texts into the three fixed-length integer inputs fed to the BERT layer.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` ... ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: iterable of texts accepted by ``tokenizer.tokenize``.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row.

    Returns:
        Tuple of three ``np.ndarray`` (one row per text):
        token ids, attention mask (1 for real tokens, 0 for padding) and
        segment ids (all zeros).
    """
    token_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # Leave room for the two special tokens.
        word_pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + word_pieces + ["[SEP]"]

        n_real = len(sequence)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids.extend([0] * n_pad)

        token_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [7]:
def build_model(bert_layer, max_len=512, dropout_rate=0.5, learning_rate=1e-5):
    """
    Build and compile a binary classifier on top of a BERT Keras layer.

    The model takes three int32 inputs of shape ``(max_len,)`` (word ids, mask,
    segment ids), runs them through ``bert_layer``, takes the sequence output at
    position 0 (where ``bert_encode`` places the ``[CLS]`` token), applies
    dropout and a single sigmoid unit.

    Args:
        bert_layer: callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, the second being the per-token sequence output.
        max_len: sequence length of the encoded inputs.
        dropout_rate: dropout applied to the ``[CLS]`` vector before the
            output layer.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` ``Model`` with binary cross-entropy loss and
        accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The first output of the layer is not used.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Use the representation of the first token as the sentence embedding.
    clf_output = sequence_output[:, 0, :]
    out = tf.keras.layers.Dropout(dropout_rate)(clf_output)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases no longer accept.
    optimizer = Adam(learning_rate=learning_rate, epsilon=1e-08)
    model.compile(optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

## Load and Preprocess

- Load BERT from the Tensorflow Hub
- Load CSV files containing training data
- Load tokenizer from the BERT layer
- Encode the text into tokens, masks, and segment flags

In [8]:
# Fixed length (in tokens, including [CLS] and [SEP]) of every encoded input;
# passed to both bert_encode and build_model.
MAX_LEN = 160

In [10]:
%%time
# Load the cased BERT-base encoder (L-12, H-768, A-12) from TensorFlow Hub.
# trainable=True marks the layer's weights as trainable so they are updated by model.fit.
module_url = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 6.13 s, sys: 1.42 s, total: 7.55 s
Wall time: 6.94 s


In [11]:
# Build the tokenizer from the vocabulary file and casing flag that ship with
# the loaded BERT layer, so tokenization matches the pretrained weights.
resolved_bert = bert_layer.resolved_object
vocab_file = resolved_bert.vocab_file.asset_path.numpy()
do_lower_case = resolved_bert.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [12]:
# Clean the tweet texts of the labelled and the unlabelled (submission) data.
processed_train = preprocess(train.text.values)
processed_test = preprocess(test.text.values)

X = np.array(processed_train)
y = train.target.values

# Stratified split: 80 % train, 20 % hold-out ...
text_train, text_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
# ... and the hold-out is halved into validation and test (10 % each).
text_val, text_test, y_val, y_test = train_test_split(
    text_holdout, y_holdout, test_size=0.5, random_state=0, stratify=y_holdout
)

# Encode every text collection into (token ids, masks, segment ids).
X_train, X_val, X_test, submission_input = (
    bert_encode(texts, tokenizer, max_len=MAX_LEN)
    for texts in (text_train, text_val, text_test, processed_test)
)

## Model: Build and Train

In [13]:
# Build the classifier around the loaded BERT layer (input length MAX_LEN)
# and print its layer-by-layer summary.
model = build_model(bert_layer, max_len=MAX_LEN)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 108310273   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [14]:
%%time
checkpoint = ModelCheckpoint( str(project_dir / 'models/model.h5'), monitor='val_loss', save_best_only=True)

train_history = model.fit(
    X_train, y_train,
    validation_data= [X_val, y_val],
    epochs=3,
    callbacks=[checkpoint],
    batch_size=16
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 3min 39s, sys: 13.5 s, total: 3min 52s
Wall time: 4min 12s


## Model analysis

First, take a look at the classification report using the standard threshold of p=0.5.

In [73]:
# Evaluate on the validation split; every analysis cell below reads `data` / `targets`.
data, targets = X_val, y_val

In [74]:
from sklearn.metrics import classification_report

# Restore the checkpointed weights before predicting.
best_weights_path = str(project_dir / 'models/model.h5')
model.load_weights(best_weights_path)
bert_preds = model.predict(data)

# Rounding the predicted probabilities gives the hard 0/1 labels.
default_labels = bert_preds.round().astype(int)
print(classification_report(targets, default_labels))

              precision    recall  f1-score   support

           0       0.79      0.94      0.86       434
           1       0.89      0.67      0.76       327

    accuracy                           0.82       761
   macro avg       0.84      0.80      0.81       761
weighted avg       0.83      0.82      0.82       761



In [75]:
# Imported here because this is the first cell that uses confusion_matrix;
# without it the cell raises NameError when the notebook is run top to bottom.
from sklearn.metrics import confusion_matrix

# Row-normalised confusion matrix (each true class sums to 1) for the rounded predictions.
confusion_matrix(targets, bert_preds.round().astype(int), normalize="true")

array([[0.93778802, 0.06221198],
       [0.33333333, 0.66666667]])

In [76]:
from sklearn.metrics import precision_recall_curve

# Precision and recall for every candidate decision threshold on the evaluation split.
# `precisions` and `recalls` are sliced with [:-1] further down to line up with `thresholds`.
precisions, recalls, thresholds = precision_recall_curve(targets, bert_preds)

In [77]:
# One row per threshold; the last precision/recall entry is dropped so that
# all three columns have the same length.
pr_columns = ["thresholds", "precisions", "recalls"]
pr_values = np.column_stack([thresholds, precisions[:-1], recalls[:-1]])
df = pd.DataFrame(pr_values, columns=pr_columns)

In [98]:
import plotly.express as px

# Plot precision and recall (two lines) against the decision threshold.
fig = px.line(df,x=df["thresholds"], y = ["precisions", "recalls"], title = "Precision and Recall curves")
fig.show()

In [102]:
# Plot precision directly against recall (the PR curve).
fig = px.line(df,x=df["recalls"], y = df["precisions"], title = "Precision vs Recall")
fig.show()

In [79]:
# Pick the first (lowest) decision threshold whose precision reaches the target.
# The target is a hand-tuned 0.804 rather than a round 0.8; the intent is an
# operating point where precision and recall of the positive class are both ~0.8.
TARGET_PRECISION = 0.804

# `thresholds` must still be the array returned by precision_recall_curve:
# `precisions` carries one extra trailing entry that has no threshold.
assert len(thresholds) == len(precisions) - 1, "thresholds does not belong to the PR curve"
reaches_target = precisions[:-1] >= TARGET_PRECISION
# np.argmax would silently return index 0 if no entry qualified.
assert reaches_target.any(), f"no threshold reaches precision {TARGET_PRECISION}"

threshold_80_precision = thresholds[np.argmax(reaches_target)]
threshold_80_precision

0.29550633

In [80]:
# Print the classification report with the new threshold: a sample is labelled
# positive when its predicted probability is at least threshold_80_precision.
print(classification_report(targets, (bert_preds >= threshold_80_precision).astype(int)))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       434
           1       0.80      0.80      0.80       327

    accuracy                           0.83       761
   macro avg       0.83      0.82      0.83       761
weighted avg       0.83      0.83      0.83       761



In [81]:
# Only confusion_matrix is needed here. The previously imported
# plot_confusion_matrix was unused and no longer exists in recent scikit-learn
# releases, where importing it raises ImportError.
from sklearn.metrics import confusion_matrix

# Row-normalised confusion matrix at the tuned decision threshold.
confusion_matrix(targets, (bert_preds >= threshold_80_precision).astype(int), normalize="true")

array([[0.85483871, 0.14516129],
       [0.20489297, 0.79510703]])

In [99]:
from sklearn.metrics import roc_curve, roc_auc_score

# False/true positive rates over all decision thresholds.
# The ROC thresholds get their own name so they do not overwrite the
# precision-recall `thresholds` that earlier cells index into.
fpr, tpr, roc_thresholds = roc_curve(targets, bert_preds)

As a rule, we prefer the PR curve whenever the positive class is rare, or when we care more about false positives than false negatives (cf. Géron). In other cases, we can use the ROC curve.

In [100]:
import plotly.graph_objects as go

# ROC curve of the model on the evaluation split.
fig = px.line(x=fpr, y = tpr, title="ROC Curve", labels={ "x": "False positive rate", "y" : "True positive rate (recall)"})
# Reference diagonal of a random classifier, drawn from (0, 0) to (1, 1) so it
# spans the whole plot (a 0.01-step arange stops short at 0.99).
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], name='Random clf',
                         line = dict(color='grey', width=4, dash='dot')))
fig.show()

In [97]:
# Calculate the ROC AUC, i.e. the area under the ROC curve, for the predicted
# probabilities on the evaluation split (`data` / `targets`).
# Note that the ROC AUC score of a random classifier is 0.5,
# a perfect classifier has a score of 1.
roc_auc_score(targets, bert_preds)

0.8930086387914147