(based on https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22)

# Kaggle Disaster Tweets Challenge
## BERT Embeddings with TensorFlow 2.0 + Random Forest

https://www.kaggle.com/c/nlp-getting-started

With the new release of TensorFlow, this Notebook aims to show a simple use of the BERT model.
- See BERT on paper: https://arxiv.org/pdf/1810.04805.pdf
- See BERT on GitHub: https://github.com/google-research/bert
- See BERT on TensorHub: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1
- See 'old' use of BERT for comparison: https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

## Update TF
We need Tensorflow 2.0 and TensorHub 0.7 for this Colab

In [2]:
#!pip install tensorflow==2.0
#!pip install tensorflow_hub==0.7
#!pip install bert-for-tf2
#!pip install sentencepiece
#!pip install pandas



In [3]:
import tensorflow as tf
import tensorflow_hub as hub
print("TF version: ", tf.__version__)
print("Hub version: ", hub.__version__)

TF version:  2.0.0
Hub version:  0.7.0


If TensorFlow Hub is not 0.7 yet on release, use dev:



In [3]:
### !pip install tf-hub-nightly

[33mYou are using pip version 19.0.3, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [4]:
# hub.__version__

'0.8.0.dev'

## Import modules

In [1]:
import tensorflow_hub as hub
import tensorflow as tf
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math

Building model using tf.keras and hub. from sentences to embeddings.

Inputs:
 - input token ids (tokenizer converts tokens using vocab file)
 - input masks (1 for useful tokens, 0 for padding)
 - segment ids (for 2 text training: 0 for the first one, 1 for the second one)

Outputs:
 - pooled_output of shape `[batch_size, 768]` with representations for the entire input sequences
 - sequence_output of shape `[batch_size, max_seq_length, 768]` with representations for each input token (in context)

In [2]:
max_seq_length = 128  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [3]:
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])

Generating segments and masks based on the original BERT

In [4]:
# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

Import tokenizer using the original vocab file

In [5]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

## Prepare data

In [6]:
import pandas as pd

In [7]:
train = pd.read_csv("./data/input/train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
len(train)

7613

In [9]:
def tokenize(text):
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    return tokens


In [10]:
def process_tokens(text):
    #TODO tags to separate column
    #TODO strip hash from tags in text
    return text

In [14]:
def vectorize(text):
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    
    input_ids = get_ids(tokens, tokenizer, max_seq_length)
    input_masks = get_masks(tokens, max_seq_length)
    input_segments = get_segments(tokens, max_seq_length)
    
    #print('tokens')
    #print(tokens)
    #print('input_ids')
    #print(input_ids)
    #print('input_masks')
    #print(input_masks)
    #print('input_segments')
    #print(input_segments)
    
    pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])
    
    #print('pool_embs')
    #print(pool_embs)
    
    return pool_embs[0]
    
    

In [15]:
#train["tokenized"] = train["text"].map(tokenize)
train_small = train.sample(50)

In [16]:
train_small["vectorized"] = train_small["text"].map(vectorize)

In [17]:
train_small

Unnamed: 0,id,keyword,location,text,target,vectorized
1866,2682,crush,,Seriously have the biggest girl crush ever on ...,0,"[-0.8701796, -0.2937034, 0.064057834, 0.674443..."
3923,5579,flood,New York,12' 72W CREE LED Work Light Bar Alloy Spot Flo...,0,"[-0.7649183, -0.6727237, -0.97098035, 0.736852..."
765,1106,blew%20up,Florida,@iphooey @TIME Ironically Michele Bachmann bro...,0,"[-0.714161, -0.29003528, -0.53335494, 0.405214..."
6408,9161,suicide%20bomber,"19.600858, -99.047821",Mosque bombing strikes Saudi special forces; a...,1,"[-0.7877648, -0.5819566, -0.9204984, 0.6665516..."
5082,7248,nuclear%20disaster,Austin TX,Alarming Rise in Dead Marine Life Since the #F...,1,"[-0.9089203, -0.743991, -0.99118143, 0.8479957..."
7040,10087,typhoon,"Wilmington, Delaware",Map: Typhoon Soudelor's predicted path as it a...,1,"[-0.7030923, -0.50600356, -0.9014137, 0.522852..."
2584,3708,destroyed,,@alanhahn @HDumpty39 Daughtery would get destr...,0,"[-0.6311881, -0.42598, -0.86701566, 0.41932264..."
3480,4975,explosion,S.F. Bay area,MORE--&gt;OSHA officers on siteinvestigating N...,1,"[-0.8061329, -0.5781096, -0.9546388, 0.6799465..."
1625,2348,collapse,"Pompano Beach, FL",Growth dries up for BHP Billiton as oil price ...,0,"[-0.8894482, -0.6992881, -0.9840752, 0.8223380..."
2137,3068,deaths,"Atlanta, GA",Hear @DrFriedenCDC talk on how to avoid thousa...,0,"[-0.7241185, -0.5487765, -0.9166135, 0.5508809..."


In [18]:
X = train_small["vectorized"]

In [19]:
y = train_small["target"]

## Train classifier

In [50]:
!pip install sklearn

Processing /Users/ivp/Library/Caches/pip/wheels/76/03/bb/589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074/sklearn-0.0-py2.py3-none-any.whl
Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/83/ff/d8e912e96aa47abc4ffb02bb3d05eaee45c14b74d02f0abf22b97d83a888/scikit_learn-0.22-cp36-cp36m-macosx_10_6_intel.whl (11.1MB)
[K     |████████████████████████████████| 11.1MB 1.9MB/s eta 0:00:01
Collecting joblib>=0.11
  Using cached https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl
Collecting scipy>=0.17.0
[?25l  Downloading https://files.pythonhosted.org/packages/96/94/cd76305a69fff844e83655ed7b254835df4eddd5fc0e2d0eb2914501b36e/scipy-1.4.1-cp36-cp36m-macosx_10_6_intel.whl (28.5MB)
[K     |████████████████████████████████| 28.5MB 971kB/s eta 0:00:01    |█▋                              | 1.5MB 1.1MB/s eta 0:00:24     |██████████▌                     | 9.3MB 1.9MB/s eta 

In [21]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [22]:
np.random.seed(0)

In [23]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42) #TODO params

In [158]:
num_features = len(X.values[0])
num_observations = len(X.values)

In [24]:
X1 = np.vstack(X.values)

In [25]:
y1 = np.array(y)

(50,)

In [26]:
clf.fit(X1, y1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [27]:
from sklearn.metrics import classification_report

classification_report(y1, clf.predict(X1))

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        24\n           1       1.00      1.00      1.00        26\n\n    accuracy                           1.00        50\n   macro avg       1.00      1.00      1.00        50\nweighted avg       1.00      1.00      1.00        50\n'