BERT TF script, Refactored:

In [None]:
# Progress Counter
import sys

class progressCounter():
  """Single-line console progress indicator for an index-based loop.

  Each time the integer percentage increases, the line
  "<name> Progress: <pct>%" is rewritten in place on stdout using a leading
  carriage return.
  """

  def __init__(self, num_iterations, name=''):
    """Set up the counter.

    Args:
      num_iterations: total number of loop iterations; indices passed to
        `check_pt` are expected to run from 0 to num_iterations - 1.
      name: optional label written in front of "Progress:".
    """
    self.progress = 0  # last percentage that was written out
    self.N = num_iterations
    self.name = name


  def check_pt(self, i):
    """Report progress for zero-based iteration index `i`.

    The percentage is `i` relative to the last index (N - 1), truncated to an
    int. Output is written only when that percentage is larger than the one
    last written, so 0% is never printed.

    Args:
      i: zero-based index of the iteration that just completed.
    """
    last_index = self.N - 1
    if last_index <= 0:
      # With zero or one iteration there is no range to divide by (the
      # original expression raised ZeroDivisionError for N == 1); any
      # checkpoint therefore means the work is complete.
      curr_progress = 100
    else:
      curr_progress = int(i / last_index * 100)

    if curr_progress - self.progress > 0:
      self.progress = curr_progress
      sys.stdout.write("\r{1} Progress: {0}%".format(self.progress, self.name))
      sys.stdout.flush()

In [None]:
# Instantiate BERT from Hub
import tensorflow as tf
import tensorflow_hub as hub

# Number of token positions in each of the three BERT input tensors.
max_seq_length = 512  # Your choice here.


def _make_bert_input(layer_name):
  """Return an int32 Keras Input of shape (max_seq_length,) named `layer_name`."""
  return tf.keras.layers.Input(
      shape=(max_seq_length,), dtype=tf.int32, name=layer_name)


input_word_ids = _make_bert_input("input_word_ids")
input_mask = _make_bert_input("input_mask")
segment_ids = _make_bert_input("segment_ids")

# The order of this list is the order in which the Hub layer receives its inputs.
bert_inputs = [input_word_ids, input_mask, segment_ids]

# Load the BERT encoder from TF-Hub with its weights left trainable.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2",
    trainable=True,
    name='bert_layer',
)
pooled_output, sequence_output = bert_layer(bert_inputs)


In [None]:
# Import and Build BERT Tokenizer

!pip install tf-models-official
import official.nlp.bert.tokenization as tokenization
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

import numpy as np
# Convert To BERT Tokens
def convertToBERTInputData(sentence, max_seq_len=512, bert_tokenizer=None):
  """Turn a text string into the three fixed-length input arrays fed to BERT.

  The text is tokenized, truncated so that it fits together with the
  '[CLS]' and '[SEP]' markers, converted to ids and right-padded with zeros
  up to `max_seq_len`.

  Args:
    sentence: text to encode.
    max_seq_len: length of every returned array. Defaults to 512, the value
      that was previously hard-coded.
    bert_tokenizer: object providing `tokenize(text)` and
      `convert_tokens_to_ids(tokens)`. When None, the notebook-level
      `tokenizer` is used, which is the original behaviour.

  Returns:
    Tuple `(sent_ids, sent_mask, sent_seg_ids)` of int32 NumPy arrays, each of
    shape `(max_seq_len,)`: token ids, a mask that is 1 on real tokens and 0
    on padding, and segment ids (all zeros).

  Raises:
    ValueError: if `max_seq_len` is smaller than 2, which leaves no room for
      the '[CLS]' and '[SEP]' markers.
  """
  if max_seq_len < 2:
    raise ValueError("max_seq_len must be at least 2 to hold [CLS] and [SEP]")
  if bert_tokenizer is None:
    bert_tokenizer = tokenizer

  # Keep two positions free for the [CLS] / [SEP] markers added below.
  body_tokens = bert_tokenizer.tokenize(sentence)[:max_seq_len - 2]
  tokens = ['[CLS]'] + list(body_tokens) + ['[SEP]']
  num_tokens = len(tokens)
  num_padding = max_seq_len - num_tokens

  sent_ids = list(bert_tokenizer.convert_tokens_to_ids(tokens)) + [0] * num_padding
  sent_mask = [1] * num_tokens + [0] * num_padding
  sent_seg_ids = [0] * max_seq_len

  sent_ids = np.array(sent_ids, dtype=np.int32)
  sent_mask = np.array(sent_mask, dtype=np.int32)
  sent_seg_ids = np.array(sent_seg_ids, dtype=np.int32)

  return ( sent_ids, sent_mask, sent_seg_ids )

Collecting tf-models-official
[?25l  Downloading https://files.pythonhosted.org/packages/5b/33/91e5e90e3e96292717245d3fe87eb3b35b07c8a2113f2da7f482040facdb/tf_models_official-2.3.0-py2.py3-none-any.whl (840kB)
[K     |▍                               | 10kB 24.0MB/s eta 0:00:01[K     |▉                               | 20kB 6.2MB/s eta 0:00:01[K     |█▏                              | 30kB 7.6MB/s eta 0:00:01[K     |█▋                              | 40kB 8.4MB/s eta 0:00:01[K     |██                              | 51kB 6.9MB/s eta 0:00:01[K     |██▍                             | 61kB 7.4MB/s eta 0:00:01[K     |██▊                             | 71kB 8.5MB/s eta 0:00:01[K     |███▏                            | 81kB 8.7MB/s eta 0:00:01[K     |███▌                            | 92kB 8.2MB/s eta 0:00:01[K     |████                            | 102kB 8.4MB/s eta 0:00:01[K     |████▎                           | 112kB 8.4MB/s eta 0:00:01[K     |████▊                       

Script Begin:

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Keyword arguments that are identical for the train and test requests.
_fetch_options = dict(
    data_home=None,
    categories=None,
    shuffle=True,
    random_state=42,
    remove=(),
    download_if_missing=True,
    return_X_y=False,
)

newsgroups_train = fetch_20newsgroups(subset='train', **_fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **_fetch_options)

Downloading 20news dataset. This may take a few minutes.
INFO:sklearn.datasets._twenty_newsgroups:Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
INFO:sklearn.datasets._twenty_newsgroups:Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Train Data:

In [None]:
# Pull the pieces of the training split used below: the documents, their
# label indices, and the list of label names.
data, targets, categories = (
    newsgroups_train.data,
    newsgroups_train.target,
    newsgroups_train.target_names,
)

In [None]:
# Create Feature Inputs -- X
data_len = len(data)
pC = progressCounter(data_len)

# One list per BERT input; entry k of each list belongs to document k.
bert_inputs_x1 = []  # token ids
bert_inputs_x2 = []  # input mask
bert_inputs_x3 = []  # segment ids

for doc_index, document in enumerate(data):
  word_ids, mask, seg_ids = convertToBERTInputData(document)
  bert_inputs_x1.append(word_ids)
  bert_inputs_x2.append(mask)
  bert_inputs_x3.append(seg_ids)
  pC.check_pt(doc_index)


 Progress: 100%

In [None]:
import numpy as np
# Create Feature Outputs -- Y
data_len = len(targets)

# Build one 20-element one-hot int32 vector per sample, with the 1 placed at
# the position given by that sample's label.
expected_output = []
for label in targets:
  one_hot = [0] * 20
  one_hot[label] = 1
  expected_output.append(np.array(one_hot, dtype=np.int32))

expected_output_y = expected_output # <----------- y value

Create TF -- BERT Model:

In [None]:
from official import nlp

# Classification head on top of BERT's pooled output: two ReLU hidden layers
# followed by a 20-unit output layer.
classifier_layer_1 = tf.keras.layers.Dense(512, name='classification_layer_1',activation='relu')(pooled_output)
classifier_layer_2 = tf.keras.layers.Dense(256, name='classification_layer_2',activation='relu')(classifier_layer_1)
# Softmax rather than sigmoid: the targets are one-hot (exactly one class per
# sample) and categorical_crossentropy, used without from_logits, expects the
# outputs to form a probability distribution over the classes.
predictions = tf.keras.layers.Dense(20, name='classification_layer_3',activation='softmax')(classifier_layer_2)

model = tf.keras.models.Model(inputs=bert_inputs, outputs=predictions, name='Bert_Tensorflow')
# bert_layer was loaded with trainable=True, so the whole encoder is being
# fine-tuned. Use a small learning rate instead of Adam's default (1e-3), which
# is far larger than the rates commonly used for BERT fine-tuning.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)


In [None]:
# Print the layer table (output shapes, parameter counts, connections) of the model.
model.summary()

Model: "Bert_Tensorflow"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert_layer (KerasLayer)         [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]   

In [None]:
def _as_int32_matrix(rows, width):
  """Stack `rows` into an int32 array with `width` columns.

  The row count is inferred (-1) instead of being hard-coded to the size of
  one particular dataset, so the cell keeps working when the number of
  samples changes.
  """
  return np.reshape(np.array(rows, dtype=np.int32), (-1, width))


# Targets: one 20-element one-hot row per sample.
expected_output_y = _as_int32_matrix(expected_output_y, 20)
print(expected_output_y.shape)

# Features: one 512-element row per sample for each of the three BERT inputs.
bert_inputs_x1 = _as_int32_matrix(bert_inputs_x1, 512)

bert_inputs_x2 = _as_int32_matrix(bert_inputs_x2, 512)

bert_inputs_x3 = _as_int32_matrix(bert_inputs_x3, 512)

print(bert_inputs_x1.shape)

(11314, 20)
(11314, 512)


In [None]:
# Inputs and targets are keyed by the names of the model's input and output layers.
train_features = {
    'input_word_ids' : bert_inputs_x1,
    'input_mask' : bert_inputs_x2,
    'segment_ids' : bert_inputs_x3,
}
train_targets = {'classification_layer_3' : expected_output_y}

model.fit(train_features, train_targets, verbose=1, batch_size=1)

