In [1]:
# A dependency of the preprocessing for BERT inputs
!pip install -q tensorflow-text

In [2]:
!pip install -q tf-models-official

In [3]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # not referenced by name; dependency of the BERT preprocessing model
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.model_selection import train_test_split

from utils import *


 The versions of TensorFlow you are currently using is 2.4.0-rc4 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [4]:
# Load the training split and rename the JSON fields to the column names used
# in the rest of the notebook.
# All rows are collected first and the DataFrame is built in a single call:
# DataFrame.append copied the whole frame for every row (quadratic time) and
# was removed in pandas 2.0.
with open('data/train.json') as json_data:
    data = json.load(json_data)
train_df = pd.DataFrame(
    [
        {
            'id': instance['text_id'],
            'postText': instance['post_text'],
            'truthMedian': instance['truth_median'],
            'truthClass': instance['click_bait'],
            'truthMean': instance['truth_mean'],
        }
        for instance in data
    ],
    # Explicit columns keep the schema stable even if the file has no records.
    columns=['id', 'postText', 'truthMedian', 'truthClass', 'truthMean'],
)
train_df.head()

Unnamed: 0,id,postText,truthMedian,truthClass,truthMean
0,849630396191055872,"Flying mattress strikes biker at 50 mph, ends ...",0.0,0,0.2
1,805003400136036352,The Grand Tour: Jeremy Clarkson is really taun...,0.0,0,0.2
2,811134324628226048,Was this murderous attack on a Russian diploma...,0.666667,1,0.533333
3,807681539580502016,"PartyNextDoor brings out Drake, G-Eazy and mor...",0.0,0,0.266667
4,807465142032232450,A far-right Dutch lawmaker who wants to Make t...,0.0,0,0.2


In [5]:
# Load the test split and rename the JSON fields to the column names used in
# the rest of the notebook.
# All rows are collected first and the DataFrame is built in a single call:
# DataFrame.append copied the whole frame for every row (quadratic time) and
# was removed in pandas 2.0.
with open('data/test.json') as json_data:
    data = json.load(json_data)
test_df = pd.DataFrame(
    [
        {
            'id': instance['text_id'],
            'postText': instance['post_text'],
            'truthMedian': instance['truth_median'],
            'truthClass': instance['click_bait'],
            'truthMean': instance['truth_mean'],
        }
        for instance in data
    ],
    # Explicit columns keep the schema stable even if the file has no records.
    columns=['id', 'postText', 'truthMedian', 'truthClass', 'truthMean'],
)
test_df.head()

Unnamed: 0,id,postText,truthMedian,truthClass,truthMean
0,841383755944189952,"John Lewis blasts GOP rep's ""bigoted and racis...",0.0,0,0.2
1,852354010820620288,The NBA playoff bracket is now set,0.0,0,0.2
2,838728651545903104,Migrants smash up German asylum centre and sev...,0.0,0,0.133333
3,805465449739718656,"America's most livable states, ranked",0.666667,1,0.666667
4,841713776382627840,WIP or RIP?,1.0,1,0.933333


In [6]:
# Run every training post through cleanText (punctuation removal / text
# clean-up) and store the result back in the postText column.
cleaned_train_posts = train_df['postText'].apply(cleanText)
train_df['postText'] = cleaned_train_posts
train_df.head()

Unnamed: 0,id,postText,truthMedian,truthClass,truthMean
0,849630396191055872,flying mattress strikes biker at 50 mph ends u...,0.0,0,0.2
1,805003400136036352,the grand tour jeremy clarkson is really taunt...,0.0,0,0.2
2,811134324628226048,was this murderous attack on a russian diploma...,0.666667,1,0.533333
3,807681539580502016,partynextdoor brings out drake geazy and more ...,0.0,0,0.266667
4,807465142032232450,a farright dutch lawmaker who wants to make th...,0.0,0,0.2


In [7]:
# Run every test post through cleanText (punctuation removal / text
# clean-up) and store the result back in the postText column.
cleaned_test_posts = test_df['postText'].apply(cleanText)
test_df['postText'] = cleaned_test_posts
test_df.head()

Unnamed: 0,id,postText,truthMedian,truthClass,truthMean
0,841383755944189952,john lewis blasts gop reps bigoted and racist ...,0.0,0,0.2
1,852354010820620288,the nba playoff bracket is now set,0.0,0,0.2
2,838728651545903104,migrants smash up german asylum centre and sev...,0.0,0,0.133333
3,805465449739718656,americas most livable states ranked,0.666667,1,0.666667
4,841713776382627840,wip or rip,1.0,1,0.933333


In [8]:
# Turn the text and label columns of each split into NumPy arrays.
X_train = np.array(train_df["postText"].tolist())
Y_train = np.array(train_df["truthClass"].tolist())

X_test = np.array(test_df["postText"].tolist())
Y_test = np.array(test_df["truthClass"].tolist())

# Sanity check: number of examples in each split.
print(X_train.shape)
print(X_test.shape)

(15567,)
(3892,)


In [9]:
# Peek at the first ten test posts and their labels.
print(X_test[:10])
print(Y_test[:10])

['john lewis blasts gop reps bigoted and racist comments'
 'the nba playoff bracket is now set'
 'migrants smash up german asylum centre and severely injure a police officer after riot breaks out'
 'americas most livable states ranked' 'wip or rip'
 'trump tries to reassure supporters after health care humiliation'
 'theyre filming the love actually reunion already and the turtlenecks are back'
 'trump team hints at scorched earth policy toward climate science'
 'steve sarkisian hired as alabama offensive coordinator and qb coach'
 'all hail who went from mixtape mc to mastermind with 2 back to back albums']
[0 0 0 1 1 0 0 0 0 0]


In [10]:
#@title Choose a BERT model to fine-tune

bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

# Encoder handle for each supported model name.
# Only one model is mapped here; choosing any other name from the form above
# raises KeyError in the lookups below.
map_name_to_handle = {
    'small_bert/bert_en_uncased_L-4_H-512_A-8': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
}

# Matching preprocessing-model handle for each supported model name.
map_model_to_preprocess = {
    'small_bert/bert_en_uncased_L-4_H-512_A-8': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1',
}

# Resolve both handles for the selected model (encoder first, as before).
tfhub_handle_encoder, tfhub_handle_preprocess = (
    map_name_to_handle[bert_model_name],
    map_model_to_preprocess[bert_model_name],
)

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1


In [11]:
# Standalone copy of the preprocessing model, handy for inspecting its output.
bert_preprocess_model = hub.KerasLayer(handle=tfhub_handle_preprocess)

In [12]:
# text_test = ['this is such an amazing movie!']
# text_preprocessed = bert_preprocess_model(text_test)

# print(f'Keys       : {list(text_preprocessed.keys())}')
# print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
# print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
# print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
# print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

In [13]:
# Standalone copy of the encoder, handy for inspecting its raw outputs.
bert_model = hub.KerasLayer(handle=tfhub_handle_encoder)

In [14]:
# bert_results = bert_model(text_preprocessed)

# print(f'Loaded BERT: {tfhub_handle_encoder}')
# print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
# print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
# print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
# print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

In [21]:
# Use the Keras bundled with TensorFlow: hub.KerasLayer is a tf.keras layer,
# so the model should be assembled with tf.keras classes rather than the
# standalone `keras` package.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout
def build_classifier_model():
    """Build the end-to-end classifier: raw strings in, one logit out.

    The model chains the TF-Hub preprocessing layer, the trainable BERT
    encoder, dropout on the encoder's pooled output, and a single-unit dense
    head.

    Returns:
        A Keras ``Model`` whose input is a batch of strings (named ``text``)
        and whose output is the ``classifier`` layer. That layer has no
        activation, so outputs are unnormalised logits; pair the model with a
        ``from_logits=True`` loss and apply a sigmoid to get probabilities.
    """
    text_input = Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True fine-tunes the encoder weights along with the new head.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = Dropout(0.1)(net)
    net = Dense(1, activation=None, name='classifier')(net)
    return Model(text_input, net)

In [22]:
# Instantiate the model; this loads the preprocessing and encoder layers from
# the TF-Hub handles selected above.
bert_classifier = build_classifier_model()









In [23]:
# Print the layer-by-layer structure and parameter counts of the classifier.
bert_classifier.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_type_ids': ( 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'encoder_outputs':  28763649    preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
____________________________________________________________________________________________

In [24]:
# The schedule lengths given to create_optimizer are counted in optimizer
# steps (one per batch), not in samples. They are therefore derived from the
# same batch size, validation split and epoch count that the fit() call uses;
# keep these three values in sync with that call.
batch_size = 64
validation_split = 0.2
epochs = 10

# fit(validation_split=...) holds part of X_train out for validation, so only
# the remaining examples produce training steps.
num_train_examples = int(X_train.shape[0] * (1 - validation_split))
steps_per_epoch = (num_train_examples + batch_size - 1) // batch_size  # ceiling division
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)  # warm up over the first 10% of steps

# NOTE(review): 3e-4 is high compared with the learning rates usually quoted
# for BERT fine-tuning (around 2e-5 to 5e-5) — confirm this is intentional.
init_lr = 3e-4
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [25]:
# Use the backend bundled with TensorFlow so the metric runs on the same Keras
# implementation as the model.
from tensorflow.keras import backend as K
def get_f1(y_true, y_pred): #taken from old keras source code
    """Batch-wise F1 score for a binary classifier that outputs raw logits.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Raw model outputs (logits). The classifier head in this
            notebook has no activation and the loss uses ``from_logits=True``,
            so the values are passed through a sigmoid here before being
            thresholded.

    Returns:
        Scalar tensor with the F1 score of the batch. ``K.epsilon()`` in the
        denominators keeps the result finite when a batch has no positives.
    """
    y_true = K.cast(y_true, K.dtype(y_pred))
    # Without the sigmoid, rounding would threshold the *logit* at 0.5 instead
    # of thresholding the predicted probability at 0.5.
    y_pred = K.sigmoid(y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [26]:
# The classifier outputs raw logits, so the loss is told to apply the sigmoid.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Binary accuracy must threshold logits at 0.0 (probability 0.5); the plain
# 'accuracy' string would threshold them at 0.5. The metric keeps the name
# 'accuracy' so the history keys ('accuracy' / 'val_accuracy') are unchanged.
metrics = [get_f1, tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
# Stop training once val_loss has not improved for 3 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [27]:
# Attach the loss, optimizer and metrics defined above to the model.
bert_classifier.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=metrics,
)

In [28]:
print(f'Training model with {tfhub_handle_encoder}')
# Fine-tune on the training split; 20% of it is held out for validation and
# the early-stopping callback watches that validation split.
history = bert_classifier.fit(
    x=X_train,
    y=Y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    shuffle=True,
    callbacks=[callback],
)

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/10


InvalidArgumentError:  Trying to access resource using the wrong type. Expected class tensorflow::lookup::LookupInterface got class tensorflow::lookup::LookupInterface
	 [[{{node model_1/preprocessing/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/bert_tokenizer/StatefulPartitionedCall/WordpieceTokenizeWithOffsets/WordpieceTokenizeWithOffsets/WordpieceTokenizeWithOffsets}}]] [Op:__inference_train_function_104461]

Function call stack:
train_function


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
# evaluate() returns the loss followed by the compiled metrics in order.
# The results get their own names so the `loss` object and the `get_f1`
# metric function are not overwritten with floats (which would break a re-run
# of the compile cell).
test_loss, test_f1, test_accuracy = bert_classifier.evaluate(X_test, Y_test)
print(f'Loss: {test_loss}')
print(f'get_f1: {test_f1}')
print(f'Accuracy: {test_accuracy}')
# The classifier head has no activation, so predict() returns logits; convert
# them to probabilities before applying the 0.5 decision threshold.
Y_predict = bert_classifier.predict(X_test)
Y_predict_proba = tf.sigmoid(Y_predict).numpy()
Y_predict_binary = [1 if p >= 0.5 else 0 for [p] in Y_predict_proba]
print("Bert Testing report")
print(classification_report(Y_test, Y_predict_binary))


In [None]:
history_dict = history.history
print(history_dict.keys())

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
# F1 curves get their own names: storing them in `loss` / `val_loss` would
# overwrite the loss object that the compile cell uses.
f1 = history_dict['get_f1']
val_f1 = history_dict['val_get_f1']

# One x position per completed epoch. A separate name keeps the `epochs`
# setting defined earlier in the notebook intact.
epoch_axis = range(1, len(acc) + 1)
fig, (ax_f1, ax_acc) = plt.subplots(2, 1, figsize=(10, 6))

# Top panel: F1 (red = training, blue = validation).
ax_f1.plot(epoch_axis, f1, 'r', label='Training F1')
ax_f1.plot(epoch_axis, val_f1, 'b', label='Validation F1')
ax_f1.set_title('Training and validation F1')
ax_f1.set_ylabel('F1')
ax_f1.legend()

# Bottom panel: accuracy (red = training, blue = validation).
ax_acc.plot(epoch_axis, acc, 'r', label='Training acc')
ax_acc.plot(epoch_axis, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(loc='lower right')

# Called after the axes are populated so it can actually adjust the spacing.
fig.tight_layout()

In [None]:
# Persist the fine-tuned classifier to a single HDF5 file in the working directory.
# NOTE(review): the model contains hub.KerasLayer layers and was compiled with
# a custom metric (get_f1) and optimizer; loading this file will presumably
# need matching custom_objects — confirm before relying on it.
bert_classifier.save("BERT_model.h5")