In [5]:
import numpy as np
import pandas as pd
import json
from utils import *
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

In [None]:
# Load the ground-truth labels from a JSON-lines file (one JSON object per line).
# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append() in a loop copies the whole frame on every iteration
# (quadratic) and no longer exists in pandas >= 2.0.
truth_columns = ['id', 'truthMedian', 'truthClass', 'truthMean']
truth_records = []
# JSON text is UTF-8; do not rely on the platform's default encoding.
with open('data/truth.jsonl', encoding='utf-8') as truth_file:
    for labelobj in truth_file:
        if not labelobj.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        truth = json.loads(labelobj)
        # Keep only the label fields used downstream; a missing key raises KeyError.
        truth_records.append({column: truth[column] for column in truth_columns})
truth_df = pd.DataFrame(truth_records, columns=truth_columns)
truth_df.head()

In [None]:
# Load the post instances from a JSON-lines file (one JSON object per line).
# Rows are accumulated in a list and converted to a DataFrame in one step:
# DataFrame.append() in a loop is quadratic and was removed in pandas 2.0.
instance_columns = ['id', 'postText']
instance_records = []
# JSON text is UTF-8; do not rely on the platform's default encoding.
with open('data/instances.jsonl', encoding='utf-8') as instances_file:
    for instanceobj in instances_file:
        if not instanceobj.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        instance = json.loads(instanceobj)
        # Keep only the fields used downstream; a missing key raises KeyError.
        instance_records.append({column: instance[column] for column in instance_columns})
instances_df = pd.DataFrame(instance_records, columns=instance_columns)
instances_df.head()

In [None]:
def _first_post(post_text):
    """Return the first entry of a list-valued ``postText`` cell.

    Strings are passed through unchanged so that re-running the cell on
    already-unwrapped data is harmless. Empty sequences and any other value
    (e.g. a missing cell) become ``None`` so the row can be dropped below.
    """
    if isinstance(post_text, str):
        return post_text
    if isinstance(post_text, (list, tuple)) and len(post_text) > 0:
        return post_text[0]
    return None


# Attach the labels to each post by id, then discard the id column.
dataset = (
    instances_df
    .join(truth_df.set_index('id'), on='id')
    .drop(labels='id', axis=1)
)
# Unwrap the post text with a column-level map instead of writing through
# `.values[i]`, which breaks under pandas copy-on-write and raises IndexError
# on an empty list.
dataset['postText'] = dataset['postText'].map(_first_post)
# Drop rows without text. The previous `dataset['postText'].dropna(inplace=True)`
# acted on a temporary Series and removed nothing from `dataset`.
dataset = dataset.dropna(subset=['postText']).reset_index(drop=True)
dataset.head()

In [None]:
# Column-wise clean-up, applied in this order with the helpers pulled in from utils:
#   truthClass  -> binary indicator for 'no-clickbait' / 'clickbait'
#   truthMedian -> integer instead of floating-point value
#   postText    -> punctuation removed and text cleaned
column_transforms = (
    ('truthClass', classToBinary),
    ('truthMedian', medianToInteger),
    ('postText', cleanText),
)
for column_name, transform in column_transforms:
    dataset[column_name] = dataset[column_name].apply(transform)
dataset.head()

In [None]:
# Split the dataset into a training set (80%) and a testing set (20%).
# A fixed random_state makes the split, and every metric reported below,
# reproducible across kernel restarts.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
# Features are the post texts; targets are the values of the 'truthClass' column.
X_train, Y_train = np.array(train["postText"].tolist()), np.array(train["truthClass"].tolist())
X_test, Y_test = np.array(test["postText"].tolist()), np.array(test["truthClass"].tolist())
print(X_train.shape)
print(X_test.shape)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer
import matplotlib.pyplot as plt

In [None]:
# Name of the BERT variant to fine-tune; it is the lookup key for both tables below.
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

# Model name -> TF Hub URL of the encoder.
map_name_to_handle = dict()
map_name_to_handle['small_bert/bert_en_uncased_L-4_H-512_A-8'] = (
    'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
)

# Model name -> TF Hub URL of the matching text-preprocessing model.
map_model_to_preprocess = dict()
map_model_to_preprocess['small_bert/bert_en_uncased_L-4_H-512_A-8'] = (
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1'
)

# Resolve both handles for the selected model (KeyError if the name is unknown).
tfhub_handle_encoder, tfhub_handle_preprocess = (
    map_name_to_handle[bert_model_name],
    map_model_to_preprocess[bert_model_name],
)

for report_line in (
    f'BERT model selected           : {tfhub_handle_encoder}',
    f'Preprocess model auto-selected: {tfhub_handle_preprocess}',
):
    print(report_line)

In [None]:
# Standalone copies of the two TF Hub layers, created from the handles resolved above.
# NOTE(review): `bert_model` is rebound to the Keras classifier by the later
# `bert_model = Bert_Model()` cell, so this encoder layer is only reachable
# until that cell runs.
bert_preprocess_model = hub.KerasLayer(handle=tfhub_handle_preprocess)
bert_model = hub.KerasLayer(handle=tfhub_handle_encoder)

In [None]:
def Bert_Model():
    """Assemble the text classifier built on the selected TF Hub BERT encoder.

    The graph takes a scalar string input named 'text', passes it through the
    TF Hub preprocessing layer and then the BERT encoder (created with
    ``trainable=True``), applies dropout with rate 0.1 to the encoder's
    'pooled_output', and finishes with a single-unit Dense layer that has no
    activation, so the model emits one raw logit per example.

    Returns:
        A Keras ``Model`` mapping the string input to the 'classifier' output.
    """
    raw_text = Input(shape=(), dtype=tf.string, name='text')

    # Raw strings -> encoder input dictionary.
    to_encoder_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    tokenized = to_encoder_inputs(raw_text)

    # Encoder inputs -> BERT outputs; only the pooled representation is used.
    bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    pooled = bert_encoder(tokenized)['pooled_output']

    # Classification head: dropout followed by a linear projection to one logit.
    regularised = Dropout(0.1)(pooled)
    logits = Dense(1, activation=None, name='classifier')(regularised)

    return Model(raw_text, logits)

In [None]:
# Build the classifier once and bind it to both names used in this notebook:
# the compile / fit / evaluate cells refer to `classifier_model`, which was
# previously never defined (NameError on a fresh kernel), while `bert_model`
# is kept as an alias so existing references keep working.
classifier_model = Bert_Model()
bert_model = classifier_model
bert_model.summary()

In [None]:
# The model outputs raw logits (the final Dense layer has no activation), so
# the loss must apply the sigmoid itself ...
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# ... and the accuracy metric must threshold at 0.0, the logit equivalent of a
# probability of 0.5. The default threshold of 0.5 would be applied to logits.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)
epochs = 5
# One optimizer step is taken per batch, not per sample. Model.fit() uses a
# batch size of 32 for array inputs unless told otherwise, so count batches
# (ceiling division) when sizing the learning-rate schedule. Using the sample
# count made the schedule ~32x too long, so training ended during warm-up.
batch_size = 32
steps_per_epoch = (X_train.shape[0] + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of all training steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# Attach the optimizer, loss and metric configured in the previous cell to the model.
compile_kwargs = {
    'optimizer': optimizer,
    'loss': loss,
    'metrics': metrics,
}
classifier_model.compile(**compile_kwargs)

In [None]:
# Fine-tune the classifier. The previous call passed `validation_data=val_ds`,
# but `val_ds` is not defined anywhere in this notebook, so the cell raised
# NameError on a fresh kernel. Holding out the last 10% of the training arrays
# gives the val_* curves in `history` without touching the test set.
history = classifier_model.fit(
    X_train,
    Y_train,
    validation_split=0.1,
    epochs=epochs,
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,roc_auc_score, mean_squared_error,classification_report
# Use dedicated names for the evaluation results: assigning to `loss` would
# overwrite the loss object that the compile cell passes to the model and
# break a re-run of that cell.
test_loss, test_accuracy = classifier_model.evaluate(X_test, Y_test)
print(f'Loss: {test_loss}')
print(f'Accuracy: {test_accuracy}')
Y_predict = classifier_model.predict(X_test)
# The classifier head has no activation, so predictions are logits; a logit
# of 0.0 corresponds to a probability of 0.5. Thresholding logits at 0.5
# (as before) mislabels positives whose logit lies between 0 and 0.5.
Y_predict_binary = [1 if logit >= 0.0 else 0 for logit in np.ravel(Y_predict)]
print(classification_report(Y_test, Y_predict_binary))

In [None]:
# history_dict = history.history
# print(history_dict.keys())

# acc = history_dict['binary_accuracy']
# val_acc = history_dict['val_binary_accuracy']
# loss = history_dict['loss']
# val_loss = history_dict['val_loss']

# epochs = range(1, len(acc) + 1)
# fig = plt.figure(figsize=(10, 6))
# fig.tight_layout()

# plt.subplot(2, 1, 1)
# # "r" is for "solid red line"
# plt.plot(epochs, loss, 'r', label='Training loss')
# # b is for "solid blue line"
# plt.plot(epochs, val_loss, 'b', label='Validation loss')
# plt.title('Training and validation loss')
# # plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()

# plt.subplot(2, 1, 2)
# plt.plot(epochs, acc, 'r', label='Training acc')
# plt.plot(epochs, val_acc, 'b', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(loc='lower right')