# 0 - Loading

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import datetime

import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Single seed shared by every RNG seeded in this notebook.
RANDOM_SEED = 42

# Seed NumPy's legacy global RNG and TensorFlow's global RNG with the same value.
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [2]:
# Load the preprocessed corpus. The context manager guarantees the handle is
# closed even when json.load raises (the manual open/close pair leaked it).
with open("preprocessed_text.json", mode="r", encoding="utf-8") as f:
    d = json.load(f)

# Wrap the parsed JSON object in a DataFrame for the cells below.
d = pd.DataFrame(d)

In [3]:
# Pull out the "premises" column; the next cell joins each entry into one string.
make_into_sentences = d["premises"]

In [4]:
# Join every premise's items with single spaces, one string per row,
# while tqdm reports progress over the column.
sentences = [" ".join(premise_tokens) for premise_tokens in tqdm(make_into_sentences)]

100%|██████████████████████████████████████████████████████████████████████| 387692/387692 [00:02<00:00, 152263.51it/s]


In [5]:
# Turn the joined strings into a Series named after the column it replaces.
sentences = pd.Series(sentences, name="premises")

# Drop the original "premises" column, then append the joined-string version
# as the last column of `result`.
d = d.drop(columns=["premises"])
result = pd.concat([d, sentences], axis=1)

In [7]:
# Preview the first rows of the rebuilt frame.
result.head()

Unnamed: 0,stances,premises
0,0,opponent forfeited every round none argument a...
1,0,propose school fund program condom cost money ...
2,0,school compelling interest providing contracep...
3,1,senior school group focusing teenage pregnancy...
4,0,resolution used pro assumes australia isnt alr...


# 1 - Data Handling

In [2]:
# Directory name of the BERT checkpoint to use.
bert_model_name="uncased_L-12_H-768_A-12"

# Paths to the checkpoint and its JSON config, both under BERT/<model name>.
# NOTE: needs `os` from the import cell, so that cell must be run first.
bert_ckpt_dir = os.path.join("BERT/", bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

NameError: name 'os' is not defined

In [None]:
class SentimentAnalysisData:
    """Tokenise, encode and pad a train/test pair of DataFrames for BERT.

    Each frame must provide a text column (``DATA_COLUMN``) and a label column
    (``LABEL_COLUMN``). After construction the instance exposes:

    * ``train_x`` / ``test_x`` -- 2-D arrays of token ids, one row per text,
      every row exactly ``max_seq_len`` long (zero-padded on the right).
    * ``train_y`` / ``test_y`` -- 1-D arrays holding each label's position
      in ``classes``.
    * ``max_seq_len`` -- the fixed row length, equal to the constructor argument.

    Rows are ordered by text length (shortest first), so the arrays do NOT
    follow the row order of the frames that were passed in.
    """

    DATA_COLUMN = "premises"
    LABEL_COLUMN = "stances"

    def __init__(self, train, test, tokenizer: FullTokenizer, classes, max_seq_len=128):
        """Encode ``train`` and ``test`` with ``tokenizer``.

        Args:
            train: DataFrame with the ``DATA_COLUMN`` and ``LABEL_COLUMN`` columns.
            test: DataFrame with the same two columns.
            tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
            classes: list of label values; a label's index in it becomes its target.
            max_seq_len: length of every encoded row, ``[CLS]``/``[SEP]`` included.
        """
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.classes = classes

        # Reorder both frames by the character length of their text, shortest first.
        train, test = (
            frame.reindex(frame[SentimentAnalysisData.DATA_COLUMN].str.len().sort_values().index)
            for frame in (train, test)
        )

        ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        # self.max_seq_len is never changed after the assignment above, so the
        # former `min(self.max_seq_len, max_seq_len)` step was a no-op and is gone.
        print("max seq_len", self.max_seq_len)
        self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, result):
        """Convert every row of ``result`` into token ids and a class index.

        Returns:
            ``(x, y)`` where ``x`` is a plain list of variable-length id lists
            and ``y`` is a 1-D array of class indices.

        Raises:
            ValueError: if a label is not present in ``self.classes``.
        """
        x, y = [], []
        # Leave room for [CLS] and [SEP] so that truncating a long text never
        # drops the closing [SEP] marker.
        body_budget = max(self.max_seq_len - 2, 0)

        for _, row in tqdm(result.iterrows()):
            text, label = row[SentimentAnalysisData.DATA_COLUMN], row[SentimentAnalysisData.LABEL_COLUMN]
            tokens = self.tokenizer.tokenize(text)[:body_budget]
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            x.append(token_ids)
            y.append(self.classes.index(label))

        # x stays a list of lists: np.array() on ragged rows raises on NumPy >= 1.24,
        # and on equal-length rows it builds a 2-D array that breaks the
        # list concatenation used for padding.
        return x, np.array(y)

    def _pad(self, ids):
        """Right-pad (or cut) every id sequence to exactly ``self.max_seq_len``.

        Args:
            ids: iterable of id sequences (lists or 1-D arrays).

        Returns:
            2-D array with one padded row per input sequence.
        """
        x = []
        for input_ids in ids:
            # list() makes `+` below a concatenation even if a row arrives as an ndarray.
            input_ids = list(input_ids)[:self.max_seq_len]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
        return np.array(x)

In [None]:
def create_model(max_seq_len, bert_ckpt_file):
    """Assemble a BERT-based classifier and load the pre-trained BERT weights.

    The network feeds the first position of the BERT output through
    Dropout -> Dense(768, relu) -> Dropout -> Dense(len(classes), softmax).

    Relies on two notebook globals that must exist when this is called:
    ``bert_config_file`` (path of the stock BERT JSON config) and ``classes``
    (its length sets the width of the output layer).

    Args:
        max_seq_len: number of token ids per input row.
        bert_ckpt_file: checkpoint path handed to ``load_stock_weights``.

    Returns:
        The built ``keras.Model`` (not compiled).
    """
    # Read the stock config; the file is only needed while it is being parsed.
    with tf.io.gfile.GFile(bert_config_file, "r") as config_reader:
        stock_config = StockBertConfig.from_json_string(config_reader.read())

    layer_params = map_stock_config_to_params(stock_config)
    layer_params.adapter_size = None
    # Named bert_layer so the imported `bert` module is not shadowed locally.
    bert_layer = BertModelLayer.from_params(layer_params, name="bert")

    token_input = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
    sequence_output = bert_layer(token_input)

    # Keep only position 0 of every sequence (where [CLS] is placed by the encoder).
    hidden = keras.layers.Lambda(lambda seq: seq[:, 0, :])(sequence_output)
    hidden = keras.layers.Dropout(0.5)(hidden)
    hidden = keras.layers.Dense(units=768, activation="relu")(hidden)
    hidden = keras.layers.Dropout(0.5)(hidden)
    class_probs = keras.layers.Dense(units=len(classes), activation="softmax")(hidden)

    model = keras.Model(inputs=token_input, outputs=class_probs)
    model.build(input_shape=(None, max_seq_len))

    # Weights are loaded only after the model has been built.
    load_stock_weights(bert_layer, bert_ckpt_file)

    return model

# 2 - Training

In [None]:
# Label values; a label's position in this list is its training target, and
# the list's length sets the width of the model's output layer.
classes = [0, 1]

In [None]:
# Encode the first 50,000 rows of each split into fixed-length token-id arrays.
# NOTE(review): `train`, `test` and `tokenizer` are not defined in any visible
# cell -- presumably a train_test_split of `result` and a FullTokenizer built
# from the checkpoint's vocab file; confirm and restore those cells.
data = SentimentAnalysisData(train.iloc[0:50000], test.iloc[0:50000], tokenizer, classes, max_seq_len=128)

In [None]:
# Inspect the shape of the encoded training inputs.
data.train_x.shape

In [None]:
# Build the classifier for the sequence length stored on `data` and load the
# BERT weights from the checkpoint file.
model = create_model(data.max_seq_len, bert_ckpt_file)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# create_model's last layer already applies softmax, so the loss receives
# probabilities. from_logits must therefore be False; with True the loss would
# apply a second softmax on top of the model's own.
model.compile(
  optimizer=keras.optimizers.Adam(1e-5),
  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [None]:
#log_dir = "BERT/log/" + "test"#datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
#tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# Keras carves the validation set from the LAST `validation_split` fraction of
# the arrays, before any shuffling. SentimentAnalysisData orders rows by text
# length, so without this permutation validation would hold only the longest
# texts. The seeded permutation keeps the split reproducible.
shuffle_idx = np.random.RandomState(RANDOM_SEED).permutation(len(data.train_x))

history = model.fit(
  x=data.train_x[shuffle_idx],
  y=data.train_y[shuffle_idx],
  validation_split=0.1,
  batch_size=16,
  shuffle=True,
  epochs=5,
  #callbacks=[tensorboard_callback]
)