In [2]:
import os
# os.chdir('/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus')

In [3]:
import joblib
from tensorflow import keras
import json
from typing import List, Tuple
from transformers import AutoTokenizer, BertTokenizer, TFBertForTokenClassification, BertConfig
import tensorflow as tf
from src.training import bert_trainer
import numpy as np

In [4]:
# Switch the working directory to the project root; the relative 'resources/...'
# paths used by later cells are resolved against it.
# The root can be overridden with the CHATOPOTAMUS_ROOT environment variable so
# the notebook is not tied to one machine; the original path is kept as the
# fallback so existing setups behave exactly as before.
# NOTE(review): `from src.training import bert_trainer` runs in an earlier cell
# than this chdir; on a fresh kernel that import presumably only resolves if the
# kernel was already started in the project root -- confirm.
PROJECT_ROOT = os.environ.get(
    'CHATOPOTAMUS_ROOT',
    'C:\\Users\\thory\\PycharmProjects\\chatopotamus',
)
os.chdir(PROJECT_ROOT)

**Load the Tokenizer and Dataset**

In [5]:
# Load the pre-trained fast tokenizer for 'bert-base-uncased'.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)

# Load the NER dataset and the NER lookup JSON files (paths are relative to the
# current working directory). The encoding is pinned to UTF-8 so that parsing
# does not depend on the platform's locale default (e.g. cp1252 on Windows).
with open('resources/bert/data/ner_dataset.json', 'r', encoding='utf-8') as file:
    ner_dataset_json = json.load(file)

with open('resources/bert/data/ner_lookup.json', 'r', encoding='utf-8') as file:
    ner_lookup = json.load(file)

**Convert dataset into IOB tagged data**

In [6]:
# Convert the loaded dataset into IOB-tagged samples using the lookup JSON.
# As unpacked by later cells, each item is a (sentence, tags) pair.
iob_data = bert_trainer.convert_to_IOB(ner_dataset_json, ner_lookup)

In [7]:
# Sanity check: display one converted sample (index 2).
iob_data[2]

('give me a hot chocolate no whipped topping and a grilled cheese sandwich',
 ['O',
  'O',
  'O',
  'B-beverage',
  'I-beverage',
  'O',
  'B-beverage_modifier',
  'I-beverage_modifier',
  'O',
  'O',
  'B-food',
  'I-food',
  'O'])

**Tokenization and Handling Subword Tokens**

In [8]:
# Align each sentence's tags with the tokenizer's (sub)tokens, one result per sample.
final_data = [
    bert_trainer.align_tokens_and_tags(text, labels, tokenizer)
    for text, labels in iob_data
]

In [9]:
# Display the first aligned sample. Per the output below, each entry is a
# (token, tag, integer id) triple -- the id is presumably the tokenizer's
# vocabulary id for that token.
final_data[0]

[('i', 'O', 1045),
 ('ordered', 'O', 3641),
 ('a', 'O', 1037),
 ('medium', 'B-beverage_size', 5396),
 ('double', 'B-beverage_modifier', 3313),
 ('double', 'I-beverage_modifier', 3313),
 ('and', 'O', 1998),
 ('a', 'O', 1037),
 ('boston', 'O', 3731),
 ('cream', 'O', 6949),
 ('don', 'B-food', 2123),
 ('##ut', 'I-food', 4904),
 ('from', 'O', 2013),
 ('tim', 'O', 5199),
 ('horton', 'O', 18469),
 ('##s', 'O', 2015)]

**Preprocessing**

In [10]:
# Token count of the longest aligned sample (0 when there are no samples);
# passed as the maximum sequence length to preprocessing in the next cell.
max_len = max((len(sample) for sample in final_data), default=0)

max_len

26

In [11]:
# Build the training dataset from the aligned samples, using the longest sample
# length as max_seq_length. Per the element_spec displayed further down, the
# result yields ({'input_ids', 'attention_mask'}, labels) elements.
prepped_data = bert_trainer.preprocess_for_training(final_data, max_seq_length=max_len)

In [12]:
# Print only the label tensor (second component) of the first dataset element.
for first_element in prepped_data.take(1):
    label_tensor = first_element[1]
    print(label_tensor)

tf.Tensor(
[[[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 1. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 1. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]

In [13]:
# Number of aligned tokens in the sample at index 3.
len(final_data[3])

17

In [14]:
# Print the complete first dataset element (inputs together with labels).
for first_element in prepped_data.take(1):
    print(first_element)

({'input_ids': <tf.Tensor: shape=(32, 26), dtype=int32, numpy=
array([[ 2064,  1045,  2031,  1037,  2235,  2137,  2080,  5699, 23301,
         1998,  1037,  1038,  7096,  2007,  4469, 11611,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [ 2507,  2033,  1037,  2235,  9587,  7507,  2007, 12428, 22286,
         1998,  5199, 16313,  2015,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [ 1996,  2235,  2413, 21161,  2001,  2205,  4086,  1998,  1996,
         6090,  5498,  2001,  4394, 12760,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [ 1996,  4469,  2312,  9587,  7507,  1045,  3641,  2001,  2025,
         2980,  2438,  1998,  1996,  4524,  2884,  2001,  4394, 20856,
            0,     0,     0,     0,     0,     0,     0,     0],
       [ 2026,  2312,  2474,  4674,  2001,  2205,  2980,  1998,  1996,
        14163, 15379,  2347,  2102, 15

In [15]:
# Display the dataset repr to check its element_spec (shapes and dtypes).
prepped_data

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 26), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 26), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 26, 15), dtype=tf.float32, name=None))>

**Train/Validation/Test Split**

In [16]:
# Count the dataset elements (batches) by iterating over the dataset once.
total_size = sum(1 for _ in prepped_data)

# 60% / 20% / remainder split, measured in batches.
train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

# Three disjoint, contiguous slices. Previously the training set also contained
# the validation batches and the test set also contained the validation
# batches, which leaked validation data into both training and testing.
# NOTE(review): take/skip splits are only disjoint if prepped_data yields its
# batches in the same order on every iteration; if preprocess_for_training
# shuffles with per-iteration reshuffling, the splits will mix -- confirm.
train_dataset = prepped_data.take(train_size)
val_dataset = prepped_data.skip(train_size).take(val_size)
test_dataset = prepped_data.skip(train_size + val_size)

In [17]:
# Show the number of dataset elements (batches) assigned to each split.
train_size, val_size, test_size

(21, 7, 7)

**Training**

In [None]:
# Collect the distinct tags that occur anywhere in the aligned data; their count
# is the size of the classification head.
# NOTE(review): the label tensors in prepped_data have a last dimension of 15
# (see the element_spec output above); num_labels must equal that for the loss
# to line up -- confirm preprocessing does not add or drop a class.
distinct_tags = {label for tokens in final_data for _, label, _ in tokens}
num_labels = len(distinct_tags)

# Configure and load the pre-trained BERT token-classification model.
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', config=config)

In [19]:
optimizer = keras.optimizers.Adam(learning_rate=5e-5)

# from_logits=True: the loss treats the model outputs as unnormalised logits
# and the targets as one-hot label vectors.
loss = keras.losses.CategoricalCrossentropy(from_logits=True)

# Precision/Recall default to thresholding predictions at 0.5, which is only
# meaningful for probabilities. Because the predictions here are logits, use
# top_k=1 instead: the highest-scoring class of each token is taken as the
# prediction, which is independent of the score scale.
# NOTE(review): bert_trainer.F1Score is defined outside this notebook --
# confirm it also copes with logit inputs.
metrics = [
    keras.metrics.Precision(top_k=1),
    keras.metrics.Recall(top_k=1),
    bert_trainer.F1Score(),
]

In [20]:
# Checkpointing is currently disabled; the previous configuration is kept for reference:
# keras.callbacks.ModelCheckpoint(filepath='resources/bert/checkpoints/model.{epoch:02d}-{val_loss:.2f}.h5', save_weights_only=True)

# Stop training once val_loss has not improved for 10 consecutive epochs.
# Note that the fit cell below trains for a single epoch, so this cannot
# trigger unless the epoch count is raised.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
callbacks = [early_stopping]

In [21]:
# Attach the optimizer, loss and metrics defined in the cells above to the model.
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [22]:
# Inspect the model's last layer (the output below shows a Dense layer).
model.layers[-1]

<keras.layers.core.dense.Dense at 0x1fd0ed426a0>

In [None]:
# First training round: a single epoch on the training split, validated on the
# validation split, with the callbacks configured above. The value returned by
# fit is kept in history_round_1.
history_round_1 = model.fit(train_dataset, epochs=1, validation_data=val_dataset, callbacks=callbacks, verbose=2)

**For Reloading/Saving**

In [32]:
# Persist the preprocessed dataset to disk (path relative to the working
# directory) so it can be reloaded without repeating the preprocessing.
prepped_data.save('resources/bert/data/prepped_data')

In [3]:
# Reload the previously saved dataset from disk, replacing prepped_data.
# NOTE(review): this cell's execution count (In [3]) is out of sequence with the
# cells above, so it appears to have been run in a separate session as a
# shortcut; in a top-to-bottom run it simply overwrites the dataset that was
# just saved.
prepped_data = tf.data.Dataset.load('resources/bert/data/prepped_data')