### Training BERT for Entity Recognition

In [1]:
import os
import pickle
#os.chdir('/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus')

In [2]:
import joblib
from tensorflow import keras
import json
from transformers import AutoTokenizer, BertTokenizer, TFBertForTokenClassification, BertConfig
import tensorflow as tf
from all_legacy_code.src.preprocess import ner_prep
from all_legacy_code.src import models
from all_legacy_code.src.training_and_prediction import predict
import numpy as np

In [3]:
os.chdir('C:\\Users\\thory\\PycharmProjects\\chatopotamus')

**Load the Tokenizer and Dataset**

In [4]:
# Load pre-trained model tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)

# Load files
with open('resources/bert/data/ner_dataset.json', 'r') as file:
	ner_dataset_json = json.load(file)

with open('resources/bert/data/ner_lookup.json', 'r') as file:
	ner_lookup = json.load(file)

**Convert dataset into IOB tagged data**

In [5]:
iob_data = ner_prep.convert_to_IOB(ner_dataset_json, ner_lookup)

In [6]:
iob_data[2]

('give me a hot chocolate no whipped topping and a grilled cheese sandwich',
 ['O',
  'O',
  'O',
  'B-beverage',
  'I-beverage',
  'O',
  'B-beverage_modifier',
  'I-beverage_modifier',
  'O',
  'O',
  'B-food',
  'I-food',
  'O'])

**Tokenization and Handling Subword Tokens**

In [7]:
final_data = []
for sentence, tags in iob_data:
	final_data.append(ner_prep.align_tokens_and_tags(sentence, tags, tokenizer))

In [8]:
final_data[0]

[('i', 'O', 1045),
 ('ordered', 'O', 3641),
 ('a', 'O', 1037),
 ('medium', 'B-beverage_size', 5396),
 ('double', 'B-beverage_modifier', 3313),
 ('double', 'I-beverage_modifier', 3313),
 ('and', 'O', 1998),
 ('a', 'O', 1037),
 ('boston', 'O', 3731),
 ('cream', 'O', 6949),
 ('don', 'B-food', 2123),
 ('##ut', 'I-food', 4904),
 ('from', 'O', 2013),
 ('tim', 'O', 5199),
 ('horton', 'O', 18469),
 ('##s', 'O', 2015)]

**Preprocessing**

In [9]:
max_len=0
for sent in final_data:
    max_len = len(sent) if len(sent)>max_len else max_len

max_len

26

In [10]:
prepped_data, label_map = ner_prep.preprocess_for_training(final_data, max_seq_length=max_len)

In [11]:
label_map

{'B-beverage': 0,
 'B-beverage_complain': 1,
 'B-beverage_modifier': 2,
 'B-beverage_size': 3,
 'B-food': 4,
 'B-food_complain': 5,
 'B-food_modifier': 6,
 'I-beverage': 7,
 'I-beverage_complain': 8,
 'I-beverage_modifier': 9,
 'I-beverage_size': 10,
 'I-food': 11,
 'I-food_complain': 12,
 'I-food_modifier': 13,
 'O': 14,
 -100: -100}

In [12]:
for i in prepped_data.take(1):
    print(np.argmax(i[1], axis=2))

[[11 11 11  5  6 10 11 11 11 11  2  1 11 11 11 11  0  0  0  0  0  0  0  0
   0  0]
 [11 11 11  5  3  9 11  6 10 11 11 14  7  2  1 11 14  0  0  0  0  0  0  0
   0  0]
 [11 11 11  3  9 11  6 10 11 11  2  1  1 11  0  0  0  0  0  0  0  0  0  0
   0  0]
 [11 11 11  5  6 10  3 11  6 10 11 11  2  1 11 14  7  0  0  0  0  0  0  0
   0  0]
 [11 11 11 11 11  3  9 11 11 11 11 11 11  2  1  1 11 14  7  0  0  0  0  0
   0  0]
 [11 11 11  5  3  9 11 11 11 11  2  1 11 14  7  0  0  0  0  0  0  0  0  0
   0  0]
 [11 11 11  5  3  9 11  6 10 11  2  1  1  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [11 11 11  5  3  9  9 11 11  2  1 11 11 14  7  0  0  0  0  0  0  0  0  0
   0  0]
 [11 11 11  5 11  3  9  9  9 11 11  2  1 11 14  7  0  0  0  0  0  0  0  0
   0  0]
 [11 11 11  5  3  9 11  6 10 11 11 14  7  2  1 11 14  0  0  0  0  0  0  0
   0  0]
 [11 11 11  3  9 11  6 10 11 11  2  1  1 11  0  0  0  0  0  0  0  0  0  0
   0  0]
 [11 11 11  5  6 10  3 11  6 10 11 11  2  1 11 14  7  0  0  0  0  0  0  0
   0  0]
 [11

In [13]:
prepped_data

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 26), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 26), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 26, 15), dtype=tf.float32, name=None))>

**Train-Test Split**

In [16]:
total_size = 0
for _ in prepped_data:
    total_size += 1

train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

train_dataset = prepped_data.take(train_size)
val_dataset = prepped_data.skip(train_size).take(val_size)
test_dataset = prepped_data.skip(train_size+val_size)

**Training**

In [None]:
model = models.create_entity_classifier(compile=True, lr=5e-5)

In [29]:
callbacks = [
    # keras.callbacks.ModelCheckpoint(filepath='resources\\bert\\checkpoints\\model.{epoch:02d}-{val_loss:.2f}.h5', save_weights_only=True),
    keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=7)
]

In [30]:
history_round_1 = model.fit(train_dataset, epochs=9, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9


**Evaluation**

In [31]:
loss, precision, recall, f1_score = model.evaluate(test_dataset)
print(f'Precsion: {precision}\nRecall: {recall}\nF1 Score: {f1_score}')

Precsion: 0.9130523800849915
Recall: 0.9995520710945129
F1 Score: 0.9543456435203552


In [36]:
predict.predict_entities('the iced capp was too dry', model, tokenizer, label_map, 26)



[('the', 'O'),
 ('iced', 'B-beverage'),
 ('cap', 'I-beverage'),
 ('##p', 'I-beverage'),
 ('was', 'O'),
 ('too', 'B-food_complain'),
 ('dry', 'I-food_complain')]

**For Reloading/Saving**

In [19]:
# LOAD
prepped_data = tf.data.Dataset.load('resources/bert/data/prepped_data')
with open('resources/bert/data/label_map.pkl', 'rb') as f:
    label_map = pickle.load(f)
with open('resources/bert/saved/ner_tokenizer.pkl', 'wb') as f:
    tokenizer = pickle.load(f)
model.load_weights('resources/bert/saved/ner_trained_weights.h5')

In [37]:
# SAVE
model.save_weights('resources/bert/saved/ner_trained_weights.h5')
prepped_data.save('resources/bert/data/prepped_data')
with open('resources/bert/data/label_map.pkl', 'wb') as f:
    pickle.dump(label_map, f)
with open('resources/bert/saved/ner_tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)