In [1]:
import os
import pickle
os.chdir('/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus')

In [2]:
import joblib
from tensorflow import keras
import json
from typing import List, Tuple
from transformers import AutoTokenizer, BertTokenizer, TFBertForTokenClassification, BertConfig
import tensorflow as tf
from src.training import ner_prep
import numpy as np

In [4]:
os.chdir('C:\\Users\\thory\\PycharmProjects\\chatopotamus')

**Load the Tokenizer and Dataset**

In [8]:
# Load pre-trained model tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)

# Load files
with open('resources/bert/data/ner_dataset.json', 'r') as file:
	ner_dataset_json = json.load(file)

with open('resources/bert/data/ner_lookup.json', 'r') as file:
	ner_lookup = json.load(file)

**Convert dataset into IOB tagged data**

In [9]:
iob_data = ner_prep.convert_to_IOB(ner_dataset_json, ner_lookup)

In [10]:
iob_data[2]

('give me a hot chocolate no whipped topping and a grilled cheese sandwich',
 ['O',
  'O',
  'O',
  'B-beverage',
  'I-beverage',
  'O',
  'B-beverage_modifier',
  'I-beverage_modifier',
  'O',
  'O',
  'B-food',
  'I-food',
  'O'])

**Tokenization and Handling Subword Tokens**

In [11]:
final_data = []
for sentence, tags in iob_data:
	final_data.append(ner_prep.align_tokens_and_tags(sentence,tags, tokenizer))

In [12]:
final_data[0]

[('i', 'O', 1045),
 ('ordered', 'O', 3641),
 ('a', 'O', 1037),
 ('medium', 'B-beverage_size', 5396),
 ('double', 'B-beverage_modifier', 3313),
 ('double', 'I-beverage_modifier', 3313),
 ('and', 'O', 1998),
 ('a', 'O', 1037),
 ('boston', 'O', 3731),
 ('cream', 'O', 6949),
 ('don', 'B-food', 2123),
 ('##ut', 'I-food', 4904),
 ('from', 'O', 2013),
 ('tim', 'O', 5199),
 ('horton', 'O', 18469),
 ('##s', 'O', 2015)]

**Preprocessing**

In [13]:
max_len=0
for sent in final_data:
    max_len = len(sent) if len(sent)>max_len else max_len

max_len

26

In [14]:
prepped_data, label_map = ner_prep.preprocess_for_training(final_data, max_seq_length=max_len)

In [14]:
label_map

{'B-food': 0,
 'B-beverage_complain': 1,
 'B-beverage_size': 2,
 'B-beverage': 3,
 'I-beverage_size': 4,
 'I-beverage_complain': 5,
 'I-food_modifier': 6,
 'B-food_complain': 7,
 'B-beverage_modifier': 8,
 'I-food': 9,
 'O': 10,
 'B-food_modifier': 11,
 'I-food_complain': 12,
 'I-beverage_modifier': 13,
 'I-beverage': 14,
 -100: -100}

In [13]:
for i in prepped_data.take(1):
    print(np.argmax(i[1], axis=2))

[[10 10 10  2  8 13 10 10 10 10  0  9 10 10 10 10  0  0  0  0  0  0  0  0
   0  0]
 [10 10 10  2  3 14 10  8 13 10 10 11  6  0  9 10 11  0  0  0  0  0  0  0
   0  0]
 [10 10 10  3 14 10  8 13 10 10  0  9  9 10  0  0  0  0  0  0  0  0  0  0
   0  0]
 [10 10 10  2  8 13  3 10  8 13 10 10  0  9 10 11  6  0  0  0  0  0  0  0
   0  0]
 [10 10 10 10 10  3 14 10 10 10 10 10 10  0  9  9 10 11  6  0  0  0  0  0
   0  0]
 [10 10 10  2  3 14 10 10 10 10  0  9 10 11  6  0  0  0  0  0  0  0  0  0
   0  0]
 [10 10 10  2  3 14 10  8 13 10  0  9  9  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [10 10 10  2  3 14 14 10 10  0  9 10 10 11  6  0  0  0  0  0  0  0  0  0
   0  0]
 [10 10 10  2 10  3 14 14 14 10 10  0  9 10 11  6  0  0  0  0  0  0  0  0
   0  0]
 [10 10 10  2  3 14 10  8 13 10 10 11  6  0  9 10 11  0  0  0  0  0  0  0
   0  0]
 [10 10 10  3 14 10  8 13 10 10  0  9  9 10  0  0  0  0  0  0  0  0  0  0
   0  0]
 [10 10 10  2  8 13  3 10  8 13 10 10  0  9 10 11  6  0  0  0  0  0  0  0
   0  0]
 [10

In [15]:
prepped_data

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 26), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 26), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 26, 15), dtype=tf.float32, name=None))>

**Train-Test Split**

In [15]:
total_size = 0
for _ in prepped_data:
    total_size += 1

train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

train_dataset = prepped_data.take(train_size)
val_dataset = prepped_data.skip(train_size).take(val_size)
test_dataset = prepped_data.skip(train_size+val_size)

In [16]:
train_size, val_size, test_size

(21, 7, 7)

**Training**

In [26]:
num_labels = len(set([tag for sentence in final_data for _, tag, _ in sentence]))
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', config=config)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
optimizer = keras.optimizers.Adam(learning_rate=5e-5)
loss = keras.losses.CategoricalCrossentropy(from_logits=True)
metrics = [keras.metrics.Precision(), keras.metrics.Recall(), ner_prep.F1Score()]

In [37]:
callbacks = [
    # keras.callbacks.ModelCheckpoint(filepath='resources\\bert\\checkpoints\\model.{epoch:02d}-{val_loss:.2f}.h5', save_weights_only=True),
    keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=7)
]

In [38]:
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [39]:
model.layers[-1]

<keras.src.layers.core.dense.Dense at 0x7dc958fd4940>

In [40]:
history_round_1 = model.fit(train_dataset, epochs=50, validation_data=val_dataset, callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


**Evaluation**

In [41]:
loss, precision, recall, f1_score = model.evaluate(test_dataset)
print(f'Precsion: {precision}\nRecall: {recall}\nF1 Score: {f1_score}')

Precsion: 0.9464166760444641
Recall: 0.9991577863693237
F1 Score: 0.9720718860626221


In [43]:
ner_prep.predict_labels('i want a large mocha and a donut, the last one was dry', tokenizer, model, label_map, max_seq_length=26)



[('i', 'O'),
 ('want', 'O'),
 ('a', 'O'),
 ('large', 'B-beverage_size'),
 ('mo', 'B-beverage'),
 ('##cha', 'I-beverage'),
 ('and', 'O'),
 ('a', 'O'),
 ('don', 'B-food'),
 ('##ut', 'I-food'),
 ('the', 'O'),
 ('last', 'O'),
 ('one', 'O'),
 ('was', 'O'),
 ('dry', 'I-food_complain')]

**For Reloading/Saving**

In [63]:
prepped_data.save('resources/bert/data/prepped_data')
with open('resources/bert/data/label_map.pkl', 'wb') as f:
    pickle.dump(label_map, f)

In [4]:
prepped_data = tf.data.Dataset.load('resources/bert/data/prepped_data')
with open('resources/bert/data/label_map.pkl', 'rb') as f:
    label_map = pickle.load(f)

In [42]:
model.save_weights('resources/bert/pretrained/bert-base-uncased.h5')

In [None]:
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=15)
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', config=config)

model.load_weights('resources/bert/pretrained/bert-base-uncased.h5')