In [226]:
import os
import joblib
from tensorflow import keras
import json
from typing import List, Tuple
from transformers import BertTokenizer, TFBertForTokenClassification
import tensorflow as tf

# os.chdir('/content/drive/Othercomputers/AKATSUKI-PC/PycharmProjects/chatopotamus')

In [229]:
# Load the pre-trained BERT tokenizer.
# NOTE(review): `use_fast` is documented as an AutoTokenizer option; BertTokenizer
# is the pure-Python tokenizer class, so the flag presumably has no effect here — confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast=True)

# Load the NER dataset and the entity lookup.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the JSON files.
with open('resources/bert/data/ner_dataset.json', 'r', encoding='utf-8') as file:
    ner_dataset_json = json.load(file)

with open('resources/bert/data/ner_lookup.json', 'r', encoding='utf-8') as file:
    ner_lookup = json.load(file)

# Two labels per entity type plus one extra label (matches the B-/I- tags per
# type and the shared 'O' tag produced by convert_to_IOB).
num_labels = len(ner_lookup['entities'])*2 + 1

# Load pre-trained model (weights) with a token-classification head of num_labels outputs
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

**Convert dataset into IOB tagged data**

In [218]:
import re

def _tag_multiword_entities(words_lower, tags, entity_values, tag_prefix):
    """Mark every occurrence of each multi-word entity with B-/I- tags, in place.

    Args:
        words_lower: Lower-cased sentence tokens, as produced by ``split(' ')``.
        tags: Tag list aligned with ``words_lower``; mutated in place.
        entity_values: Lower-cased entity phrases whose words are space-separated.
        tag_prefix: Entity type used to build the ``B-<type>`` / ``I-<type>`` tags.
    """
    for entity in entity_values:
        entity_words = entity.split(' ')
        start_index = 0
        while start_index < len(words_lower):
            try:
                start_index = words_lower.index(entity_words[0], start_index)
            except ValueError:
                break  # the entity's first word does not occur any further

            end_index = start_index + len(entity_words)
            if words_lower[start_index:end_index] == entity_words:
                tags[start_index] = f'B-{tag_prefix}'
                for i in range(start_index + 1, end_index):
                    tags[i] = f'I-{tag_prefix}'
                # Continue after the match so tagged occurrences never overlap.
                start_index = end_index
            else:
                # Only the first word matched. Step a single token forward:
                # skipping the whole entity length here would jump over a real
                # match that starts inside the skipped span ("new new york").
                start_index += 1


def convert_to_IOB(dataset: dict, lookup: dict) -> List[Tuple[str, List[str]]]:
    """Tag every sentence of ``dataset`` with IOB labels derived from ``lookup``.

    Matching is case-insensitive and ignores punctuation: punctuation is
    stripped from a copy of the sentence before it is split on single spaces.
    Because whitespace is left untouched, the tag list has exactly one entry
    per item of ``sentence.split(' ')`` on the original sentence.

    Multi-word entities are tagged first (``B-<type>`` on the first word,
    ``I-<type>`` on the rest). Single-word entities then receive ``B-<type>``
    only on tokens that are still tagged ``'O'``.

    Args:
        dataset: Mapping with an ``'instances'`` list; each instance is a
            mapping with a ``'sentence'`` string.
        lookup: Mapping with an ``'entities'`` mapping from entity type to a
            list of entity strings.

    Returns:
        A list of ``(original_sentence, tags)`` tuples, where ``tags`` is the
        list of IOB labels for that sentence.
    """
    # Split the lookup into single-word and multi-word entities per type.
    one_word_entities = {}
    multi_word_entities = {}
    for entity_type, values in lookup['entities'].items():
        lowered = [value.lower() for value in values]
        # A set is enough for single words: only membership is tested below.
        one_word_entities[entity_type] = {value for value in lowered if ' ' not in value}
        multi_word_entities[entity_type] = [value for value in lowered if ' ' in value]

    iob_data = []
    for instance in dataset['instances']:
        original_sentence = instance['sentence']
        # Remove punctuation for matching, but keep the original sentence for output
        sentence = re.sub(r'[^\w\s]', '', original_sentence)
        words_lower = [word.lower() for word in sentence.split(' ')]
        this_sentence_tags = ['O'] * len(words_lower)

        # Multi-word entities go first so they take precedence over single words.
        for entity_type, values in multi_word_entities.items():
            _tag_multiword_entities(words_lower, this_sentence_tags, values, entity_type)

        # Single-word entities only fill tokens that are still untagged.
        for entity_type, values in one_word_entities.items():
            for i, word in enumerate(words_lower):
                if this_sentence_tags[i] == 'O' and word in values:
                    this_sentence_tags[i] = f'B-{entity_type}'

        # Keep the original sentence (with punctuation) next to its tags.
        iob_data.append((original_sentence, this_sentence_tags))

    return iob_data

In [219]:
# Tag every sentence of the loaded dataset with IOB labels using the entity lookup.
iob_data = convert_to_IOB(ner_dataset_json, ner_lookup)

In [None]:
# Inspect the first (sentence, tags) pair as a quick sanity check.
iob_data[0]

**Tokenization and Handling Subword Tokens**

**For Reloading/Saving**

In [244]:
# Save important variables
# Create the target folders first so the dumps do not fail on a missing directory.
os.makedirs('resources/bert/data', exist_ok=True)
os.makedirs('resources/bert/pretrained', exist_ok=True)

joblib.dump(iob_data, 'resources/bert/data/iob_data.pkl')
joblib.dump(tokenizer, 'resources/bert/pretrained/tokenizer.pkl')
# save_weights stores the weights only (no architecture): restoring them requires
# building the same model first and then calling load_weights on this file.
model.save_weights('resources/bert/pretrained/bert-base-uncased.h5')

In [None]:
# Load variables
# NOTE: joblib.load unpickles arbitrary objects — only load files written by this notebook.
tokenizer = joblib.load('resources/bert/pretrained/tokenizer.pkl')

# The .h5 file was written with model.save_weights, so it holds weights only and
# keras.models.load_model cannot restore a model from it. Rebuild the same
# architecture (label count recomputed from the entity lookup so this cell does
# not depend on earlier cells having run) and load the weights into it.
with open('resources/bert/data/ner_lookup.json', 'r', encoding='utf-8') as file:
    ner_lookup = json.load(file)
num_labels = len(ner_lookup['entities'])*2 + 1
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.load_weights('resources/bert/pretrained/bert-base-uncased.h5')

# Read the file name the save cell actually writes (iob_data.pkl).
iob_dataset = joblib.load('resources/bert/data/iob_data.pkl')
# Keep the name used by the rest of the notebook bound to the same object.
iob_data = iob_dataset