In [None]:
!pip3 install datasets
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

In [2]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datasets import load_dataset
from collections import Counter
from conlleval import evaluate

In [3]:

class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalisation.

    Args:
        embed_dim: Width of the input/output features; also used as the
            per-head key dimension of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to `inputs`; `training` toggles both dropouts."""
        # Sub-layer 1: self-attention (query and value are both `inputs`).
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        # Sub-layer 2: position-wise feed-forward network.
        projected = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + projected)


In [4]:

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        """Embed `inputs` and add the embedding of each position index."""
        # Position indices 0..L-1, where L is the size of the last axis.
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        embedded_positions = self.pos_emb(position_ids)
        embedded_tokens = self.token_emb(inputs)
        return embedded_tokens + embedded_positions


In [5]:

class NERModel(keras.Model):
    """Token tagger: embeddings -> transformer block -> dense -> softmax.

    Args:
        num_tags: Size of the softmax output produced for every position.
        vocab_size: Size of the token-embedding table.
        maxlen: Size of the position-embedding table.
        embed_dim: Embedding width.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Width of the feed-forward layers.
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        """Return per-position tag probabilities for `inputs`."""
        hidden = self.transformer_block(self.embedding_layer(inputs))
        hidden = self.ff(self.dropout1(hidden, training=training))
        hidden = self.dropout2(hidden, training=training)
        return self.ff_final(hidden)


In [6]:
# Entity types in label-id order. Every type owns two consecutive ids:
# an even id for its 'B-' tag followed by an odd id for its 'I-' tag.
_ENTITY_TYPES = (
    'Facility', 'OtherLOC', 'HumanSettlement', 'Station',
    'VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software',
    'MusicalGRP', 'PublicCORP', 'PrivateCORP', 'AerospaceManufacturer',
    'SportsGRP', 'CarManufacturer', 'ORG',
    'Scientist', 'Artist', 'Athlete', 'Politician', 'Cleric',
    'SportsManager', 'OtherPER',
    'Clothing', 'Vehicle', 'Food', 'Drink', 'OtherPROD',
    'Medication/Vaccine', 'MedicalProcedure', 'AnatomicalStructure',
    'Symptom', 'Disease',
)

# Tag string -> integer id. Id 0 is the padding label and id 1 is 'O'.
mapping = {'[PAD]': 0, 'O': 1}
mapping.update(
    (prefix + entity_type, 2 * position + offset)
    for position, entity_type in enumerate(_ENTITY_TYPES, start=1)
    for offset, prefix in enumerate(('B-', 'I-'))
)


# Parse 'en_train.conll' into parallel per-sentence lists. A blank line ends
# the current sentence; a new sentence is only opened when the current one is
# non-empty, so consecutive blank lines do not create empty sentences.
result1 = [[]]   # tokens (column 0)
result2 = [[]]   # tag strings (column 3)
result3 = [[]]   # tag ids as strings; 'None' when the tag is not in `mapping`
result11 = [[]]  # tokens again, kept as lists of lists for vocabulary counting

# An explicit encoding keeps parsing independent of the platform default, and
# the handle is no longer named `input`, which shadowed the builtin.
with open('en_train.conll', "r", encoding="utf-8") as conll_file:
    for line in conll_file:
        # NOTE(review): this skips every line starting with '#', which would
        # also drop a token row whose token begins with '#' - confirm intended.
        if line.startswith("#"):
            continue
        fields = line.split()  # split once instead of once per column
        if not fields:
            for sentences in (result1, result2, result3, result11):
                if len(sentences[-1]) > 0:
                    sentences.append([])
        else:
            token, tag = fields[0], fields[3]
            result1[-1].append(token)
            result2[-1].append(tag)
            result3[-1].append(str(mapping.get(tag)))
            result11[-1].append(token)

# Collapse each sentence into a single tab-separated string.
result1 = ["\t".join(sentence) for sentence in result1]
result2 = ["\t".join(sentence) for sentence in result2]
result3 = ["\t".join(sentence) for sentence in result3]



In [7]:
def export_to_file(export_file_path, result1,result3):
    """Write one tab-separated record per non-empty sentence.

    Each output line is ``<token count>\\t<tokens>\\t<tag ids>``.

    Args:
        export_file_path: Path of the text file to (over)write.
        result1: Sequence of sentences, each a whitespace/tab-joined string
            of tokens. Sentences without any token are skipped.
        result3: Sequence parallel to `result1` holding the tab-joined tag
            ids of each sentence. Every occurrence of 'None' is replaced
            with '1' before writing.

    Raises:
        IndexError: If `result3` has no entry for a non-empty sentence.
    """
    # UTF-8 is written explicitly so the file content does not depend on the
    # platform's default encoding.
    with open(export_file_path, "w", encoding="utf-8") as f:
        # Index both lists by the same position so a skipped (empty) sentence
        # can never shift the tags of the sentences that follow it.
        for index, record in enumerate(result1):
            token_count = len(record.split())
            if token_count == 0:
                continue
            tags = result3[index].replace('None', '1')
            f.write("\t".join((str(token_count), record, tags)) + "\n")


# Write the parsed training sentences (tokens plus tag ids) to
# "conll_train.txt", one record per line.
export_to_file("conll_train.txt", result1,result3)

In [8]:
# Parse 'en_test.conll' into parallel per-sentence lists; the result is
# exported as "conll_val.txt". A blank line ends the current sentence; a new
# sentence is only opened when the current one is non-empty.
vresult1 = [[]]   # tokens (column 0)
vresult2 = [[]]   # tag strings (column 3)
vresult3 = [[]]   # tag ids as strings; 'None' when the tag is not in `mapping`
vresult11 = [[]]  # tokens again, kept as lists of lists

# An explicit encoding keeps parsing independent of the platform default, and
# the handle is no longer named `input`, which shadowed the builtin.
with open('en_test.conll', "r", encoding="utf-8") as conll_file:
    for line in conll_file:
        # NOTE(review): this skips every line starting with '#', which would
        # also drop a token row whose token begins with '#' - confirm intended.
        if line.startswith("#"):
            continue
        fields = line.split()  # split once instead of once per column
        if not fields:
            for sentences in (vresult1, vresult2, vresult3, vresult11):
                if len(sentences[-1]) > 0:
                    sentences.append([])
        else:
            token, tag = fields[0], fields[3]
            vresult1[-1].append(token)
            vresult2[-1].append(tag)
            vresult3[-1].append(str(mapping.get(tag)))
            vresult11[-1].append(token)

# Collapse each sentence into a single tab-separated string.
vresult1 = ["\t".join(sentence) for sentence in vresult1]
vresult2 = ["\t".join(sentence) for sentence in vresult2]
vresult3 = ["\t".join(sentence) for sentence in vresult3]

export_to_file("conll_val.txt", vresult1,vresult3)

In [9]:
# Flatten the per-sentence token lists. A comprehension is linear in the
# number of tokens, whereas sum(list_of_lists, []) re-copies the growing
# list on every addition (quadratic).
all_tokens = [token for sentence in result11 for token in sentence]
all_tokens_array = np.array(list(map(str.lower, all_tokens)))

# Frequency of every lower-cased training token; its size is the number of
# distinct tokens.
counter = Counter(all_tokens_array)
print(len(counter))



34140


In [10]:
vocab_size = 75000
num_tags = len(mapping)

# Keep at most (vocab_size - 2) of the most common training words, leaving
# two ids free for special tokens that `StringLookup` may add on top of the
# vocabulary (an unknown token, and a masking token when one is configured).
vocabulary = [word for word, _ in counter.most_common(vocab_size - 2)]

# The StringLookup layer converts tokens to token IDs.
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)

In [11]:
# Line-based datasets over the two exported record files; the records are
# parsed later by `map_record_to_training_data`.
train_data = tf.data.TextLineDataset("conll_train.txt")
val_data = tf.data.TextLineDataset("conll_val.txt")

In [12]:

def map_record_to_training_data(record):
    """Split one exported record into its tokens and numeric tag ids.

    The record layout is ``<n>\\t<token_1>...<token_n>\\t<tag ids...>``: the
    first field gives the number of tokens, the next `n` fields are the
    tokens and all remaining fields are parsed as int64 tag ids.

    Returns:
        A ``(tokens, tags)`` pair of string and int64 tensors.
    """
    fields = tf.strings.split(record, sep="\t")
    token_count = tf.strings.to_number(fields[0], out_type=tf.int32)
    first_tag = token_count + 1
    tokens = fields[1:first_tag]
    tag_ids = tf.strings.to_number(fields[first_tag:], out_type=tf.int64)
    return tokens, tag_ids


def lowercase_and_convert_to_ids(tokens):
    """Lower-case `tokens` and map them to ids with `lookup_layer`.

    Lower-casing is done as UTF-8. With the default (ASCII) setting of
    `tf.strings.lower`, non-ASCII upper-case characters are left untouched,
    whereas the vocabulary was lower-cased with Python's Unicode-aware
    `str.lower`; such tokens would then miss their vocabulary entry.
    """
    tokens = tf.strings.lower(tokens, encoding="utf-8")
    return lookup_layer(tokens)


def _encode_tokens(tokens, tags):
    """Replace the token strings by their ids; tags pass through unchanged."""
    return lowercase_and_convert_to_ids(tokens), tags


def _build_padded_batches(lines):
    """Parse raw record lines, convert tokens to ids and batch with padding."""
    parsed = lines.map(map_record_to_training_data)
    return parsed.map(_encode_tokens).padded_batch(batch_size)


# Records differ in length, so batches are built with `padded_batch`, which
# pads every record of a batch to a common length.
batch_size = 32
train_dataset = _build_padded_batches(train_data)
val_dataset = _build_padded_batches(val_data)

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)

In [13]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose true label is 0 are masked out: they add nothing to the
    summed loss and are not counted in the denominator.
    """

    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Return the mean per-token loss over positions with `y_true > 0`.

        Args:
            y_true: Integer label ids; 0 marks padding.
            y_pred: Predicted class probabilities (not logits).
        """
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        # A batch that contains only padding has a zero mask sum; return 0
        # for it instead of the NaN a plain division would produce.
        return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))


loss = CustomNonPaddingTokenLoss()

In [14]:
# Entity types in label-id order. Every type owns two consecutive ids:
# an even id for its 'B-' tag followed by an odd id for its 'I-' tag.
_LABEL_ENTITY_TYPES = (
    'Facility', 'OtherLOC', 'HumanSettlement', 'Station',
    'VisualWork', 'MusicalWork', 'WrittenWork', 'ArtWork', 'Software',
    'MusicalGRP', 'PublicCORP', 'PrivateCORP', 'AerospaceManufacturer',
    'SportsGRP', 'CarManufacturer', 'ORG',
    'Scientist', 'Artist', 'Athlete', 'Politician', 'Cleric',
    'SportsManager', 'OtherPER',
    'Clothing', 'Vehicle', 'Food', 'Drink', 'OtherPROD',
    'Medication/Vaccine', 'MedicalProcedure', 'AnatomicalStructure',
    'Symptom', 'Disease',
)

# Integer id -> tag string. Id 0 is the padding label and id 1 is 'O'.
mapping2 = {0: '[PAD]', 1: 'O'}
mapping2.update(
    (2 * position + offset, prefix + entity_type)
    for position, entity_type in enumerate(_LABEL_ENTITY_TYPES, start=1)
    for offset, prefix in enumerate(('B-', 'I-'))
)

In [15]:
# Train with the Adam optimizer and the `loss` instance of
# CustomNonPaddingTokenLoss. No validation data is passed to fit().
ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(train_dataset, epochs=50)


def tokenize_and_convert_to_ids(text):
    """Split `text` on whitespace and return the token ids of the pieces."""
    return lowercase_and_convert_to_ids(text.split())


# Sample inference using the trained model
# The sample sentence is split on whitespace, so its tab separators act as
# token boundaries.
sample_input = tokenize_and_convert_to_ids(
    "robert	gottschalk	1939	academy	award	winner	and	founder	of	panavision"
)
# Add a leading batch axis of size 1 before calling predict().
sample_input = tf.reshape(sample_input, shape=[1, -1])
print(sample_input)

output = ner_model.predict(sample_input)
# Highest-scoring tag id per token of the single sentence, then its tag name.
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping2[i] for i in prediction]

print(prediction)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
tf.Tensor([[  232 13235  1290   909   944   881     5   333     3 13236]], shape=(1, 10), dtype=int64)
['B-OtherPER', 'I-OtherPER', 'O', 'B-VisualWork', 'I-VisualWork', 'O', 'O', 'O', 'O', 'B-ORG']


In [16]:
def calculate_metrics(dataset):
    """Run `ner_model` over `dataset` and report metrics via `evaluate`.

    Args:
        dataset: Iterable of ``(token_ids, tag_ids)`` batches in which tag id
            0 marks padding.

    The tags of all batches are concatenated into two flat sequences (true
    and predicted) and passed to `evaluate`.
    """
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        # verbose=0 avoids one progress bar per batch in the cell output.
        output = ner_model.predict(x, verbose=0)
        predictions = np.reshape(np.argmax(output, axis=-1), [-1])
        true_tag_ids = np.reshape(y, [-1])

        # Select real tokens from the gold labels only. Filtering on the
        # predictions as well would silently remove every real token the
        # model labelled as padding, i.e. drop errors from the evaluation.
        mask = true_tag_ids > 0
        all_true_tag_ids.append(true_tag_ids[mask])
        all_predicted_tag_ids.append(predictions[mask])

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    # '[PAD]' is not a tag of a real token, so a padding prediction on a real
    # token is scored as 'O' (no entity).
    predicted_tags = [
        mapping2[tag] if tag > 0 else 'O' for tag in all_predicted_tag_ids
    ]
    real_tags = [mapping2[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)


calculate_metrics(val_dataset)

processed 777918 tokens with 86694 phrases; found: 133734 phrases; correct: 15949.
accuracy:  27.49%; (non-O)
accuracy:  77.28%; precision:  11.93%; recall:  18.40%; FB1:  14.47
AerospaceManufacturer: precision:   0.15%; recall:  50.00%; FB1:   0.30  675
AnatomicalStructure: precision:   0.00%; recall:   0.00%; FB1:   0.00  656
          ArtWork: precision:   0.25%; recall:  12.00%; FB1:   0.48  1220
           Artist: precision:  27.24%; recall:  23.32%; FB1:  25.13  30442
          Athlete: precision:  13.24%; recall:   7.89%; FB1:   9.88  5697
  CarManufacturer: precision:   0.43%; recall:  50.00%; FB1:   0.86  5100
           Cleric: precision:   0.29%; recall:  17.11%; FB1:   0.58  8847
         Clothing: precision:   0.25%; recall:  16.67%; FB1:   0.49  399
          Disease: precision:   9.09%; recall:  35.00%; FB1:  14.43  77
            Drink: precision:   0.00%; recall:   0.00%; FB1:   0.00  1529
         Facility: precision:   1.75%; recall:   4.20%; FB1:   2.47  1259
      