In [43]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf
import pandas as pd
# set random seeds to make this notebook easier to replicate
tf.keras.utils.set_random_seed(33)

# Preprocess data

## Load data

In [3]:
def load_data(file_path):
    with open(file_path,'r') as file:
        data = np.array([line.strip() for line in file.readlines()])
    return data
    

In [4]:
train_sentences = load_data('data/large/train/sentences.txt')
train_labels = load_data('data/large/train/labels.txt')

val_sentences = load_data('data/large/val/sentences.txt')
val_labels = load_data('data/large/val/labels.txt')

test_sentences = load_data('data/large/test/sentences.txt')
test_labels = load_data('data/large/test/labels.txt')

In [5]:
print(train_sentences[0])
print(train_labels[0])

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O


## Vectorize sentences

In [6]:
vec = tf.keras.layers.TextVectorization(standardize=None)
vec.adapt(['hello world this is a message'])
print(vec.get_vocabulary())
print(vec('hello there world'))

['', '[UNK]', 'world', 'this', 'message', 'is', 'hello', 'a']
tf.Tensor([6 1 2], shape=(3,), dtype=int64)


In [7]:
def get_sentence_vectorizer(sentences):
    vec = tf.keras.layers.TextVectorization(standardize=None)
    vec.adapt(sentences)

    return vec, vec.get_vocabulary()

sentence_vectorizer, vocab = get_sentence_vectorizer(sentences=train_sentences)

## Label encoding

In [8]:
train_labels

array(['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O',
       'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O',
       'O O O O O O O O O O O B-geo I-geo O', ...,
       'B-per I-per O O O B-tim O O O O O O O O O O',
       'B-gpe O B-per I-per O O O O O B-org I-org I-org O O O O',
       'O O O O O O B-geo O O O O O O O O O O O O O O O O'], dtype='<U287')

In [9]:
def get_tags(labels):
    all_tags = set()

    for label in labels:
        tags = label.split(' ')
        all_tags = all_tags.union(tags)
    
    all_tags = sorted(list(all_tags))
    return all_tags

In [10]:
get_tags(train_labels)

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [11]:
def make_tag_map(tags):
    tag_map = {}
    for i, tag in enumerate(tags):
        tag_map[tag] = i
    return tag_map

tag_map = make_tag_map(get_tags(train_labels))
tag_map

{'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'I-art': 8,
 'I-eve': 9,
 'I-geo': 10,
 'I-gpe': 11,
 'I-nat': 12,
 'I-org': 13,
 'I-per': 14,
 'I-tim': 15,
 'O': 16}

## Pad labels

In [12]:
def label_vectorizer(labels, tag_map):
    label_ids = []
    for item in labels:
        label_ids.append(list(map(lambda tag: tag_map[tag], item.split(' '))))

    label_ids = tf.keras.utils.pad_sequences(label_ids, padding='post', value=-1)

    return label_ids

In [13]:
print(f"Sentence: {train_sentences[0:2]}")
print(f"Labels: {train_labels[0:2]}")

vec_labels = label_vectorizer(train_labels[0:2], tag_map)
print(f'[0] last 10 tokens: {vec_labels[0][-10:]}')
print(f'[1] last 10 tokens: {vec_labels[1][-10:]}')
# print(len(vec_labels[1]))

Sentence: ['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'
 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "']
Labels: ['O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O'
 'O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O']
[0] last 10 tokens: [16 16 16 16 -1 -1 -1 -1 -1 -1]
[1] last 10 tokens: [16 16 16 16 16 16 16 16 16 16]


# Build dataset

In [14]:
def generate_dataset(sentences, labels, sentence_vectorizer, tag_map):
    sentence_ids = sentence_vectorizer(sentences)
    labels_ids = label_vectorizer(labels, tag_map = tag_map)
    dataset = tf.data.Dataset.from_tensor_slices((sentence_ids, labels_ids))

    return dataset

In [15]:
train_dataset = generate_dataset(train_sentences, train_labels, sentence_vectorizer, tag_map)
val_dataset = generate_dataset(val_sentences, val_labels,  sentence_vectorizer, tag_map)
test_dataset = generate_dataset(test_sentences, test_labels,  sentence_vectorizer, tag_map)

# Model

In [16]:
def NER(len_tags, vocab_size, embedding_dim=50):
    model = tf.keras.Sequential(name = 'sequential') 

    # add one to vocab_size if mask_zero = True
    model.add(tf.keras.layers.Embedding(vocab_size + 1, embedding_dim, mask_zero=True))

    model.add(tf.keras.layers.LSTM(units=embedding_dim, return_sequences=True))
    
    model.add(tf.keras.layers.Dense(units=len_tags, activation=tf.nn.log_softmax))

    return model

Built-in loss functions do not provide options to ignore mask values. Our masked loss function allows as to do just that.

In [17]:
def masked_loss(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, ignore_class=-1)
    loss = loss_fn(y_true, y_pred)
    
    return  loss 

In [18]:
def masked_accuracy(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32) 

    # Create mask for non-masked values
    mask = tf.not_equal(y_true, -1)
    mask = tf.cast(mask, tf.float32) 

    y_pred_class = tf.argmax(y_pred, axis=-1)
    y_pred_class = tf.cast(y_pred_class, tf.float32) 

    matches_true_pred  = tf.equal(y_true, y_pred_class)
    matches_true_pred = tf.cast(matches_true_pred , tf.float32) 

    # apply mask to create predictions that ignore masked values
    matches_true_pred *= mask

    masked_acc = tf.reduce_sum(matches_true_pred) / tf.maximum(tf.reduce_sum(mask), 1)

    return masked_acc

In [19]:
model = NER(len(tag_map), len(vocab))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          1492400   
                                                                 
 lstm (LSTM)                 (None, None, 50)          20200     
                                                                 
 dense (Dense)               (None, None, 17)          867       
                                                                 
Total params: 1,513,467
Trainable params: 1,513,467
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01), 
    loss = masked_loss,
    metrics = [masked_accuracy]
)

In [21]:
BATCH_SIZE = 64
model.fit(
    train_dataset.batch(BATCH_SIZE),
    validation_data=val_dataset.batch(BATCH_SIZE),
    shuffle=True,
    epochs=10
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2106a3f0d90>

## Evaluate

In [22]:
test_ids = sentence_vectorizer(test_sentences)
print(test_ids)

tf.Tensor(
[[1633 2112   22 ...    0    0    0]
 [2086   65    6 ...    0    0    0]
 [  48 1733 5215 ...    0    0    0]
 ...
 [ 400   44   19 ...    0    0    0]
 [ 684   50 2819 ...    0    0    0]
 [ 203   25   41 ...    0    0    0]], shape=(7194, 70), dtype=int64)


In [25]:
test_label_ids = label_vectorizer(test_labels, tag_map)
print(test_label_ids, test_label_ids.shape)

[[ 2 16 16 ... -1 -1 -1]
 [16 16 16 ... -1 -1 -1]
 [16 16 16 ... -1 -1 -1]
 ...
 [ 3 16 16 ... -1 -1 -1]
 [16 16 16 ... -1 -1 -1]
 [16 16 16 ... -1 -1 -1]] (7194, 70)


In [27]:
y_pred = model.predict(test_ids)
print(y_pred, y_pred.shape)

[[[-8.32166862e+00 -8.63365459e+00 -7.35555053e-01 ... -1.18176727e+01
   -9.10229683e+00 -5.75114536e+00]
  [-1.53517513e+01 -1.49933100e+01 -1.52658243e+01 ... -1.01397734e+01
   -1.01792831e+01 -5.33200160e-04]
  [-2.02329636e+01 -1.97180080e+01 -1.34848385e+01 ... -2.18994331e+01
   -9.43583107e+00 -1.37699174e-03]
  ...
  [-2.88843489e+00 -2.92739391e+00 -2.75706005e+00 ... -3.02077365e+00
   -3.13804150e+00 -2.34480286e+00]
  [-2.88843489e+00 -2.92739391e+00 -2.75706005e+00 ... -3.02077365e+00
   -3.13804150e+00 -2.34480286e+00]
  [-2.88843489e+00 -2.92739391e+00 -2.75706005e+00 ... -3.02077365e+00
   -3.13804150e+00 -2.34480286e+00]]

 [[-1.49555387e+01 -1.80311718e+01 -1.82787533e+01 ... -1.14925070e+01
   -8.59307575e+00 -1.45210640e-03]
  [-1.45132151e+01 -1.54445114e+01 -1.43930969e+01 ... -2.19556389e+01
   -1.05537558e+01 -1.53538175e-02]
  [-1.42145729e+01 -1.10201998e+01 -1.05489779e+01 ... -1.84101124e+01
   -7.71041775e+00 -1.46781909e-03]
  ...
  [-2.88843489e+00 -2.9

In [33]:
accu = masked_accuracy(test_label_ids, y_pred)
print(accu)

tf.Tensor(0.95451057, shape=(), dtype=float32)


# Generate Tags

In [36]:
def predict(sentence):
    sentence_vectorized = sentence_vectorizer(sentence)
    sentence_vectorized = tf.expand_dims(sentence_vectorized, axis=0)

    output = model.predict(sentence_vectorized)
    outputs = np.argmax(output, axis=-1)
    outputs = outputs[0] 

    labels = list(tag_map.keys()) 

    pred = [] 
    for tag_idx in outputs:
        pred_label = labels[tag_idx]
        pred.append(pred_label)
    
    return pred

In [48]:
def display_pred(sentence):
    pred = predict(sentence)

    print(pd.DataFrame({ 'Token': sentence.split(' '), 'Tag': pred }, index=None))

In [49]:
display_pred('F1 Grand Prix happens this weekend in Bahrain')

     Token    Tag
0       F1      O
1    Grand  B-geo
2     Prix  I-org
3  happens      O
4     this      O
5  weekend  B-tim
6       in      O
7  Bahrain  B-geo
