In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
def load_data(file_path):
    """Read a text file and return its lines as a NumPy array of strings.

    Parameters:
    file_path (str): Path of the text file to read.
    Returns:
    numpy.ndarray: One entry per line, with surrounding whitespace
        (including the trailing newline) stripped.
    """
    with open(file_path, 'r') as handle:
        # Iterating the handle yields the same lines as readlines()
        stripped_lines = [raw_line.strip() for raw_line in handle]
    return np.array(stripped_lines)


In [None]:
# Load the raw NER CSV; it is read with ISO-8859-1 encoding.
data = pd.read_csv("/content/ner_dataset.csv", encoding="ISO-8859-1")
# Training sentences and their tag strings: one stripped line of the file per array entry.
train_sents = load_data("/content/sentences.txt")
train_labels = load_data("/content/labels.txt")

In [None]:
# Validation sentences and their tag strings, loaded line by line like the training files.
val_sents = load_data("/content/val_sentences.txt")
val_label = load_data("/content/val_labels.txt")
# Print both arrays for a quick visual check.
print('SENTENCE:', val_sents)
print('SENTENCE LABEL:', val_label)

SENTENCE: ["Russia 's victory put the eight-time Olympic champions into the quarterfinals and also clinched a spot for Sweden ."
 'Slovakia advanced with a win over the United States ( 02-Jan ) on Saturday , leaving one remaining spot from Group-B .'
 'China has announced its sixth human bird flu death .' ...
 'Other sources of income are pearl farming and deep-sea commercial fishing .'
 'The small manufacturing sector primarily processes agricultural products .'
 'The territory benefits substantially from development agreements with France aimed principally at creating new businesses and strengthening social services .']
SENTENCE LABEL: ['B-geo O O O O O O O O O O O O O O O O B-org O'
 'B-geo O O O O O O B-geo I-geo O O O O B-tim O O O O O O B-art O'
 'B-org O O O O O O O O O' ... 'O O O O O O O O O O O O'
 'O O O O O O O O O' 'O O O O O O O O B-geo O O O O O O O O O O O']


In [None]:
# Test sentences and their tag strings, loaded line by line like the training files.
test_sents = load_data("/content/test_sentences.txt")
test_label = load_data("/content/test_labels.txt")
# Print both arrays for a quick visual check.
print('SENTENCE:', test_sents)
print('SENTENCE LABEL:', test_label)

SENTENCE: ['Argentina benefits from rich natural resources , a highly literate population , an export-oriented agricultural sector , and a diversified industrial base .'
 "Although one of the world 's wealthiest countries 100 years ago , Argentina suffered during most of the 20th century from recurring economic crises , persistent fiscal and current account deficits , high inflation , mounting external debt , and capital flight ."
 "A severe depression , growing public and external indebtedness , and a bank run culminated in 2001 in the most serious economic , social , and political crisis in the country 's turbulent history ."
 ...
 "Indian officials said no one was injured in Saturday 's incident but that two of the rockets landed near a border security outpost ."
 'Two more landed in fields belonging to a nearby village .'
 'They say not all of the rockets exploded upon impact .']
SENTENCE LABEL: ['B-geo O O O O O O O O O O O O O O O O O O O O O O'
 'O O O O O O O O O O O O B-geo O 

In [None]:
# Show the built-in documentation of the TextVectorization layer used below.
help(tf.keras.layers.TextVectorization)

In [None]:

# Find the longest training sentence, measured in whitespace-separated tokens.
max_len = 0
# The context manager closes the file handle as soon as the scan finishes
# (a bare open() in the for statement would leave it open).
with open("/content/sentences.txt", "r") as sentences_file:
  for sentence in sentences_file:
    sentence = sentence.strip()
    words = sentence.split()
    max_len = max(max_len, len(words))

print("Maximum sentence length (in words):", max_len)


Maximum sentence length (in words): 104


In [None]:
# GRADED FUNCTION: get_sentence_vectorizer
def get_sentence_vectorizer(sentences):
    """Create a TextVectorization layer adapted to `sentences`.

    Parameters:
    sentences: Collection of sentence strings handed to the layer's `adapt` method.
    Returns:
    tuple: The adapted TextVectorization layer and the vocabulary
        returned by its `get_vocabulary()` method.
    """
    tf.keras.utils.set_random_seed(33) ## Do not change this line.
    ### START CODE HERE ###
    # standardize=None is passed so the layer is configured without a standardization step
    text_vectorizer = tf.keras.layers.TextVectorization(standardize=None)
    # Fit the layer on the given sentences
    text_vectorizer.adapt(sentences)
    ### END CODE HERE ###

    # Hand back the fitted layer together with the vocabulary it exposes
    return text_vectorizer, text_vectorizer.get_vocabulary()

In [None]:
# Sanity check: adapt a vectorizer on the first 1000 training sentences only.
test_vectorizer, test_vocab = get_sentence_vectorizer(train_sents[:1000])
print(f"Test vocab size: {len(test_vocab)}")

# Vectorize a sample sentence and print the resulting ids.
sentence = "I like learning new NLP models !"
sentence_vectorized = test_vectorizer(sentence)
print(f"Sentence: {sentence}\nSentence vectorized: {sentence_vectorized}")

Test vocab size: 4650
Sentence: I like learning new NLP models !
Sentence vectorized: [ 296  314    1   59    1    1 4649]


In [None]:
# Adapt the vectorizer on the full training set; this vectorizer and vocab are reused by later cells.
sentence_vectorizer, vocab = get_sentence_vectorizer(train_sents)
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 29847


In [None]:
def get_tags(labels):
    """Return the sorted list of distinct tags appearing in `labels`.

    Parameters:
    labels (iterable of str): Strings of tags separated by single spaces.
    Returns:
    list of str: Every distinct tag, in ascending sorted order.
    """
    # Collect each tag once, splitting every label string on single spaces
    unique_tags = {tag for label_line in labels for tag in label_line.split(" ")}
    return sorted(unique_tags)

In [None]:
# Sorted list of every distinct tag that occurs in the training labels.
tags = get_tags(train_labels)
print(tags)

['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [None]:
def make_tag_map(tags):
    """Map each tag to its position in `tags`.

    Parameters:
    tags (iterable of str): Tags in the order that defines their IDs.
    Returns:
    dict: tag -> zero-based index; a repeated tag keeps its last index.
    """
    return {tag: position for position, tag in enumerate(tags)}

In [None]:
# Assign each tag the integer ID given by its position in the sorted tag list.
tag_map = make_tag_map(tags)
print(tag_map)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}


In [None]:
# GRADED FUNCTION: label_vectorizer
def label_vectorizer(labels, tag_map):
    """
    Turn space-separated tag strings into one padded array of tag IDs.

    Parameters:
    labels (list of str): One string of space-separated tags per sentence.
    tag_map (dict): Dictionary mapping each tag to its integer ID.
    Returns:
    label_ids (numpy.ndarray): Tag IDs, padded at the end of each row with -1.
    Raises:
    KeyError: If a tag is not a key of tag_map.
    """
    ### START CODE HERE ###

    # Split every label string on single blank spaces and look up each tag's ID.
    # Rows have different lengths, so they stay a list of lists until padding.
    id_rows = [
        [tag_map[tag] for tag in label_string.split(" ")]
        for label_string in labels
    ]

    # Pad after the real IDs, using -1 as the padding value
    label_ids = tf.keras.utils.pad_sequences(id_rows, padding='post', value=-1)

    ### END CODE HERE ###

    return label_ids

In [None]:
# Show one training sentence, its tag string, and the IDs label_vectorizer produces for it.
print(f"Sentence: {train_sents[5]}")
print(f"Labels: {train_labels[5]}")
print(f"Vectorized labels: {label_vectorizer([train_labels[5]], tag_map)}")

Sentence: The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country .
Labels: O O O O O B-gpe O O O O B-geo O O O O O O O B-gpe O O O O O
Vectorized labels: [[16 16 16 16 16  3 16 16 16 16  2 16 16 16 16 16 16 16  3 16 16 16 16 16]]


In [None]:
def generate_dataset(sentences, labels, sentence_vectorizer, tag_map):
    """Pair vectorized sentences with vectorized labels in a tf.data.Dataset.

    Parameters:
    sentences: Sentence strings passed to `sentence_vectorizer`.
    labels: Tag strings passed to `label_vectorizer`.
    sentence_vectorizer: Callable that converts the sentences to token IDs.
    tag_map (dict): Dictionary mapping tags to IDs.
    Returns:
    tf.data.Dataset: Slices of (sentence IDs, label IDs) taken along the first axis.
    """
    token_ids = sentence_vectorizer(sentences)
    tag_ids = label_vectorizer(labels, tag_map=tag_map)
    return tf.data.Dataset.from_tensor_slices((token_ids, tag_ids))

In [None]:
# Build the three datasets with the same vectorizer (adapted on the training sentences) and tag map.
train_dataset = generate_dataset(train_sents,train_labels, sentence_vectorizer, tag_map)
val_dataset = generate_dataset(val_sents,val_label,  sentence_vectorizer, tag_map)
test_dataset = generate_dataset(test_sents, test_label,  sentence_vectorizer, tag_map)

In [None]:
# Exploring information about the training data
print(f'The number of outputs is {len(tags)}')
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words in the training set: {g_vocab_size}")
print('The training size is', len(train_dataset))
print('The validation size is', len(val_dataset))
# Each next(iter(...)) call creates a fresh iterator, so both prints show the first (sentence, label) pair.
print('An example of the first sentence is\n\t', next(iter(train_dataset))[0].numpy())
print('An example of its corresponding label is\n\t', next(iter(train_dataset))[1].numpy())

The number of outputs is 17
Num of vocabulary words in the training set: 29847
The training size is 33570
The validation size is 7194
An example of the first sentence is
	 [1046    6 1121   18 1832  232  543    7  528    2  158    5   60    9
  648    2  922    6  192   87   22   16   54    3    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
An example of its corresponding label is
	 [16 16 16 16 16 16  2 16 16 16 16 16  2 16 16 16 16 16  3 16 16 16 16 16
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 

In [None]:

def NER(len_tags, vocab_size, embedding_dim = 50):
    """Build the sequential tagging model: Embedding -> LSTM -> Dense.

    Parameters:
    len_tags (int): Number of NER tags (size of the final Dense layer).
    vocab_size (int): Vocabulary size; the Embedding layer uses vocab_size + 1 rows.
    embedding_dim (int): Embedding width, also used as the number of LSTM units.
    Returns:
    tf.keras.Sequential: The assembled model (it is not compiled here).
    """
    # Embedding with mask_zero=True so id 0 is treated as padding
    token_embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size+1,
        output_dim=embedding_dim,
        mask_zero=True,
    )
    # return_sequences=True keeps one output per time step
    recurrent = tf.keras.layers.LSTM(units=embedding_dim, return_sequences=True)
    # One log-softmax score per tag at every time step
    tag_scores = tf.keras.layers.Dense(len_tags, activation=tf.nn.log_softmax)

    model = tf.keras.Sequential(name='sequential')
    for layer in (token_embedding, recurrent, tag_scores):
        model.add(layer)

    return model





In [None]:

def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy that ignores positions labelled -1.

    Parameters:
    y_true: Integer class labels; -1 marks positions to ignore.
    y_pred: Per-class scores, passed to the loss with from_logits=True.
    Returns:
    The value produced by calling the SparseCategoricalCrossentropy loss object.
    """
    labels = tf.convert_to_tensor(y_true)
    scores = tf.convert_to_tensor(y_pred)
    # ignore_class=-1 excludes the padded label positions from the loss
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, ignore_class=-1)
    return crossentropy(labels, scores)

In [None]:
# Small hand-made example: four labels and their three-class scores, fed to masked_loss.
true_labels = [0,1,2,0]
predicted_logits = [[-2.3,-0.51,-1.20] , [-1.61,-0.36,-2.30], [-2.30, -0.69,-0.92], [-0.92,-0.92,-1.61]]
print(masked_loss(true_labels, predicted_logits))

tf.Tensor(1.1242604, shape=(), dtype=float32)


In [None]:

def masked_accuracy(y_true, y_pred):
    """Accuracy over the positions whose label is not -1.

    Parameters:
    y_true: Class labels; -1 marks padded positions that must be ignored.
    y_pred: Per-class scores whose last axis indexes the classes.
    Returns:
    Scalar float32 tensor: matches on valid positions divided by the number of
        valid positions, or 0.0 when there are no valid positions at all.
    """
    y_true = tf.cast(y_true, tf.float32)
    # 1.0 where the label is valid (not -1), 0.0 on padding
    mask = tf.cast(y_true != -1, tf.float32)
    # Predicted class = index of the highest score along the class axis
    y_pred_class = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
    # 1.0 where prediction equals label; zeroed on padded positions
    matches_true_pred = tf.cast(tf.equal(x=y_true, y=y_pred_class), tf.float32)
    matches_true_pred *= mask
    # mask only holds 0.0/1.0, so its sum is the number of valid positions.
    # divide_no_nan returns 0 instead of NaN when every position is padding.
    masked_acc = tf.math.divide_no_nan(tf.reduce_sum(matches_true_pred), tf.reduce_sum(mask))

    return masked_acc

In [None]:
# Small hand-made example: four labels and their three-class scores, fed to masked_accuracy.
true_labels = [0,1,2,0]
predicted_logits = [[0.1,0.6,0.3] , [0.2,0.7,0.1], [0.1, 0.5,0.4], [0.4,0.4,0.2]]
print(masked_accuracy(true_labels, predicted_logits))

tf.Tensor(0.5, shape=(), dtype=float32)


In [None]:
# Build the model with one output per tag and the vocabulary size of the adapted vectorizer.
model = NER(len(tag_map), len(vocab))
model.summary()

In [None]:
x = tf.expand_dims(np.array([545, 467, 896]), axis = 0) # Expanding dims is needed to pass it to the model,
                                                        # since it expects batches and not single prediction arrays

# Same three ids followed by three 0 ids; the model's Embedding layer is built with mask_zero=True.
x_padded = tf.expand_dims(np.array([545, 467, 896, 0, 0, 0]), axis = 0)

In [None]:
# Display the unpadded sample batch.
x

<tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[545, 467, 896]])>

In [None]:
# Display the padded sample batch.
x_padded

<tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[545, 467, 896,   0,   0,   0]])>

In [None]:
# Run the model on the unpadded and the padded batch and compare the output shapes.
pred_x = model(x)
pred_x_padded = model(x_padded)
print(f'x shape: {pred_x.shape}\nx_padded shape: {pred_x_padded.shape}')

x shape: (1, 3, 17)
x_padded shape: (1, 6, 17)


In [None]:
# The first three time steps of the padded prediction must equal the unpadded prediction,
# i.e. trailing padding must not change the outputs for the real tokens.
# (Comparing pred_x with a slice of itself would be True no matter what the model does.)
np.allclose(pred_x, pred_x_padded[:, :3, :])

True

In [None]:
# Labels for the unpadded batch, and the same labels followed by -1 for the padded positions.
y_true = tf.expand_dims([16, 6, 12], axis = 0)
y_true_padded = tf.expand_dims([16,6,12,-1,-1,-1], axis = 0) # Remember you mapped the padded values to -1 in the labels
# Both the loss and the accuracy should give the same value with and without the padded positions.
print(f"masked_loss is the same: {np.allclose(masked_loss(y_true,pred_x), masked_loss(y_true_padded,pred_x_padded))}")
print(f"masked_accuracy is the same: {np.allclose(masked_accuracy(y_true,pred_x), masked_accuracy(y_true_padded,pred_x_padded))}")

masked_loss is the same: True
masked_accuracy is the same: True


In [None]:
# Compile with Adam (learning rate 0.01), the masked loss, and the masked accuracy metric.
model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
              loss = masked_loss,
               metrics = [masked_accuracy])

In [None]:
tf.keras.utils.set_random_seed(33)

BATCH_SIZE = 64

# Keras ignores fit(shuffle=...) when the input is a tf.data.Dataset, so the shuffling
# has to happen in the input pipeline. The buffer spans the whole training set and the
# order is redrawn every epoch; the seed keeps runs reproducible.
shuffled_train = train_dataset.shuffle(buffer_size=len(train_dataset),
                                       seed=33,
                                       reshuffle_each_iteration=True)

model.fit(shuffled_train.batch(BATCH_SIZE),
          validation_data = val_dataset.batch(BATCH_SIZE),
          epochs = 10)

Epoch 1/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 124ms/step - loss: 0.4594 - masked_accuracy: 0.8952 - val_loss: 0.1393 - val_masked_accuracy: 0.9573
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 129ms/step - loss: 0.1299 - masked_accuracy: 0.9612 - val_loss: 0.1359 - val_masked_accuracy: 0.9584
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 126ms/step - loss: 0.1025 - masked_accuracy: 0.9679 - val_loss: 0.1387 - val_masked_accuracy: 0.9580
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 124ms/step - loss: 0.0886 - masked_accuracy: 0.9714 - val_loss: 0.1452 - val_masked_accuracy: 0.9575
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 121ms/step - loss: 0.0801 - masked_accuracy: 0.9736 - val_loss: 0.1536 - val_masked_accuracy: 0.9573
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 122ms/step - loss: 0.0735 -

<keras.src.callbacks.history.History at 0x7effc3f9d600>

In [None]:
# Convert the sentences into ids
test_sentences_id = sentence_vectorizer(test_sents)
# Convert the labels into token ids
test_labels_id = label_vectorizer(test_label,tag_map)
# Rename to prettify next function call
y_true = test_labels_id
# Predict on the whole test set at once, then score with the padding-aware accuracy.
y_pred = model.predict(test_sentences_id)
print(f"The model's accuracy in test set is: {masked_accuracy(y_true,y_pred).numpy():.4f}")

[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step
The model's accuracy in test set is: 0.9530


In [None]:
# GRADED FUNCTION: predict
def predict(sentence, model, sentence_vectorizer, tag_map):
    """Predict one NER tag per token of a single sentence.

    Parameters:
    sentence (str): The sentence to tag.
    model: Model whose `predict` returns per-class scores on the last axis.
    sentence_vectorizer: Callable that converts the sentence to token IDs.
    tag_map (dict): Dictionary mapping tags to IDs.
    Returns:
    pred (list of str): The predicted tag for each position of the vectorized sentence.
    Raises:
    KeyError: If the model predicts an ID that is not a value of tag_map.
    """
    # Convert the sentence into ids
    sentence_vectorized = sentence_vectorizer(sentence)
    # Expand its dimension to make it appropriate to pass to the model
    sentence_vectorized = tf.expand_dims(sentence_vectorized, axis = 0)
    # Get the model output
    output = model.predict(sentence_vectorized)
    # Highest-scoring class per token; [0] drops the batch axis added above
    outputs = np.argmax(output, axis = -1)[0]
    # Invert the mapping explicitly instead of relying on the dict's key order
    # happening to match the ID values.
    id_to_tag = {tag_id: tag for tag, tag_id in tag_map.items()}
    # Translate every predicted ID back to its tag
    pred = [id_to_tag[int(tag_id)] for tag_id in outputs]

    return pred

In [None]:
sentence = "Keshav Singh, A Artificial Inteligence enthusiast , said in an interview on Sunday morning that the model was working to prepare for the possibility of a second wave of optimisation, though he said it wouldn ’t necessarily come"
predictions = predict(sentence, model, sentence_vectorizer, tag_map)
# Split on any whitespace, like the vectorizer's default tokenization, so tokens and tags stay aligned.
# Descriptive loop names also avoid overwriting the notebook-level tensor `x` defined in an earlier cell.
for token, tag in zip(sentence.split(), predictions):
    if tag != 'O':
        print(token, tag)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step
Singh, I-per
A I-art
Artificial B-tim
Inteligence I-art
enthusiast I-per
Sunday B-tim
morning I-tim


In [None]:

import io

# Weights of the model's first layer (the Embedding layer) and the vectorizer's vocabulary.
weights = model.layers[0].get_weights()[0]
vocab = sentence_vectorizer.get_vocabulary()

# Write one embedding vector per line to vecs.tsv and the matching word to meta.tsv.
# The context manager closes both files even if a write raises part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(value) for value in vec]) + "\n")
    out_m.write(word + "\n")

# Offer the files for download when running in Google Colab; do nothing elsewhere.
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>