In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!wget https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz
!wget https://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz

!unzip -q glove.6B.zip
!gzip -d /content/train.txt.gz
!gzip -d /content/test.txt.gz

--2023-08-24 16:52:02--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-08-24 16:52:02--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-08-24 16:52:02--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import os
import pathlib
import numpy as np

import tensorflow as tf
from tensorflow import keras

In [None]:
path_to_glove_file = "/content/glove.6B.100d.txt"

# Maps each GloVe token to its embedding vector (float32 numpy array).
embeddings_index = {}
# Open with an explicit encoding so parsing does not depend on the platform's
# default locale (the vocabulary contains non-ASCII tokens).
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ...": split the token off once and
        # parse the remainder as space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Read both splits fully into memory and cut them into chunks on blank lines.
# `with` closes each file handle as soon as it has been read (the original
# left both handles open for the lifetime of the kernel).
with open('/content/train.txt', encoding='utf-8') as all_text_file:
  all_text_train = all_text_file.read()
all_text_train_splitted = all_text_train.split('\n\n')

with open('/content/test.txt', encoding='utf-8') as all_text_file:
  all_text_test = all_text_file.read()
all_text_test_splitted = all_text_test.split('\n\n')

In [None]:
GLOVE_DIMENSION = 100
# Seed for the out-of-vocabulary vector: with a fixed seed the same vector is
# produced on every run, so features are reproducible across kernel restarts.
UNK_SEED = 42
# Single shared embedding used for every token missing from `embeddings_index`.
UNK_RAND_TOKEN = np.random.default_rng(UNK_SEED).standard_normal(GLOVE_DIMENSION)
# Largest number of rows in any training chunk; used as the padded sequence
# length. Stored as a plain int so it can be used directly in shapes and slices.
MAX_LEN = int(max(len(text_split_now.split('\n')) for text_split_now in all_text_train_splitted))

# Every tag (third space-separated column) seen in the training split, in
# first-seen order. Chunks with two rows or fewer are skipped, which also
# discards the empty chunks produced by splitting on blank lines.
class_names_dictionary_temp = {}
for sentence in all_text_train_splitted :
  split_sentence = sentence.split('\n')
  if len(split_sentence) <= 2 :
    continue
  for row in split_sentence :
    tag = row.split(' ')[2]
    # setdefault only inserts unseen tags; the default is the next free index.
    class_names_dictionary_temp.setdefault(tag, len(class_names_dictionary_temp))

# Separate "X-TYPE" style tags (keeping each TYPE once) from tags without a dash.
tag_name_bio = []
tag_name_other = []
for key in class_names_dictionary_temp :
  key_parts = key.split('-')
  if len(key_parts) > 1 :
    if key_parts[1] not in tag_name_bio :
      tag_name_bio.append(key_parts[1])
  else :
    tag_name_other.append(key)

# Final label map: index 0 is reserved for padding, then a B-/I- pair for each
# chunk type, then the remaining dash-less tags.
class_names_dictionary = {'[PAD]':0}
for tag_now in tag_name_bio :
  class_names_dictionary["B-" + tag_now] = len(class_names_dictionary)
  class_names_dictionary["I-" + tag_now] = len(class_names_dictionary)
for tag_now in tag_name_other :
  class_names_dictionary[tag_now] = len(class_names_dictionary)

# Inverse lookup: class index -> tag name.
class_names_dictionary_reversed = {index_now: key_now for key_now, index_now in class_names_dictionary.items()}

In [None]:
def preprocess_data(splitted_text_data) :
  """Convert sentence chunks into padded embedding / one-hot label arrays.

  Each chunk is a newline-separated list of rows whose space-separated columns
  are read as: column 0 = token, column 2 = tag. Chunks with two rows or fewer
  are skipped.

  Tokens are lower-cased, the bracket placeholders ``-LRB-`` / ``-RRB-`` are
  mapped to ``(`` / ``)``, and each token is looked up in ``embeddings_index``
  (falling back to ``UNK_RAND_TOKEN``). Sequences are zero-padded to
  ``MAX_LEN`` rows; padded positions are labelled with class 0. Chunks longer
  than ``MAX_LEN`` are truncated to ``MAX_LEN`` rows instead of failing while
  padding (the returned raw text still contains every token).

  Args:
    splitted_text_data: iterable of chunk strings.

  Returns:
    samples: array of shape (n_sentences, MAX_LEN, embedding_dim).
    labels: one-hot array of shape (n_sentences, MAX_LEN, n_classes).
    all_raw_text: list with the normalised tokens of each kept chunk joined
      by single spaces.
  """
  # Tokens are lower-cased before any comparison, so the placeholders have to
  # be matched in lower case (comparing against '-LRB-' could never succeed).
  bracket_tokens = {'-lrb-': '(', '-rrb-': ')'}
  num_classes = len(class_names_dictionary)

  samples = []
  labels = []
  all_raw_text = []

  for sentence in splitted_text_data :
    split_sentence = sentence.split('\n')
    if len(split_sentence) <= 2 :
      continue

    words = []
    x_now = []
    y_now = []
    for row in split_sentence :
      col_now = row.split(' ')
      word = col_now[0].lower()
      word = bracket_tokens.get(word, word)

      words.append(word)
      x_now.append(embeddings_index.get(word, UNK_RAND_TOKEN))
      y_now.append(class_names_dictionary[col_now[2]])

    # MAX_LEN may come from a different split than the one being processed:
    # cut over-long sentences so the padding below never goes negative.
    x_now = np.stack(x_now[:MAX_LEN])
    y_now = y_now[:MAX_LEN]
    pad_len = int(MAX_LEN) - len(y_now)

    # Padded positions get class 0, then every position is one-hot encoded.
    y_now = np.array(y_now + [0]*pad_len)
    y_now_one_hot = np.zeros((MAX_LEN, num_classes))
    y_now_one_hot[np.arange(MAX_LEN), y_now] = 1

    # Zero rows are appended after the real tokens.
    x_now = np.pad(x_now, ((0, pad_len), (0, 0)), mode="constant")

    all_raw_text.append(" ".join(words).strip())
    samples.append(x_now)
    labels.append(y_now_one_hot)

  samples = np.stack(samples)
  labels = np.stack(labels)

  return samples, labels, all_raw_text

In [None]:
class NERModel(keras.Model):
  """Sequence tagger: two stacked bidirectional LSTMs and a per-timestep softmax."""

  def __init__(self, num_tags, embed_dim=100, hidden_dim=128):
    """Create the layers.

    Args:
      num_tags: number of output classes of the final dense layer.
      embed_dim: accepted for interface compatibility; not used by the layers.
      hidden_dim: each LSTM direction gets ``hidden_dim // 2`` units.
    """
    super().__init__()
    units_per_direction = hidden_dim//2

    # Both recurrent layers return the full sequence so that every timestep
    # can be classified.
    self.bidirectional_1 = keras.layers.Bidirectional(
        keras.layers.LSTM(units_per_direction, return_sequences=True)
    )
    self.bidirectional_2 = keras.layers.Bidirectional(
        keras.layers.LSTM(units_per_direction, return_sequences=True)
    )

    self.dense1 = keras.layers.Dense(num_tags)
    self.softmax = tf.keras.layers.Softmax(axis = -1)

  def call(self, inputs, training=False):
    """Return per-timestep class probabilities for `inputs`.

    `training` is accepted but not forwarded to the layers.
    """
    hidden = inputs
    for recurrent_layer in (self.bidirectional_1, self.bidirectional_2):
      hidden = recurrent_layer(hidden)

    logits = self.dense1(hidden)
    return self.softmax(logits)

  def train_step(self, data):
    """Run one optimisation step on an ``(x, y)`` batch and return the metrics."""
    features, targets = data

    # Forward pass recorded on the tape so the loss can be differentiated.
    with tf.GradientTape() as tape:
      predictions = self(features, training=True)
      loss_value = self.compute_loss(y=targets, y_pred=predictions)

    weights = self.trainable_variables
    grads = tape.gradient(loss_value, weights)
    self.optimizer.apply_gradients(zip(grads, weights))

    # The metric named "loss" tracks the loss value itself; every other metric
    # is updated from targets and predictions.
    for tracked in self.metrics:
      if tracked.name != "loss":
        tracked.update_state(targets, predictions)
      else:
        tracked.update_state(loss_value)

    return {tracked.name: tracked.result() for tracked in self.metrics}


In [None]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
  """Categorical cross-entropy averaged over non-padding timesteps only.

  Expects one-hot `y_true` and probability `y_pred` with three axes, where
  class index 0 is the padding class: a timestep whose `y_true[..., 0]` is 1
  is treated as padding and contributes nothing to the loss.
  """

  def __init__(self, name="custom_ner_loss"):
    super().__init__(name=name)

  def call(self, y_true, y_pred):
    """Return the summed cross-entropy divided by the number of real tokens."""
    # Keep probabilities away from 0: log(0) is -inf and 0 * -inf is NaN,
    # which would poison the whole batch as soon as one softmax output
    # underflows.
    y_pred = tf.clip_by_value(y_pred, keras.backend.epsilon(), 1.0)

    loss = -y_true*tf.math.log(y_pred)
    # 1 where the padding indicator is 0 (a real token), 0 on padding; the
    # trailing axis of size 1 broadcasts over the class axis.
    mask = tf.cast((y_true[:,:,0:1] == 0), dtype=loss.dtype)
    loss = loss * mask
    # divide_no_nan yields 0 instead of NaN when the batch has no real token.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))


loss_function = CustomNonPaddingTokenLoss()


In [None]:
# Vectorise both splits: padded embedding arrays, one-hot label arrays and the
# normalised raw sentences (used later to demo predictions).
samples_train, labels_train, all_raw_text_train = preprocess_data(all_text_train_splitted)
samples_test, labels_test, all_raw_text_test = preprocess_data(all_text_test_splitted)

In [None]:
# Loss instance passed to compile() further down.
loss_function = CustomNonPaddingTokenLoss()

# One output class per entry of class_names_dictionary (padding class included).
ner_model = NERModel(len(class_names_dictionary))
ner_model.build((None, MAX_LEN,GLOVE_DIMENSION))
# Call the model once on a symbolic input; presumably this is what lets
# summary() report per-layer output shapes for the subclassed model — TODO confirm.
ner_model.call(tf.keras.layers.Input(shape = (MAX_LEN, GLOVE_DIMENSION)))


<KerasTensor: shape=(None, 78, 24) dtype=float32 (created by layer 'softmax')>

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
ner_model.summary()

Model: "ner_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 78, 128)          84480     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 78, 128)          98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 78, 24)            3096      
                                                                 
 softmax (Softmax)           (None, 78, 24)            0         
                                                                 
Total params: 186,392
Trainable params: 186,392
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Adam optimizer with the `loss_function` instance; no extra metrics are tracked.
ner_model.compile(optimizer="adam", loss=loss_function)

In [None]:
# Train for 10 epochs on the training split (no batch_size is passed, so the
# Keras default applies; no validation data is used).
ner_model.fit(samples_train,labels_train, epochs=10)

Epoch 1/10
 39/278 [===>..........................] - ETA: 47s - loss: 1.9364

In [None]:
def preprocess_text(sentence) :
  """Embed a space-separated sentence for inference.

  Tokens are lower-cased, the bracket placeholders ``-LRB-`` / ``-RRB-`` are
  mapped to ``(`` / ``)``, and each token is looked up in ``embeddings_index``
  (falling back to ``UNK_RAND_TOKEN``). No padding is applied.

  Args:
    sentence: string whose tokens are separated by single spaces.

  Returns:
    Array of shape (n_tokens, embedding_dim).
  """
  # Tokens are lower-cased before any comparison, so the placeholders have to
  # be matched in lower case (comparing against '-LRB-' could never succeed).
  bracket_tokens = {'-lrb-': '(', '-rrb-': ')'}

  x_now = []
  for word in sentence.split(' ') :
    word = word.lower()
    word = bracket_tokens.get(word, word)
    x_now.append(embeddings_index.get(word, UNK_RAND_TOKEN))

  return np.stack(x_now)

In [None]:
def get_tag(model,sentence) :
  """Predict one class index per token of `sentence`.

  Args:
    model: callable taking a (1, n_tokens, embedding_dim) array and
      ``training=False`` and returning a tensor with a ``.numpy()`` method
      holding per-class scores on the last axis.
    sentence: space-separated sentence, embedded with `preprocess_text`.

  Returns:
    1-D array of predicted class indices, one per token.
  """
  sentence_check_feature = preprocess_text(sentence)
  # Add a leading batch axis of size 1.
  sentence_check_feature = np.stack([sentence_check_feature])
  # Use the model that was passed in (the original silently ignored the
  # argument and always called the global `ner_model`).
  scores = model(sentence_check_feature,training=False).numpy()
  tag_list = np.argmax(scores,-1)[0]

  return tag_list

In [None]:
def decode_tag(tag_list) :
  """Translate class indices into tag names via `class_names_dictionary_reversed`."""
  decoded = []
  for tag_index in tag_list :
    decoded.append(class_names_dictionary_reversed[tag_index])
  return decoded

In [None]:
# Sanity check on one held-out sentence: compare predicted and reference tags.
idx_check_test = 0
text_now = all_raw_text_test[idx_check_test]

tag_list = get_tag(ner_model,text_now)
decoded_tag = decode_tag(tag_list)

print("text :", text_now)
print("tags :", decoded_tag)
# Reference labels are padded to MAX_LEN, so keep only as many as there are tokens.
print("real tags :", decode_tag(labels_test[idx_check_test].argmax(-1)[0:len(text_now.split(' '))]  ))

text : rockwell international corp. 's tulsa unit said it signed a tentative agreement extending its contract with boeing co. to provide structural parts for boeing 's 747 jetliners .
tags : ['B-NP', 'I-NP', 'I-NP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'I-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'B-NP', 'I-NP', 'I-NP', 'O']
real tags : ['B-NP', 'I-NP', 'I-NP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'I-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'B-NP', 'I-NP', 'I-NP', 'O']


In [None]:
# Tag an arbitrary sentence; tokens (including punctuation) must already be
# separated by single spaces because preprocess_text splits on ' '.
text_now = "My name is Andi ."

tag_list = get_tag(ner_model,text_now)
decoded_tag = decode_tag(tag_list)
print(decoded_tag)

['B-NP', 'I-NP', 'B-VP', 'B-NP', 'O']
