<a href="https://colab.research.google.com/github/kk412027247/nlp/blob/main/name_entity_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://gmb.let.rug.nl/releases/gmb-2.2.0.zip
!unzip gmb-2.2.0.zip
!mkdir ner
!pip install tensorflow_addons==0.11.2

In [None]:
import os
data_root = './gmb-2.2.0/data/'

# Walk the corpus tree and keep the full path of every file whose name ends
# in '.tags', in the order os.walk yields them.
fnames = [
  os.path.join(dirpath, name)
  for dirpath, _subdirs, names in os.walk(data_root)
  for name in names
  if name.endswith('.tags')
]

# Peek at the first two matches (displayed when this is the cell's last line).
fnames[:2]



import csv
import collections

# Frequency of each raw NER tag (subcategory still attached) as read from
# column 3 of the .tags files by the conversion loop.
ner_tags = collections.Counter()
# Frequency of each tag emitted by iob_format(), including untouched 'O' tags.
iob_tags = collections.Counter()

def strip_ner_subcat(tag):
  """Return the part of *tag* before its first '-'.

  A tag without a '-' (for example 'O') is returned unchanged.
  """
  main_category, _dash, _subcategory = tag.partition('-')
  return main_category

def iob_format(ners):
  """Prefix each non-'O' tag of one sentence with 'B-' or 'I-'.

  A tag gets 'I-' when the tag immediately before it in *ners* is the same
  tag; otherwise (first position, or a different predecessor) it gets 'B-'.
  'O' tags are passed through unchanged.

  Side effect: every returned tag is also counted in the module-level
  ``iob_tags`` counter.

  Args:
    ners: sequence of tags for a single sentence.

  Returns:
    A new list with the prefixed tags, same length and order as *ners*.
  """
  tagged = []
  previous = object()  # sentinel that compares unequal to every real tag
  for current in ners:
    if current == 'O':
      label = current
    elif current == previous:
      label = 'I-' + current
    else:
      label = 'B-' + current
    tagged.append(label)
    iob_tags[label] += 1
    previous = current
  return tagged

# Convert every discovered .tags file into a one-row-per-sentence CSV under
# ./ner/. Each row holds three space-joined columns: the words (field 0 of
# each token line), the IOB-formatted NER tags (field 3 with its subcategory
# stripped, then passed through iob_format), and field 1 (collected as `pos`).
total_sentences = 0
outfiles = []

# Create the output directory here so the cell does not depend on a separate
# shell step having been run first.
os.makedirs('./ner', exist_ok=True)

for idx, file in enumerate(fnames):
  # Read the whole file up front so the input handle is released before
  # the output file is written.
  with open(file, 'rb') as content:
    data = content.read().decode('utf-8').strip()

  # Sentences are separated by a blank line; tokens by a single newline.
  sentences = data.split('\n\n')
  total_sentences += len(sentences)

  # The index prefix keeps output names unique even when basenames repeat.
  out_path = './ner/' + str(idx) + '-' + os.path.basename(file)

  # newline='' is what the csv module expects (it writes its own line
  # terminators); the explicit encoding keeps the output UTF-8 regardless of
  # the platform default, matching how the input was decoded.
  with open(out_path, 'w', newline='', encoding='utf-8') as outfile:
    outfiles.append(out_path)
    writer = csv.writer(outfile)

    for sentence in sentences:
      words, pos, ner = [], [], []
      for tok in sentence.split('\n'):
        fields = tok.split('\t')
        words.append(fields[0])
        pos.append(fields[1])
        ner_tags[fields[3]] += 1  # count the raw tag, subcategory included
        ner.append(strip_ner_subcat(fields[3]))
      writer.writerow([' '.join(words),
                       ' '.join(iob_format(ner)),
                       ' '.join(pos)])

print('total number of sentences: ', total_sentences)
print(ner_tags)
print(iob_tags)

In [None]:
import glob
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

# Load every converted sentence file into one DataFrame with columns
# 'text', 'label' and 'pos'.
# glob returns paths in arbitrary order; sorting fixes the row order so the
# positional train/test split performed later is reproducible between runs.
files = sorted(glob.glob('./ner/*.tags'))
data_pd = pd.concat(
    [pd.read_csv(f, header=None, names=['text', 'label', 'pos']) for f in files],
    ignore_index=True)
data_pd.info()

import json

# One tokenizer per column. Case is preserved and tokens are split on single
# spaces only; the filters strip just the listed characters so punctuation
# inside tokens and tags survives. Unknown tokens map to '<OOV>'.
text_tok = Tokenizer(filters='[\\]^\t\n', lower=False, split=' ', oov_token='<OOV>')
pos_tok = Tokenizer(filters='\t\n', lower=False, split=' ', oov_token='<OOV>')
ner_tok = Tokenizer(filters='\t\n', lower=False, split=' ', oov_token='<OOV>')

text_tok.fit_on_texts(data_pd['text'])
pos_tok.fit_on_texts(data_pd['pos'])
ner_tok.fit_on_texts(data_pd['label'])

ner_config = ner_tok.get_config()
text_config = text_tok.get_config()

print(ner_config)

# get_config() serialises 'index_word' as a JSON string, so parse it with the
# JSON parser instead of eval(), which would execute arbitrary expressions.
text_vocab = json.loads(text_config['index_word'])
ner_vocab = json.loads(ner_config['index_word'])
print('unique words in vocab:', len(text_vocab))
print('unique worner tags  in vocab:', len(ner_vocab))

# Integer-encode the sentences and their tag sequences.
x_tok = text_tok.texts_to_sequences(data_pd['text'])
y_tok = ner_tok.texts_to_sequences(data_pd['label'])

from tensorflow.keras.preprocessing import sequence
max_len = 50

# Bring the token-id and tag-id sequences to exactly max_len entries each,
# padding at the end ('post'); both use identical settings so they stay aligned.
x_pad, y_pad = (
  sequence.pad_sequences(seqs, padding='post', maxlen=max_len)
  for seqs in (x_tok, y_tok)
)

print(x_pad.shape, y_pad.shape)

# One extra class on top of the tag vocabulary: id 0 is the padding value
# and needs its own one-hot slot.
num_classes = 1 + len(ner_vocab)
Y = tf.keras.utils.to_categorical(y_pad, num_classes=num_classes)
Y.shape

# Hyperparameters for the BiLSTM tagger built below.
# Vocabulary size is the tokenizer's index_word count plus one extra id
# (presumably for the padding id 0 — the Embedding below uses mask_zero=True).
vocab_size = len(text_vocab) + 1
embedding_dim = 64
rnn_units = 100
BATCH_SIZE=90
# Same value as computed for the one-hot labels above: tag vocabulary plus one.
num_classes = len(ner_vocab) + 1

from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense 
# Read as a module-level global by build_model_bilstm (passed to the LSTM layer).
dropout = 0.2

def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size, classes):
  """Build an (uncompiled) embedding -> BiLSTM -> dense -> softmax tagger.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: width of each embedding vector.
    rnn_units: LSTM units per direction; also the width of the intermediate
      ReLU dense layer.
    batch_size: fixed batch dimension baked into the model's input shape.
    classes: size of the per-token softmax output.

  Returns:
    A ``tf.keras.Sequential`` model taking integer token ids of shape
    (batch_size, sequence_length).

  Note:
    The LSTM dropout rate is read from the module-level ``dropout`` variable.
  """
  model = tf.keras.Sequential([
    # mask_zero=True makes id 0 a masked (padding) position downstream.
    Embedding(vocab_size, embedding_dim, mask_zero=True,
              batch_input_shape=[batch_size, None]),
    Bidirectional(LSTM(units=rnn_units,
                       return_sequences=True,
                       dropout=dropout,
                       kernel_initializer=tf.keras.initializers.he_normal())),
    TimeDistributed(Dense(rnn_units, activation='relu')),
    # Use the `classes` argument; the previous code ignored it and silently
    # read the global `num_classes` instead.
    Dense(classes, activation='softmax')
  ])
  return model

# Build and compile the BiLSTM tagger, then train and evaluate it on a
# positional split of the padded data.
model = build_model_bilstm(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE, classes=num_classes)
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

X = x_pad
# Derive the sentence count from the data actually loaded instead of
# hard-coding it, so the 20% split stays correct if the corpus size differs.
total_sentences = len(X)
# Test set size expressed in whole batches: roughly 20% of all batches.
test_size = round(total_sentences/BATCH_SIZE *0.2)
# The first test_size batches are held out; everything after is training data.
X_train = X[BATCH_SIZE*test_size:]
Y_train = Y[BATCH_SIZE*test_size:]
X_test = X[0:BATCH_SIZE*test_size]
Y_test = Y[0:BATCH_SIZE*test_size]

model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=15)

model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)


In [None]:
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.layers import Dropout, Bidirectional
import tensorflow_addons as tfa


class CRFLayer(Layer):
  """Layer that passes per-token scores through and provides a CRF loss.

  The forward pass currently returns its input unchanged; the CRF part lives
  in :meth:`loss`, which computes the negative mean CRF log-likelihood via
  ``tfa.text.crf_log_likelihood``.

  Args:
    label_size: number of labels; sets the shape of the initial transition
      matrix to (label_size, label_size).
    mask_id: label id treated as padding when sequence lengths are derived
      from the labels.
    trans_params: optional transition matrix to start from; when omitted a
      non-trainable variable with uniform random values is created.
    name: layer name.
  """

  def __init__(self, label_size, mask_id=0, trans_params=None, name='crf', **kwargs):
    super(CRFLayer, self).__init__(name=name, **kwargs)
    self.label_size = label_size
    self.mask_id = mask_id
    if trans_params is None:
      self.transition_params = tf.Variable(
          tf.random.uniform(shape=(label_size, label_size)), trainable=False)
    else:
      self.transition_params = trans_params

  def call(self, inputs, seq_lengths, training=None):
    """Return *inputs* unchanged.

    ``seq_lengths`` and ``training`` are accepted for interface stability but
    not used: no decoding step is implemented here, so the result is the same
    in training and inference.
    """
    return inputs

  def loss(self, y_true, y_pred):
    """Return the negative mean CRF log-likelihood of *y_true* given *y_pred*.

    Args:
      y_true: label ids, or one-hot labels (rank > 2), which are reduced to
        ids with argmax first.
      y_pred: per-token scores, converted to a tensor.

    Side effect: stores the transition matrix returned by
    ``crf_log_likelihood`` on ``self.transition_params`` as a non-trainable
    variable.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(self.get_proper_labels(y_true), y_pred.dtype)
    seq_lengths = self.get_seq_lengths(y_true)
    # NOTE(review): no transition matrix is passed in, so the one returned
    # here comes from crf_log_likelihood itself — confirm this is intended.
    log_likelihoods, transition_params = tfa.text.crf_log_likelihood(
        y_pred, y_true, seq_lengths)
    # Keep the returned matrix on the attribute the rest of the class uses
    # (the previous code wrote it to a misspelled `transitionparams`).
    self.transition_params = tf.Variable(transition_params, trainable=False)
    return -tf.reduce_mean(log_likelihoods)

  def get_proper_labels(self, y_true):
    """Return label ids: argmax over the last axis if rank > 2, else as-is."""
    if len(y_true.shape) > 2:
      return tf.argmax(y_true, -1, output_type=tf.int32)
    return y_true

  def get_seq_lengths(self, matrix):
    """Count, per row, the entries of *matrix* that differ from ``mask_id``."""
    mask = tf.not_equal(matrix, self.mask_id)
    return tf.math.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=-1)


class NerModel(tf.keras.Model):
  """Embedding -> bidirectional LSTM -> per-token dense -> CRFLayer.

  Args:
    hidden_num: LSTM units per direction.
    vocab_size: number of rows in the embedding table.
    label_size: number of output scores per token.
    embedding_size: width of each embedding vector.
    name: model name.
  """

  def __init__(self, hidden_num, vocab_size, label_size, embedding_size, name='BilstmCrfModel', **kwargs):
    super(NerModel, self).__init__(name=name, **kwargs)
    self.num_hidden = hidden_num
    self.vocab_size = vocab_size
    self.label_size = label_size
    self.embedding = Embedding(vocab_size, embedding_size, mask_zero=True, name='embedding')
    self.biLSTM = Bidirectional(LSTM(hidden_num, return_sequences=True), name='bilstm')
    # No activation: the dense layer emits raw scores for the CRF loss.
    self.dense = TimeDistributed(tf.keras.layers.Dense(label_size), name='dense')
    self.crf = CRFLayer(self.label_size, name='crf')

  def call(self, text, labels=None, training=None):
    """Run the forward pass and return the CRF layer's output.

    Args:
      text: integer token ids; id 0 is treated as padding.
      labels: accepted but unused.
      training: training flag; falls back to the Keras learning phase when
        None.
    """
    # Number of non-zero token ids per row.
    seq_lengths = tf.math.reduce_sum(
        tf.cast(tf.math.not_equal(text, 0), dtype=tf.int32), axis=-1)
    if training is None:
      training = K.learning_phase()
    # The forward pass must run whether or not `training` was supplied;
    # previously it was nested under the `is None` check and the method
    # returned None for explicit training=True/False.
    inputs = self.embedding(text)
    bilstm = self.biLSTM(inputs)
    logits = self.dense(bilstm)
    outputs = self.crf(logits, seq_lengths, training)
    return outputs


# Hyperparameters, model, optimizer and batched training data for the
# BiLSTM-CRF model.
vocab_size = len(text_vocab) + 1
embedding_dim = 64
rnn_units = 100
BATCH_SIZE = 90
num_classes = len(ner_vocab) + 1
# dynamic=True is forwarded to the Keras Model constructor.
blc_model = NerModel(rnn_units, vocab_size, num_classes, embedding_dim, dynamic=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Derive the sentence count from the loaded data rather than hard-coding it,
# so the 20% split stays correct if the corpus size differs.
total_sentences = len(x_pad)
# Test set size expressed in whole batches: roughly 20% of all batches.
test_size = round(total_sentences / BATCH_SIZE * 0.2)
# The first test_size batches are held out; everything after is training data.
X_train = x_pad[BATCH_SIZE*test_size:]
Y_train = Y[BATCH_SIZE*test_size:]
X_test = x_pad[0:BATCH_SIZE*test_size]
Y_test = Y[0:BATCH_SIZE*test_size]
Y_train_int = tf.cast(Y_train, dtype=tf.int32)
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train_int))
# drop_remainder=True guarantees every batch has exactly BATCH_SIZE rows.
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

# Custom training loop for the BiLSTM-CRF model: for each batch, compute the
# CRF loss under a gradient tape, apply one optimizer step, and report the
# running mean loss every 50 steps.
# Note: loss_metric is never reset, so the printed mean spans all epochs so far.
loss_metric = tf.keras.metrics.Mean()
epochs = 5
for epoch in range(epochs):
  print('Start of epoch %d' % (epoch,))
  for step, (text_batch, labels_batch) in enumerate(train_dataset):
    # Labels arrive one-hot encoded; reduce them to integer ids for the loss.
    labels_max = tf.argmax(labels_batch, -1, output_type=tf.int32)
    # Only the forward pass and loss need to be recorded by the tape.
    with tf.GradientTape() as tape:
      logits = blc_model(text_batch, training=True)
      loss = blc_model.crf.loss(labels_max, logits)
    grads = tape.gradient(loss, blc_model.trainable_weights)
    optimizer.apply_gradients(zip(grads, blc_model.trainable_weights))
    loss_metric(loss)
    if step % 50 == 0:
      print('step %s: mean loss = %s' % (step, loss_metric.result()))
