##BiLSTM Approach on Chinese NER

In [106]:
# if this cell prints "Running on cpu", you must switch runtime environments
# go to Runtime > Change runtime type > Hardware accelerator > GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on {}".format(device))

Running on cuda


In [107]:
#We install seqeval to evaluate our model and get a f1 score. 
# !pip install seqeval[gpu]

In [108]:
import numpy as np
import pandas as pd
import math
import matplotlib
import sys, re
from os import listdir
from os.path import isfile, join

import torch
import numpy as np
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence

from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

###Character Level Embedding

We ontained character level embedding from this site: https://github.com/jiesutd/LatticeLSTM, which is used by this paper: https://www.aclweb.org/anthology/P18-1144/

In [109]:
def read_embeddings(filename, vocab_size=10000):
    # get the embedding size from the first embedding
    with open(filename, encoding="utf-8") as file:
        word_embedding_dim = len(file.readline().split(" ")) - 2

    vocab = {}

    embeddings = np.zeros((vocab_size, word_embedding_dim))

    with open(filename, encoding="utf-8") as file:
        for idx, line in enumerate(file):
            if idx >= vocab_size:
                break
            cols = line.rstrip().split(" ")
            val = np.array(cols[1:])
            char = cols[0]
            embeddings[idx] = val
            vocab[char] = idx

    return torch.FloatTensor(embeddings), vocab

In [110]:
# this loads the 10,000 most common char embeddings
vocab_size = 10000
embeddings, vocab = read_embeddings('gigaword_chn.all.a2b.uni.ite50.txt', vocab_size)

###Data Preparation

In [111]:
class Dataset():
    def __init__(self, filename):
        
        self.chars, self.tags = self.read_data(filename)
        self.chars_chunk = []
        self.tags_chunk = []


    def read_data(self, book):
        """
        Utility function, loads text file into a list of character and tag strings

        Returns:
        - chars:    a list of characters
        - tags:         a list of tags for each character, where tags[i] contains
                        a list of tags (strings) that correspond to the chars in 
                        chars[i]
        """
        chars = []
        tags = []

        current_char = []
        current_tags = []

        book = book.split("\n")
        
        for line in book:
            if line == "":
                # print("!!!!!")
                if len(current_char) != 0:
                    # print(current_char)
                    # print(current_tags)
                    chars.append(current_char)
                    tags.append(current_tags)
                    
                current_char = []
                current_tags = []
            else:
                columns = line.rstrip().split('\t')
                char = columns[0].lower()
                tag = columns[1]
                
                current_char.append(char)
                current_tags.append(tag)
    
        return chars, tags
    
        

    def get_batches(self, batch_size, vocab, tagset, start, end, omit):
        """

        Batches the data into mini-batches of size `batch_size`

        Arguments:
        - batch_size:       the desired output batch size
        - vocab:            a dictionary mapping char strings to indices
        - tagset:           a dictionary mapping tag strings to indices

        Outputs:

        if is_labeled=True:
        - batched_char_indices:     a list of matrices of dimension (batch_size x max_seq_len)
        - batched_tag_indices:      a list of matrices of dimension (batch_size x max_seq_len)
        - batched_lengths:          a list of arrays of length (batch_size)

        batched_char_indices[b] is a (batch_size x max_seq_len) matrix of integers, 
        containing index representations for characters in the b-th batch in the document. 

        batched_lengths[b] is a vector of length (batch_size). batched_lengths[b][i] 
        contains the original char length *before* padding for the i-th char in the currrent batch. 

        """
        PAD_INDEX = 0             # reserved for padding words, all chars shorter than max_seq_len should be padded on the right with PAD_INDEX (0).
        UNKNOWN_INDEX = 1         # reserved for unknown words, if a char is not in the vocabulary, it gets mapped to UNKNOWN_INDEX (1).
        IGNORE_TAG_INDEX = -100   # reserved for padding tags, all tag lists shorter than `max_seq_len` are padded with IGNORE_TAG_INDEX (-100).
       
        batched_char_indices = []
        batched_tag_indices = []
        batched_lengths = []
        
        if omit == -1: # for dev sets
            chars = self.chars_chunk[start:end][0]
            tags = self.tags_chunk[start:end][0]
        else: # for training sets
            chars = [self.chars_chunk[i] for i in range(start, end) if i != omit]
            tags = [self.tags_chunk[i] for i in range(start, end) if i != omit]
            
            temp_char = []
            temp_tag = []
            for i in range(1, len(chars)):
                temp_char += chars[i]
                temp_tag += tags[i]
            chars = temp_char
            tags = temp_tag

        for num_batch in range(math.ceil(len(chars) / batch_size)):
            char_list = np.array(chars[num_batch * batch_size : min((num_batch + 1) * batch_size, len(chars))])
            #batched_lengths
            length_array = np.zeros(len(char_list))
            #batched_char_indices
            max_seq_len = len(max(char_list, key=len))
            matrix = np.zeros((min(batch_size, len(char_list)), max_seq_len))
            for i in range(len(char_list)):
                matrix[i] = [vocab[word] if word in vocab else UNKNOWN_INDEX for word in char_list[i]] + [PAD_INDEX for i in range(max_seq_len - len(char_list[i]))]
                length_array[i] = len(char_list[i])
            batched_char_indices.append(matrix)
            batched_lengths.append(length_array)


        #batched_tag_indices
        for num_batch in range(math.ceil(len(tags) / batch_size)):
            tag_list = np.array(tags[num_batch * batch_size : min((num_batch + 1) * batch_size, len(tags))])
            max_seq_len = len(max(tag_list, key=len))
            matrix = np.zeros((min(batch_size, len(tag_list)), max_seq_len))
            for i in range(len(tag_list)):
                matrix[i] = [tagset[word] if word in tagset else UNKNOWN_INDEX for word in tag_list[i]] + [IGNORE_TAG_INDEX for i in range(max_seq_len - len(tag_list[i]))]
            batched_tag_indices.append(matrix)

            
        return batched_char_indices, batched_tag_indices, batched_lengths


In [112]:
def read_tagset(tag_file):
    """
    Utility function, loads tag file into a dictionary from tag string to tag index
    """
    tagset = {}
    with open(tag_file, encoding='utf8') as f:
        for line in f:
            columns = line.rstrip().split('\t')
            tag = columns[0]
            tag_id = int(columns[1])
            tagset[tag] = tag_id

    return tagset

In [113]:
def combine(folder):
    filenames = [f for f in listdir(folder) if isfile(join(folder, f))]
    
    out = ""
    for i in range(len(filenames)):
        with open(folder + '/' + filenames[i], encoding='utf-8') as infile: 
            content = infile.read()
            out = out + content
        out = out + "\n"
    return out

In [114]:
# read the files
# all_books = combine('../data/correct_BIO_output')

# with open('../data/all_books.txt', 'w', encoding='utf-8') as outfile:
#     outfile.write(all_books) 
    
tagset = read_tagset('NER_labels.txt')
with open("all_books.txt" ,encoding='utf-8') as file:
  all_books = file.read()

dataset = Dataset(all_books)

BATCH_SIZE = 25 #for stocastic gradient descent purpose

# train_batch_idx, train_batch_tags, train_batch_lens = train_dataset.get_batches(BATCH_SIZE, vocab, tagset)
# dev_batch_idx, dev_batch_tags, dev_batch_lens = dev_dataset.get_batches(BATCH_SIZE, vocab, tagset)
# test_batch_idx, test_batch_tags, test_batch_lens = test_dataset.get_batches(BATCH_SIZE, vocab, tagset)

In [115]:
print(len(dataset.chars))

8867


### BiLSTM Model

In [116]:
class BiLSTM(nn.Module):
    """
    An LSTM model for sequence labeling

    Initialization Arguments:
    - embeddings:   a matrix of size (vocab_size, emb_dim)
                  containing pretrained embedding weights
    - hidden_dim:   the LSTM's hidden layer size
    - tagset_size:  the number of possible output tags

    """
    def __init__(self, embeddings, hidden_dim, tagset_size, bidirectional_flag = True):
        super().__init__()

        self.hidden_dim = hidden_dim

        if bidirectional_flag:
          self.hidden_dim = hidden_dim // 2
    
        self.num_labels = tagset_size

        vocab_size = len(embeddings)
        embedding_dim = len(embeddings[0])
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.weight = nn.Parameter(embeddings)
        
        # Initialize an LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional = bidirectional_flag, batch_first=True, dropout=0.5, num_layers=3)

        # Initialize a single feedforward layer
        self.feedlayer = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, indices, lengths):
        """
        Runs a batched sequence through the model and returns output logits

        Arguments:
        - indices:  a matrix of size (batch_size x max_seq_len)
                    containing the word indices of sentences in the batch
        - lengths:  a vector of size (batch_size) containing the
                    original lengths of the sequences before padding

        Output:
        - logits:   a matrix of size (batch_size x max_seq_len x num_tags)
                    gives a score to each possible tag for each word
                    in each sentence 
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # cast arrays as PyTorch data types and move to GPU memory
        indices = torch.LongTensor(indices).to(device)
        lengths = torch.LongTensor(lengths).to(device)

        # convert word indices to word embeddings
        embeddings = self.embeddings(indices)

        # pack/pad handles variable length sequence batching
        # see here if you're curious: https://gist.github.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec
        packed_input_embs = pack_padded_sequence(embeddings, lengths, batch_first=True, enforce_sorted=False)
        # run input through LSTM layer
        packed_output, _ = self.lstm(packed_input_embs)
        # unpack sequences into original format
        padded_output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)

        logits = self.feedlayer(padded_output)
        return logits

    def run_training(self, train_content, dev_content, batch_size, vocab, tagset,
                         lr=5e-4, num_epochs=100, eval_every=5):
        """
        Trains the model on the training data with a learning rate of lr
        for num_epochs. Evaluates the model on the dev data eval_every epochs.

        Arguments:
        - train_dataset:  Dataset object containing the training data
        - dev_dataset:    Dataset object containing the dev data
        - batch_size:     batch size for train/dev data
        - vocab:          a dictionary mapping word strings to indices
        - tagset:         a dictionary mapping tag strings to indices
        - lr:             learning rate
        - num_epochs:     number of epochs to train for
        - eval_every:     evaluation is run eval_every epochs
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#         if str(device) == 'cpu':
#             print("Training only supported in GPU environment")
#             return

        # clear unreferenced data/models from GPU memory 
        torch.cuda.empty_cache()
        # move model to GPU memory
        self.to(device)

        # set the optimizer (Adam) and loss function (CrossEnt)
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        loss_function = nn.CrossEntropyLoss(ignore_index=-100)

        # batch training and dev data
        train_batch_idx, train_batch_tags, train_batch_lens = train_content
        dev_batch_idx, dev_batch_tags, dev_batch_lens = dev_content

        print("**** TRAINING *****")
        for i in range(num_epochs):
            # sets the model in train mode
            self.train()

            total_loss = 0
            for b in range(len(train_batch_idx)):
                # compute the logits
                logits = self.forward(train_batch_idx[b], train_batch_lens[b])
                # move labels to GPU memory
                labels = torch.LongTensor(train_batch_tags[b]).to(device)
                # compute the loss with respect to true labels
                loss = loss_function(logits.view(-1, len(tagset)), labels.view(-1))
                total_loss += loss
                # propagate gradients backward
                loss.backward()
                optimizer.step()
                # set model gradients to zero before performing next forward pass
                self.zero_grad()

            # print("Epoch{}".format(i))

            if (i + 1) % eval_every == 0:
                # print("**** EVALUATION *****")
                # sets the model in evaluate mode (no gradients)
                self.eval()
                # compute dev f1 score
                true, pred = self.evaluate(dev_batch_idx, dev_batch_lens, dev_batch_tags, tagset)
                
        return true, pred

                
                
    def evaluate(self, batched_sentences, batched_lengths, batched_labels, tagset):
        """
        Evaluate the model's predictions on the provided dataset. 

        Arguments:
        - batched_sentences:  a list of matrices, each of size (batch_size x max_seq_len),
                              containing the word indices of sentences in the batch
        - batched_lengths:    a list of vectors, each of size (batch_size), containing the
                              original lengths of the sequences before padding
        - batched_labels:     a list of matrices, each of size (batch_size x max_seq_len),
                              containing the tag indices corresponding to sentences in the batch
        - num_tags:           the number of possible output tags

        Output:
        - accuracy:           the model's prediction accuracy
        - all_true_labels:    a flattened list of all true labels
        - all_predictions:    a flattened list of all of the model's corresponding predictions

        """

        all_true_labels = []
        all_predictions = []

        for b in range(len(batched_sentences)):
            logits = self.forward(batched_sentences[b], batched_lengths[b])
            batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()

            batch_size, _ = batched_sentences[b].shape

            for i in range(batch_size):
                tags = batched_labels[b][i]
                preds = batch_predictions[i]
                seq_len = int(batched_lengths[b][i])

                all_true_labels.append(list(tags[:seq_len]))
                all_predictions.append(list(preds[:seq_len]))

        return all_true_labels, all_predictions, 

In [117]:
def tagset_to_bio(arr):
    """
    conver back from tack integer to tag string for evaluation purpose
    """
    reversed_tagset = {v: k for k,v in tagset.items()}
    for i in range(len(arr)):
      for j in range(len(arr[i])):
        arr[i][j] = reversed_tagset[arr[i][j]]



### Model Training & 10-Fold Cross Validation

In [120]:
#Perform K-fold Cross Validation and train the model
np.random.seed(100) 
shuffle = np.random.permutation(range(len(dataset.chars)))

chars = [dataset.chars[i] for i in shuffle]
length = int(len(chars) / 12)
tags = [dataset.tags[i] for i in shuffle]
dataset.tags_chunk = []
dataset.chars_chunk = []


#Divide data into 10 chunks
for k in range(12):
    if k != 11:
        fold_chars = chars[length*k : length*(k + 1)]
        fold_tags = tags[length*k : length*(k + 1)]
        dataset.chars_chunk.append(fold_chars)
        dataset.tags_chunk.append(fold_tags)
    else:
        fold_chars = chars[length*k :]
        fold_tags = tags[length*k :]
        dataset.chars_chunk.append(fold_chars)
        dataset.tags_chunk.append(fold_tags)
        

#Loop 10 times, each time training with different chunks
HIDDEN_SIZE = 128
VALIDATION_FOLDS = 10
accuracy = []
per_class_accuracy = []
models = []
true_array = []
pred_array = []
print("============ K-Fold ============")
for k in range(VALIDATION_FOLDS):
    print("#Fold: {}".format(k))
    train_batch_idx, train_batch_tags, train_batch_lens = dataset.get_batches(BATCH_SIZE, vocab, tagset, 0, 10, k)
    dev_batch_idx, dev_batch_tags, dev_batch_lens = dataset.get_batches(BATCH_SIZE, vocab, tagset, k, k + 1, -1)
    train_content = (train_batch_idx, train_batch_tags, train_batch_lens)
    dev_content = (dev_batch_idx, dev_batch_tags, dev_batch_lens)
    
    # intialize a new BiLSTM model
    model = BiLSTM(embeddings, HIDDEN_SIZE, len(tagset))
    # train the model
    true, pred, = model.run_training(train_content, dev_content, BATCH_SIZE, vocab, tagset,   
                       lr=5e-4, num_epochs=25, eval_every=5)
    models.append(model)
    tagset_to_bio(true)
    tagset_to_bio(pred)

    true_array = np.append(true_array, true)
    pred_array = np.append(pred_array, pred)
    f1 = f1_score(true, pred)
    accuracy.append(f1)
    per_class_accuracy.append(classification_report(true, pred).split("\n"))
    print("f1_score: ", f1)
    print("================================")
    
    
test_batch_idx, test_batch_tags, test_batch_lens = dataset.get_batches(BATCH_SIZE, vocab, tagset, 10, 12, -1)

#Fold: 0
**** TRAINING *****
f1_score:  0.6369593709043251
#Fold: 1
**** TRAINING *****
f1_score:  0.6181369524984577
#Fold: 2
**** TRAINING *****
f1_score:  0.6179987413467589
#Fold: 3
**** TRAINING *****
f1_score:  0.6253041362530414
#Fold: 4
**** TRAINING *****
f1_score:  0.6570048309178744
#Fold: 5
**** TRAINING *****
f1_score:  0.6326662362814718
#Fold: 6
**** TRAINING *****
f1_score:  0.5966016362492133
#Fold: 7
**** TRAINING *****
f1_score:  0.5866477272727273
#Fold: 8
**** TRAINING *****
f1_score:  0.6140552995391705
#Fold: 9
**** TRAINING *****
f1_score:  0.6154684095860566


### Validation Result

In [138]:
# Compute final precision/recall/f1-score
f1_score(true_array, pred_array)

0.6187238258206872

In [139]:
classification_report(true_array, pred_array).split("\n")

['              precision    recall  f1-score   support',
 '',
 '         ART       0.60      0.56      0.58       380',
 '       EVENT       0.31      0.22      0.26        73',
 '         FAC       0.32      0.28      0.30       382',
 '         GPE       0.59      0.54      0.56       771',
 '    LANGUAGE       0.12      0.12      0.12        42',
 '         LAW       0.00      0.00      0.00         2',
 '         LOC       0.29      0.27      0.28       421',
 '        NORP       0.46      0.60      0.52       516',
 '         ORG       0.23      0.17      0.19       192',
 '         PER       0.69      0.67      0.68      8459',
 '     PRODUCT       0.16      0.04      0.06       104',
 '',
 '   micro avg       0.63      0.61      0.62     11342',
 '   macro avg       0.34      0.31      0.32     11342',
 'weighted avg       0.63      0.61      0.62     11342',
 '']

### Test set Evaluation Result

In [140]:
def evaluate_on_test():
  model = models[np.argmax(accuracy)]
  true, pred = model.evaluate(test_batch_idx, test_batch_lens, test_batch_tags, tagset)
  tagset_to_bio(true)
  tagset_to_bio(pred)
  f1 = f1_score(true, pred)
  report = classification_report(true, pred).split("\n")
  return f1, report

In [141]:
f1, report = evaluate_on_test()
f1

0.6385093167701863

In [142]:
report

['              precision    recall  f1-score   support',
 '',
 '         ART       0.59      0.62      0.61        16',
 '         FAC       0.24      0.32      0.27        19',
 '         GPE       0.70      0.76      0.73        42',
 '    LANGUAGE       0.17      0.11      0.13         9',
 '         LOC       0.25      0.24      0.24        17',
 '        NORP       0.45      0.57      0.51        42',
 '         ORG       0.00      0.00      0.00         4',
 '         PER       0.70      0.68      0.69       646',
 '     PRODUCT       0.00      0.00      0.00        18',
 '',
 '   micro avg       0.64      0.63      0.64       813',
 '   macro avg       0.34      0.37      0.35       813',
 'weighted avg       0.64      0.63      0.63       813',
 '']