# XLM-Roberta Named Entity Recognition

## How to start 

In [1]:
# Optional: You can mount google drive to save your checkpoints in it: 

# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# Optional: remove previous files if you want to run it again without any problem 

# ! rm -rf drive
# ! rm -rf model
# ! rm -rf sample_data


# Remove data folder if already exists
! rm -rf data

### Paths and Parameters

In [32]:
DEBUG = False # Debug mode: If True then training set will be limited to MAX_EXAMPLES
MAX_EXAMPLES = 1000 # Size of Debug Set
MAX_EPOCHS = 15 # Max Number of epoochs 
CHECKPOINT_PATH = "./drive/MyDrive" # Root directory of checkpoints 
MODEL_PATH = "./model" # Root directory of pytorch model
MODEL_NAME = "model.pt" # name of final pytorch model

### Install Libraries

In [None]:
! pip install pytorch-lightning 
! pip install pytorch-crf # Crf layer
! pip install transformers
! pip install seqeval # Evaluation
! pip install gdown # Downlowd from google drive

In [5]:
# ! gdown --id ID

### Download Datasets

In [None]:
# Create data folder and donwload required datasets
! mkdir data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/en_test.csv -P ./data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/en_train.csv -P ./data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/fa_test.csv -P ./data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/fa_train.csv -P ./data

### Import Libraries

In [7]:
# import primitive libraries
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import json

# import seqval to report classifier performance metrics
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score
from seqeval.scheme import IOB2

# import torch related modules
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

# import pytorch lightning library
import pytorch_lightning as pl
from torchcrf import CRF as SUPERCRF

# import NLTK to create better tokenizer
import nltk
from nltk.tokenize import RegexpTokenizer

# Transformers : Roberta Model
from transformers import XLMRobertaTokenizerFast
from transformers import XLMRobertaModel, XLMRobertaConfig

# import sklearn inorder to split data into train-evaluation-test
from sklearn.model_selection import GroupShuffleSplit

# import Typings
from typing import Union,Dict,List,Tuple,Any,Optional

In [8]:
# for sent tokenizer (nltk)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load Data

In [9]:
# A function to load dataset into dataframe
def load_data(name, path='./data', test=False):
    print(f'Processing {name}')
    # Read CSV
    df = pd.read_csv(path + '/' + name)
    # Define columns
    df.columns = ['index', 'Token', 'Tag']
    df.set_index('index', drop=True, inplace=True)
    # Remove rows which starts with # id
    mask = df.Token.str.startswith('# id')
    indices = list(map(lambda x:x+1, df.index[mask].tolist())) + [df.index[-1]]
    # Assign Sent number to each row
    sentence = 0
    df['Sent'] = None
    for index in tqdm(range(len(indices)-1)):
        df.loc[indices[index]:indices[index+1], 'Sent'] = sentence
        sentence += 1
    df.drop(df.index[mask], inplace=True)
    # Drop Tag column if Dataframe is for test
    if test:
      df.drop(columns='Tag', inplace=True)
    return df


In [10]:
# EN Data
en_data = load_data('en_train.csv')
# clean tag column
en_data['Tag'] = en_data['Tag'].apply(lambda tag: tag.strip() if tag!=' _ O' else 'O')
# clean Token column
en_data['Token'] = en_data['Token'].apply(lambda token: token.strip())

# EN Test (Deploy)
en_deploy_test = load_data('en_test.csv', test=True)

# FA Data 
fa_data = load_data('fa_train.csv')
# clean tag column
fa_data['Tag'] = fa_data['Tag'].apply(lambda tag: tag.strip() if tag!=' _ O' else 'O')
# clean Token column
fa_data['Token'] = fa_data['Token'].apply(lambda token: token.strip())

# FA TEST (Desploy)
fa_deploy_test = load_data('fa_test.csv', test=True)

Processing en_train.csv


100%|██████████| 15300/15300 [01:03<00:00, 239.27it/s]


Processing en_test.csv


100%|██████████| 800/800 [00:00<00:00, 901.95it/s]


Processing fa_train.csv


100%|██████████| 15300/15300 [01:01<00:00, 248.25it/s]


Processing fa_test.csv


100%|██████████| 800/800 [00:00<00:00, 1764.15it/s]


In [11]:
# Debug mode in order to reduce time consumption.
if DEBUG ==  True:
    en_data = en_data[:MAX_EXAMPLES]
    fa_data = fa_data[:MAX_EXAMPLES]

In [12]:
# Ratio of Val+Test to Training set
VAL_TEST_SIZE = 0.3
# Ratio of Test to Val set
TEST_SIZE = 0.3

### EN Dataframes

In [13]:
# Split Dataframe into Train and Val+Test
splitter = GroupShuffleSplit(test_size=VAL_TEST_SIZE, n_splits=1,  random_state = 42)
split = splitter.split(en_data, groups=en_data['Sent'])
en_train_inds, en_val_test_inds = next(split)

# En Train and Val+Test
en_train = en_data.iloc[en_train_inds]
en_val_test = en_data.iloc[en_val_test_inds]

# Split Dataframe into Val and Test
splitter = GroupShuffleSplit(test_size=TEST_SIZE, n_splits=1,  random_state = 42)
split = splitter.split(en_val_test, groups=en_val_test['Sent'])
en_val_inds, en_test_inds = next(split)

# En Val and Test
en_val = en_val_test.iloc[en_val_inds]
en_test = en_val_test.iloc[en_test_inds]

In [14]:
en_train.head()

Unnamed: 0_level_0,Token,Tag,Sent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,his,O,0
2,playlist,O,0
3,includes,O,0
4,sonny,B-PER,0
5,sharrock,I-PER,0


### FA Dataframes

In [15]:
# Split Dataframe into Train and Val+Test
splitter = GroupShuffleSplit(test_size=VAL_TEST_SIZE, n_splits=2,  random_state = 42)
split = splitter.split(fa_data, groups=fa_data['Sent'])
fa_train_inds, fa_val_test_inds = next(split)

# FA Train and Val+Test
fa_train = fa_data.iloc[fa_train_inds]
fa_val_test = fa_data.iloc[fa_val_test_inds]

# Split Dataframe into Val and Test
splitter = GroupShuffleSplit(test_size=TEST_SIZE, n_splits=2,  random_state = 42)
split = splitter.split(fa_val_test, groups=fa_val_test['Sent'])
fa_val_inds, fa_test_inds = next(split)

# FA Val and Test
fa_val = fa_val_test.iloc[fa_val_inds]
fa_test = fa_val_test.iloc[fa_test_inds]

In [16]:
fa_train.head()

Unnamed: 0_level_0,Token,Tag,Sent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,مردی,B-CW,0
2,برای,I-CW,0
3,تمام,I-CW,0
4,فصول,I-CW,0
5,–,O,0


### Create tags table

In [17]:
# Create list of name entity tags in train files
tags_table = list(set(list (en_train.Tag) + list(fa_train.Tag)))

### Create Proper Input Data

In [18]:
# Create proper input files
def write_file(en_df, fa_df, name, path='./data'):
    # Get the first sentence. The parameter will be updated during execution of function and show the last processed sentence number.
    last_sent = en_df.iloc[0].Sent

    with open(f'{path}/{name}.txt', 'a') as writer:
      # Write English (Token, Tag) pairs
        for index, row in tqdm(en_df.iterrows(), desc = f'Writing English {name} file'):  
            token, tag, current_sent = row

            if (current_sent == last_sent):
                writer.write(f"{token}\t{tag}\n")
            else:
                last_sent = current_sent
                writer.write(f"\n")
                writer.write(f"{token}\t{tag}\n")

        # Write Farsi (Token, Tag) pairs
        last_row_df = fa_df.iloc[-1].Sent

        for index, row in tqdm(fa_df.iterrows(), desc = f'Writing Farsi {name} file'):  
            token, tag, current_sent = row

            if current_sent!=last_row_df:
                if (current_sent == last_sent):
                    writer.write(f"{token}\t{tag}\n")

                else:
                    last_sent = current_sent
                    writer.write(f"\n")
                    writer.write(f"{token}\t{tag}\n")
            else:
                # Last Sentence of file without newline char
                writer.write(f"\n")
                writer.write(f"{token}\t{tag}")

In [19]:
# (Re)produce train test val files
! rm -rf ./data/train.txt
write_file(en_train, fa_train, 'train')
! rm -rf ./data/test.txt
write_file(en_test, fa_test, 'test')
! rm -rf ./data/val.txt
write_file(en_val, fa_val, 'val')

Writing English train file: 178156it [00:00, 22317.00it/s]
Writing Farsi train file: 195426it [00:00, 19096.39it/s]
Writing English test file: 22681it [00:00, 14925.52it/s]
Writing Farsi test file: 24682it [00:00, 18334.04it/s]
Writing English val file: 52702it [00:00, 18475.23it/s]
Writing Farsi val file: 58139it [00:00, 16509.91it/s]


### Create Individual Test File

In [20]:
def write_individual_test(df, name, path='./data'):
    # Get the first sentence. The parameter will be updated during execution of function and show the last processed sentence number.
    last_sent = df.iloc[0].Sent

    with open(f'{path}/{name}.txt', 'a') as writer:
        last_row_df = df.iloc[-1].Sent

        for index, row in tqdm(df.iterrows(), desc = f'Writing {name} file'):  
            token, tag, current_sent = row

            if current_sent!=last_row_df:
                if (current_sent == last_sent):
                    writer.write(f"{token}\t{tag}\n")

                else:
                    last_sent = current_sent
                    writer.write(f"\n")
                    writer.write(f"{token}\t{tag}\n")
            else:
                # Last Sentence of file without newline char
                writer.write(f"\n")
                writer.write(f"{token}\t{tag}")

In [21]:
! rm -rf ./data/fa_test.txt
write_individual_test(fa_test,'fa_test')

! rm -rf ./data/en_test.txt
write_individual_test(en_test,'en_test')

Writing fa_test file: 24682it [00:00, 13751.82it/s]
Writing en_test file: 22681it [00:00, 18444.74it/s]


## XLM-Roberta 

### TokenFromSubtoken
- Code adapted from https://github.com/deepmipt/DeepPavlov/blob/master/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py
- DeepPavlov is an popular open source library for deep learning end-to-end dialog systems and chatbots.
- Licensed under the Apache License, Version 2.0 (the "License");


In [22]:
class TokenFromSubtoken(torch.nn.Module):

    def forward(self, units: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """ Assemble token level units from subtoken level units
        Args:
            units: torch.Tensor of shape [batch_size, SUBTOKEN_seq_length, n_features]
            mask: mask of token beginnings. For example: for tokens
                    [[``[CLS]`` ``My``, ``capybara``, ``[SEP]``],
                    [``[CLS]`` ``Your``, ``aar``, ``##dvark``, ``is``, ``awesome``, ``[SEP]``]]
                the mask will be
                    [[0, 1, 1, 0, 0, 0, 0],
                    [0, 1, 1, 0, 1, 1, 0]]
        Returns:
            word_level_units: Units assembled from ones in the mask. For the
                example above this units will correspond to the following
                    [[``My``, ``capybara``],
                    [``Your`, ``aar``, ``is``, ``awesome``,]]
                the shape of this tensor will be [batch_size, TOKEN_seq_length, n_features]
        """
        
        device = units.device
        nf_int = units.size()[-1]
        batch_size = units.size()[0]

        # number of TOKENS in each sentence
        token_seq_lengths = torch.sum(mask, 1).to(torch.int64)
        # number of words
        n_words = torch.sum(token_seq_lengths)
        # max token seq len
        max_token_seq_len = torch.max(token_seq_lengths)

        idxs = torch.stack(torch.nonzero(mask, as_tuple=True), dim=1)
        # padding is for computing change from one sample to another in the batch
        sample_ids_in_batch = torch.nn.functional.pad(input=idxs[:, 0], pad=[1, 0])
        
        a = (~torch.eq(sample_ids_in_batch[1:], sample_ids_in_batch[:-1])).to(torch.int64)
        
        # transforming sample start masks to the sample starts themselves
        q = a * torch.arange(n_words, device=device).to(torch.int64)
        count_to_substract = torch.nn.functional.pad(torch.masked_select(q, q.to(torch.bool)), [1, 0])

        new_word_indices = torch.arange(n_words, device=device).to(torch.int64) - count_to_substract[torch.cumsum(a, 0)]
        
        n_total_word_elements = max_token_seq_len*torch.ones_like(token_seq_lengths, device=device).sum()
        word_indices_flat = (idxs[:, 0] * max_token_seq_len + new_word_indices).to(torch.int64)
        #x_mask = torch.sum(torch.nn.functional.one_hot(word_indices_flat, n_total_word_elements), 0)
        #x_mask = x_mask.to(torch.bool)
        x_mask = torch.zeros(n_total_word_elements, dtype=torch.bool, device=device)
        x_mask[word_indices_flat] = torch.ones_like(word_indices_flat, device=device, dtype=torch.bool)
        # to get absolute indices we add max_token_seq_len:
        # idxs[:, 0] * max_token_seq_len -> [0, 0, 0, 1, 1, 2] * 2 = [0, 0, 0, 3, 3, 6]
        # word_indices_flat -> [0, 0, 0, 3, 3, 6] + [0, 1, 2, 0, 1, 0] = [0, 1, 2, 3, 4, 6]
        # total number of words in the batch (including paddings)
        # batch_size * max_token_seq_len -> 3 * 3 = 9
        # tf.one_hot(...) ->
        # [[1. 0. 0. 0. 0. 0. 0. 0. 0.]
        #  [0. 1. 0. 0. 0. 0. 0. 0. 0.]
        #  [0. 0. 1. 0. 0. 0. 0. 0. 0.]
        #  [0. 0. 0. 1. 0. 0. 0. 0. 0.]
        #  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
        #  [0. 0. 0. 0. 0. 0. 1. 0. 0.]]
        #  x_mask -> [1, 1, 1, 1, 1, 0, 1, 0, 0]
        nonword_indices_flat = (~x_mask).nonzero().squeeze(-1)

        # get a sequence of units corresponding to the start subtokens of the words
        # size: [n_words, n_features]
        
        elements = units[mask.bool()]

        # prepare zeros for paddings
        # size: [batch_size * TOKEN_seq_length - n_words, n_features]
        paddings = torch.zeros_like(nonword_indices_flat, dtype=elements.dtype).unsqueeze(-1).repeat(1,nf_int).to(device)

        # tensor_flat -> [x, x, x, x, x, 0, x, 0, 0]
        tensor_flat_unordered = torch.cat([elements, paddings])
        _, order_idx = torch.sort(torch.cat([word_indices_flat, nonword_indices_flat]))
        tensor_flat = tensor_flat_unordered[order_idx]

        tensor = torch.reshape(tensor_flat, (-1, max_token_seq_len, nf_int))
        # tensor -> [[x, x, x],
        #            [x, x, 0],
        #            [x, 0, 0]]

        return tensor


### Conditional Random Field 
- Code adopted form torchcrf library (https://pytorch-crf.readthedocs.io/en/stable/)
- we override veiterbi decoder in order to make it compatible with our code 

In [23]:
class CRF(SUPERCRF):

    # override veiterbi decoder in order to make it compatible with our code 
    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.ByteTensor) -> List[List[int]]:
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        history = []

        # score is a tensor of size (batch_size, num_tags) where for every batch,
        # value at column j stores the score of the best tag sequence so far that ends
        # with tag j
        # history saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the score of the best
            # tag sequence so far that ends with transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tag
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # and save the index that produces the next score
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        history = torch.stack(history, dim=0)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Now, compute the best path for each sample

        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list = []

        for idx in range(batch_size):
            # Find the tag which maximizes the score at the last timestep; this is our best tag
            # for the last timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags = [best_last_tag]

            # We trace back where the best last tag comes from, append that to our best tag
            # sequence, and trace it back again, and so on
            for i, hist in enumerate(torch.flip(history[:seq_ends[idx]], dims=(0,))):
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag)

            best_tags = torch.stack(best_tags, dim=0)

            # Reverse the order because we start from the last timestep
            best_tags_list.append(torch.flip(best_tags, dims=(0,)))

        best_tags_list = nn.utils.rnn.pad_sequence(best_tags_list, batch_first=True, padding_value=0)

        return best_tags_list

### CRFLayer 
- Forward: decide output logits basaed on backbone network  
- Decode: decode based on CRF weights

In [24]:
class CRFLayer(nn.Module):
    def __init__(self, embedding_size, n_labels):

        super(CRFLayer, self).__init__()
        self.dropout = nn.Dropout(0.1)
        self.output_dense = nn.Linear(embedding_size,n_labels)
        self.crf = CRF(n_labels, batch_first=True)
        self.token_from_subtoken = TokenFromSubtoken()

    # Forward: decide output logits basaed on backbone network  
    def forward(self, embedding, mask):
        logits = self.output_dense(self.dropout(embedding))
        logits = self.token_from_subtoken(logits, mask)
        pad_mask = self.token_from_subtoken(mask.unsqueeze(-1), mask).squeeze(-1).bool()
        return logits, pad_mask

    # Decode: decode based on CRF weights 
    def decode(self, logits, pad_mask):
        return self.crf.decode(logits, pad_mask)

    # Evaluation Loss: calculate mean log likelihood of CRF layer
    def eval_loss(self, logits, targets, pad_mask):
        mean_log_likelihood = self.crf(logits, targets, pad_mask, reduction='sum').mean()
        return -mean_log_likelihood


### NERModel

In [25]:
class NERModel(nn.Module):

    def __init__(self, n_labels:int, roberta_path:str):
        super(NERModel,self).__init__()
        self.roberta = XLMRobertaModel.from_pretrained(roberta_path)
        self.crf = CRFLayer(self.roberta.config.hidden_size, n_labels)

    # Forward: pass embedings to CRF layer in order to evaluate logits from suboword sequence
    def forward(self, 
                input_ids:torch.Tensor,
                attention_mask:torch.Tensor,
                token_type_ids:torch.Tensor,
                mask:torch.Tensor) -> torch.Tensor:

        embedding = self.roberta(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids)[0]
        logits, pad_mask = self.crf(embedding, mask)
        return logits, pad_mask

    # Disable Gradient and Predict with model
    @torch.no_grad()
    def predict(self, inputs:Tuple[torch.Tensor]) -> torch.Tensor:
        input_ids, attention_mask, token_type_ids, mask = inputs
        logits, pad_mask = self(input_ids, attention_mask, token_type_ids, mask)
        decoded = self.crf.decode(logits, pad_mask)
        return decoded, pad_mask

    # Decode: pass to crf decoder and decode based on CRF weights 
    def decode(self, logits, pad_mask):
        """Decode logits using CRF weights 
        """
        return self.crf.decode(logits, pad_mask) 

    # Evaluation Loss: pass to crf eval_loss and calculate mean log likelihood of CRF layer
    def eval_loss(self, logits, targets, pad_mask):
        return self.crf.eval_loss(logits, targets, pad_mask)

    # Determine number of layers to be fine-tuned (!freeze) 
    def freeze_roberta(self, n_freeze:int=6):
        for param in self.roberta.parameters():
            param.requires_grad = False

        for param in self.roberta.encoder.layer[n_freeze:].parameters():
            param.requires_grad = True

### NERTokenizer
- NLTK tokenizer along with XLMRobertaTokenizerFast tokenizer

In [26]:
class NERTokenizer(object):

    MAX_LEN=512
    BATCH_LENGTH_LIMT = 380 # Max number of roberta tokens in one sentence.

    # Modified version of http://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    PATTERN = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A. or U.S.A # 
      | (?:\d+\.)           # numbers
      | \w+(?:[-.]\w+)*     # words with optional internal hyphens
      | \$?\d+(?:.\d+)?%?   # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis, and special chars below, includes ], [
      | [-\]\[.,;"'?():_`“”/°º‘’″…#$%()*+<>=@\\^_{}|~❑&§]
    '''

    def __init__(self, base_model:str, to_device:str='cpu'):
        super(NERTokenizer,self).__init__()
        self.roberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(base_model, do_lower_case=False)
        self.to_device = to_device

        self.word_tokenizer = RegexpTokenizer(self.PATTERN)
        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # tokenize batch of tokens
    def tokenize_batch(self, inputs, pad_to = None) -> torch.Tensor:
        batch = [inputs] if isinstance(inputs[0], str) else inputs

        input_ids, attention_mask, token_type_ids, mask = [], [], [], []
        for tokens in batch:
            input_ids_tmp, attention_mask_tmp, token_type_ids_tmp, mask_tmp = self._tokenize_words(tokens)
            input_ids.append(input_ids_tmp)
            attention_mask.append(attention_mask_tmp)
            token_type_ids.append(token_type_ids_tmp)
            mask.append(mask_tmp)

        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.roberta_tokenizer.pad_token_id)
        attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
        token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
        mask = pad_sequence(mask, batch_first=True, padding_value=0)

        # truncate MAX_LEN
        if input_ids.shape[-1]>self.MAX_LEN:
            input_ids = input_ids[:,:,:self.MAX_LEN]
            attention_mask = attention_mask[:,:,:self.MAX_LEN]
            token_type_ids = token_type_ids[:,:,:self.MAX_LEN]
            mask = mask[:,:,:self.MAX_LEN]
        
        # extend pad 
        elif pad_to is not None and pad_to>input_ids.shape[1]:
            bs = input_ids.shape[0]
            padlen = pad_to-input_ids.shape[1]
            input_ids_append = torch.tensor([self.roberta_tokenizer.pad_token_id], dtype=torch.long).repeat([bs, padlen]).to(self.to_device)
            input_ids = torch.cat([input_ids, input_ids_append], dim=-1)
            attention_mask_append = torch.tensor([0], dtype=torch.long).repeat([bs, padlen]).to(self.to_device)
            attention_mask = torch.cat([attention_mask, attention_mask_append], dim=-1)
            token_type_ids_append = torch.tensor([0], dtype=torch.long).repeat([bs, padlen]).to(self.to_device)
            token_type_ids = torch.cat([token_type_ids, token_type_ids_append], dim=-1)
            mask_append = torch.tensor([0], dtype=torch.long).repeat([bs, padlen]).to(self.to_device)
            mask = torch.cat([mask, mask_append], dim=-1)

        # truncate pad
        elif pad_to is not None and pad_to<input_ids.shape[1]:
            input_ids = input_ids[:,:,:pad_to]
            attention_mask = attention_mask[:,:,:pad_to]
            token_type_ids = token_type_ids[:,:,:pad_to]
            mask = mask[:,:,:pad_to]

        if isinstance(inputs[0], str):
            return input_ids[0], attention_mask[0], token_type_ids[0], mask[0]
        else:
            return input_ids, attention_mask, token_type_ids, mask

    # tokenize list of words with roberta tokenizer
    def _tokenize_words(self, words):
        subtokenized = []
        mask = []
        for word in words:
            subtokens = self.roberta_tokenizer.tokenize(word)
            subtokenized+=subtokens
            n_subtoken = len(subtokens)
            if n_subtoken>=1:
                mask = mask + [1] + [0]*(n_subtoken-1)

        subtokenized = [self.roberta_tokenizer.cls_token] + subtokenized + [self.roberta_tokenizer.sep_token]
        mask = [0] + mask + [0]

        input_ids = torch.tensor(self.roberta_tokenizer.convert_tokens_to_ids(subtokenized), dtype=torch.long).to(self.to_device)
        attention_mask = torch.ones(len(mask), dtype=torch.long).to(self.to_device)
        token_type_ids = torch.zeros(len(mask), dtype=torch.long).to(self.to_device)
        mask = torch.tensor(mask, dtype=torch.long).to(self.to_device)

        return input_ids, attention_mask, token_type_ids, mask

    # sent_to_token: yield each sentence token with positional span using nltk
    def sent_to_token(self, raw_text):
        for offset, ending in self.sent_tokenizer.span_tokenize(raw_text):
            sub_text = raw_text[offset:ending]
            words, spans = [], []
            flush = False
            total_subtoken = 0
            for start, end in self.word_tokenizer.span_tokenize(sub_text):
                flush = True
                start += offset
                end += offset
                words.append(raw_text[start:end])
                spans.append((start,end))
                total_subtoken += len(self.roberta_tokenizer.tokenize(words[-1]))
                if (total_subtoken > self.BATCH_LENGTH_LIMT): 
                    yield words[:-1],spans[:-1]
                    spans = spans[len(spans)-1:]
                    words = words[len(words)-1:]
                    total_subtoken = sum([len(self.roberta_tokenizer.tokenize(word)) for word in words])
                    flush = False

            if flush and len(spans) > 0:
                yield words,spans

    # Extract (batch words span() from a raw sentence
    def prepare_row_text(self, raw_text, batch_size=16):
        words_list, spans_list = [], []
        end_batch = False
        for words, spans in self.sent_to_token(raw_text):
            end_batch = True
            words_list.append(words)
            spans_list.append(spans)
            if len(spans_list) >= batch_size:
                input_ids, attention_mask, token_type_ids, mask = self.tokenize_batch(words_list)
                yield (input_ids, attention_mask, token_type_ids, mask), words_list, spans_list
                words_list, spans_list = [], []
        if end_batch and len(words_list) > 0:
            input_ids, attention_mask, token_type_ids, mask = self.tokenize_batch(words_list)
            yield (input_ids, attention_mask, token_type_ids, mask), words_list, spans_list


### NERDataset
- Pythorch Compatible Dataset and Dataloader

In [27]:
# Pytorch Dataset
class NERDataset(Dataset):

    def __init__(self, data_path:str, label_tags: List[str], base_model:str="xlm-roberta-base", 
            default_label=0, max_length:int=512, to_device="cpu"):
        self.tokenizer = NERTokenizer(base_model=base_model, to_device=to_device)
        self.label_tags = label_tags        
        self.name_to_label = {x: i for i, x in enumerate(self.label_tags)}
        self.default_label = default_label
        self.max_length = max_length


        # open file (train, test or val)
        with open(data_path,'r') as f:
            data_text = f.read()
        self.data = []

        # the loop notices the change of sentence with double newline 
        for sentence in filter(lambda x: len(x)>2, data_text.split('\n\n')):
            sample = []
            # each word laid in sepertaed lines
            for wordline in sentence.split('\n'):
                if wordline=='':
                    continue
                # the word and label are seperated from each other with tab 
                word, label = wordline.split('\t')
                sample.append((word, label))
            self.data.append(sample)

    # len of dataset
    def __len__(self):
        return len(self.data)


    def __getitem__(self, idx):
        item = self.data[idx]
        words, labels = list(zip(*item))
        
        labels_idx = [self.name_to_label.get(x, self.default_label) for x in labels]  
        y = torch.tensor(labels_idx, dtype=torch.long)
        diff = self.max_length - y.shape[-1]
        y = torch.nn.functional.pad(y, (0, diff), value=self.default_label)
        X = self.tokenizer.tokenize_batch(list(words), pad_to=self.max_length)

        return X, y 

### NERWrapper
- Lightining Wrapper for NER Model

In [28]:
# Lightining Wrapper for NER Model

class NERWrapper(pl.LightningModule):
    def __init__(self,
        learning_rate = 2e-5,
        weight_decay = 0.0,
        batch_size = 16,
        freeze_layers = 8,
        tags = tags_table,
        train_path = "./data/train.txt",
        val_path = "./data/val.txt",
        test_path ="./data/test.txt",
        pretrained_path = None,
        *args, **kwargs
    ):
        
        super(NERWrapper,self).__init__()
        self.save_hyperparameters('learning_rate', 'weight_decay', 'batch_size')
        self.tags, self.train_path, self.val_path, self.test_path = tags, train_path, val_path, test_path
        self.model = NERModel(n_labels=len(self.tags), roberta_path="xlm-roberta-base")

        if pretrained_path is not None:
            self.model.load_state_dict(torch.load(pretrained_path))
        self.model.freeze_roberta(freeze_layers)

    def forward(self, *args, **kwargs):
        return self.model.forward(*args, **kwargs)

    def _step(self, batch, batch_idx):
        (input_ids, attention_mask, token_type_ids, mask), labels = batch
        logits, pad_mask = self.model(input_ids, attention_mask, token_type_ids, mask)
        labels = labels[:, :logits.shape[1]]
        loss = self.model.eval_loss(logits, labels, pad_mask)
        preds_tag_idx = self.model.decode(logits, pad_mask)
        preds_tag = [[self.tags[start.item()] for m, start in zip(mask, sample) if m] for mask, sample in zip(pad_mask, preds_tag_idx)]
        labels_tag = [[self.tags[start.item()] for m, start in zip(mask, sample) if m] for mask, sample in zip(pad_mask, labels)]
        tensorboard_logs = {'batch_loss': loss}
        for metric, value in tensorboard_logs.items():
            self.log(metric, value, prog_bar=True)
        return {'loss': loss, "preds": preds_tag, "labels": labels_tag}

    def training_step(self, batch, batch_idx):
        return self._step(batch, batch_idx)['loss']

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        return self._step(batch, batch_idx)

    def test_step(self, batch, batch_idx, dataloader_idx=0):
        return self._step(batch, batch_idx)

    def configure_optimizers(self):
        no_decay_keywords = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n,p in self.model.named_parameters() if not any(nd in n for nd in no_decay_keywords)],
                "weight_decay_rate": self.hparams.weight_decay,
                "lr": self.hparams.learning_rate,
            },
            {
                "params": [p for n,p in self.model.named_parameters() if any(nd in n for nd in no_decay_keywords)],
                "weight_decay_rate": 0,
                "lr": self.hparams.learning_rate,
            }
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters)
        return optimizer

    def _dataloader(self, path, shuffle=False):
        dataset = NERDataset(path, label_tags = self.tags, default_label=0, to_device=self.device)
        return DataLoader(dataset, drop_last=False, shuffle=shuffle, batch_size=self.hparams.batch_size, 
            worker_init_fn=np.random.seed(0))

    def train_dataloader(self):
        return self._dataloader(self.train_path, True)

    def val_dataloader(self):
        return self._dataloader(self.val_path)

    def test_dataloader(self):
        return self._dataloader(self.test_path)


    def _epoch_end (self, outputs):
        preds = sum([x['preds'] for x in outputs], [])
        labels = sum([x['labels'] for x in outputs], [])
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        acc = accuracy_score(labels, preds)
        precision = precision_score(labels, preds, mode='strict', scheme=IOB2, average='micro', zero_division=1)
        recall = recall_score(labels, preds, mode='strict', scheme=IOB2, average='micro', zero_division=1)
        f1 = f1_score(labels, preds, mode='strict', scheme=IOB2, average='micro', zero_division=1)
        tensorboard_logs = {'loss': loss, 'acc': acc, 'precision': precision, 'recall': recall, 'f1': f1}
        return tensorboard_logs

    def validation_epoch_end(self, outputs):
        val_logs = self._epoch_end(outputs)
        tensorboard_logs = {'val_loss': val_logs['loss'], 'val_accuracy': val_logs['acc'], 'val_precision': val_logs['precision'], 'val_recall': val_logs['recall'], 'val_F1': val_logs['f1']}
        for metric, value in tensorboard_logs.items():
            self.log(metric, value, prog_bar=True)
 
    def test_epoch_end(self, outputs):
        test_logs = self._epoch_end(outputs)
        tensorboard_logs = {'test_loss': test_logs['loss'], 'test_accuracy': test_logs['acc'], 'test_precision': test_logs['precision'], 'test_recall': test_logs['recall'], 'test_F1': test_logs['f1']}
        for metric, value in tensorboard_logs.items():
            self.log(metric, value, prog_bar=True)


### NER
NER Interface

In [29]:
class NER(object):

    def __init__(self, model_path, model_name = MODEL_NAME, tags = tags_table):
        
        self.tags = tags
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        roberta_path = "xlm-roberta-base"
        self.model = NERModel(n_labels=len(self.tags), roberta_path=roberta_path).to(self.device)
        state_dict = torch.load(os.path.join(model_path, model_name))
        self.model.load_state_dict(state_dict, strict=False)
        self.model.eval()
        self.tokenizer = NERTokenizer(base_model=roberta_path, to_device=self.device)

    @torch.no_grad()
    def __call__(self, raw_text):

        outputs_flat, spans_flat, entities = [], [], []
        for batch, words, spans in self.tokenizer.prepare_row_text(raw_text):
            output, pad_mask = self.model.predict(batch)
            outputs_flat.extend(output[pad_mask.bool()].reshape(-1).tolist())
            spans_flat += sum(spans, [])

        for tag_idx,(start,end) in zip(outputs_flat,spans_flat):
            tag = self.tags[tag_idx]
            if tag != 'O':
                entities.append({'Text': raw_text[start:end], 'Tag': tag})

        return entities

## Train, and Test Model

### Train

In [33]:
# train function 
def train_test(test_path):
    plmodel = NERWrapper(test_path = test_path)
    trainer = pl.Trainer(default_root_dir=CHECKPOINT_PATH, gpus=1, max_epochs=MAX_EPOCHS)
    trainer.fit(plmodel)
    result = trainer.test(plmodel)
    print(result)


In [34]:
# Train on both Fa and En and test on both En and Fa

train_test (test_path = "./data/test.txt")


Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type     | Params
-----------------------------------
0 | model | NERModel | 278 M 
-----------------------------------


Validation sanity check: 0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'batch_loss': 24.60813919067383,
 'test_F1': 0.7578543,
 'test_accuracy': 0.9484375,
 'test_loss': 60.608143005371094,
 'test_precision': 0.772465,
 'test_recall': 0.7423492}
--------------------------------------------------------------------------------
[{'batch_loss': 24.60813919067383, 'test_loss': 60.608143005371094, 'test_accuracy': 0.9484375, 'test_precision': 0.772465, 'test_recall': 0.7423492, 'test_F1': 0.7578543}]


### Save Model 

In [None]:
# PATH of Last checkpoint

# CKPT_PATH = CHECKPOINT_PATH + "/" + "lightning_logs/version_0/checkpoints/epoch=19-step=99.ckpt"

In [None]:
# def save_checkpint_to_pt(ckpt_path_file, model_path=MODEL_PATH, model_name=MODEL_NAME):
#     plmodule = NERWrapper()
#     plmodule.load_state_dict(torch.load(ckpt_path_file)['state_dict'])

#     if not os.path.exists(MODEL_PATH):
#         os.makedirs(MODEL_PATH)
#     torch.save(plmodule.model.state_dict(), os.path.join(MODEL_PATH, MODEL_NAME))

In [None]:
# Call save model
# save_checkpint_to_pt(ckpt_path_file=CKPT_PATH, model_path=MODEL_PATH, model_name=MODEL_NAME)

### Test with Interface

In [None]:
# del ner
ner = NER(model_path = MODEL_PATH, model_name = MODEL_NAME, tags = tags_table) # Load pretrained model.

In [None]:
text = """his playlist includes sonny sharrock , gza , country teasers and the notorious b.i.g."""
result = ner(text)
result