In [1]:
import json
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, Dataset
from tqdm import tqdm

# Setting Basic Parameters

In [2]:
class Config:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # Optimisation hyper-parameters.
    batch_size = 4
    epochs = 1
    lr = 1e-5
    seed = 123

    # Optional LSTM head on top of the encoder; with 0 layers no LSTM is built.
    lstm_layer_num = 0
    bi_lstm = False

    # Pretrained checkpoint name handed to transformers' from_pretrained().
    # model_name = "microsoft/deberta-v3-base"
    model_name = "microsoft/deberta-base"

    # Feature width used for the LSTM and for the classifier input.
    hidden_size = 768
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Selects which dataset-loading branch of the notebook runs.
    train_data_name = "conll2003"

    @classmethod
    def describe(cls):
        """Return the reportable settings as an indented JSON string."""
        # (reported key, class attribute) pairs, in the order they are printed.
        reported = (
            ("train_data_name", "train_data_name"),
            ("encoder_name", "model_name"),
            ("batch_size", "batch_size"),
            ("epochs", "epochs"),
            ("lr", "lr"),
            ("seed", "seed"),
            ("bi_lstm", "bi_lstm"),
            ("lstm_layer_num", "lstm_layer_num"),
        )
        summary = {label: getattr(cls, attribute) for label, attribute in reported}
        return json.dumps(summary, ensure_ascii=False, indent=2)

# Seed every random number generator this notebook uses (torch CPU and CUDA,
# NumPy, Python's `random`) from the single configured seed.
torch.manual_seed(Config.seed)
torch.cuda.manual_seed_all(Config.seed)
np.random.seed(Config.seed)
random.seed(Config.seed)

In [3]:
# Print the resulting configuration

In [3]:
# Show the active settings as the JSON string built by Config.describe().
print(Config.describe())

{
  "train_data_name": "conll2003",
  "encoder_name": "microsoft/deberta-base",
  "batch_size": 4,
  "epochs": 1,
  "lr": 1e-05,
  "seed": 123,
  "bi_lstm": false,
  "lstm_layer_num": 0
}


### Import Dataset

# Import Dataset

In [4]:
def read_conll2003(file_path):
    """Read a two-column, blank-line-separated token/tag file into a DataFrame.

    The first line of the file is always skipped. Every other non-blank line
    must contain exactly two whitespace-separated fields (word, tag); blank
    lines end the current sentence.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A DataFrame with one row per sentence and two columns, ``word`` and
        ``tag``, each holding a list of strings. A file without any sentence
        yields an empty frame that still has both columns.

    Raises:
        AssertionError: If a non-blank line does not have exactly two fields.
    """
    sentences = []
    current = []
    # The context manager closes the handle even if a malformed line aborts parsing.
    with open(file_path) as handle:
        for idx, line in enumerate(handle):
            if idx == 0:
                continue
            line = line.strip()
            if line == "":
                if len(current) != 0:
                    sentences.append(current)
                current = []
            else:
                fields = line.split()
                assert len(fields) == 2, (
                    f"{file_path}, line {idx + 1}: expected 2 fields, got {len(fields)}"
                )
                current.append([fields[0], fields[-1]])
    # Flush the last sentence when the file does not end with a blank line.
    if len(current) != 0:
        sentences.append(current)
    records = [{"word": [pair[0] for pair in sentence],
                "tag": [pair[1] for pair in sentence]} for sentence in sentences]
    # Naming the columns keeps the schema stable when there are no records.
    return pd.DataFrame(records, columns=["word", "tag"])


# Build train_df / valid_df / test_df for the dataset selected in Config.
# NOTE(review): there is no else branch, so any other train_data_name leaves
# the three frames undefined and later cells fail with a NameError.
if Config.train_data_name == "conll2003":
    # Pre-split files: <train_data_name>/tweets.{train,valid,test}.txt
    train_path = os.path.join(Config.train_data_name, 'tweets.train.txt')
    dev_path = os.path.join(Config.train_data_name, 'tweets.valid.txt')
    test_path = os.path.join(Config.train_data_name, 'tweets.test.txt')
    train_df = read_conll2003(train_path)
    valid_df = read_conll2003(dev_path)
    test_df = read_conll2003(test_path)
    print(train_df.shape, valid_df.shape, test_df.shape)
elif Config.train_data_name == "ner_datasetreference":
    # Single CSV with one token per row; a non-null 'Sentence #' cell marks
    # the first token of a new sentence.
    df = pd.read_csv("ner_datasetreference.csv", encoding='iso-8859-1')
    data = []
    word, tag = [], []
    for i,j,k in zip(df['Sentence #'], df['Word'], df['Tag']):
        if not pd.isnull(i):
            assert i.startswith('Sentence')
            # A new sentence starts: store the one collected so far.
            if len(word) > 0:
                data.append({"word":word, "tag":tag})
            word, tag = [], []
        # Rows whose word or tag is not a string (e.g. NaN) are skipped.
        if isinstance(j, str) and isinstance(k, str):
            word.append(j)
            tag.append(k)
    # Store the final sentence, which has no following 'Sentence #' marker.
    if len(word) > 0:
        data.append({"word":word, "tag":tag})
        word, tag = [], []
    print(data[0], data[-1])
    df = pd.DataFrame(data)
    # 60% train; the remaining 40% is halved into validation and test.
    train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
    valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    print(df.shape, train_df.shape, valid_df.shape, test_df.shape)

(3967, 2) (1301, 2) (1303, 2)


In [5]:
# Inspect the training split (columns: word, tag).
train_df

Unnamed: 0,word,tag
0,"[Just, received, word, that, UHVictoria, and, ...","[O, O, O, O, B-POINT, O, B-POINT, I-POINT, O, ..."
1,"[-DOCSTART-, Dudes, from, Austin, drove, down,...","[O, O, O, B-AREA, O, O, O, O, O, O, O, O, O, O]"
2,"[-DOCSTART-, RT, @fbcoem, :, SHELTER, AT, SACR...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[-DOCSTART-, RT, @fbcoem, :, SHELTER, AT, SACR...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[-DOCSTART-, RT, @Stafford_PD, :, SACRED, HEAR...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
3962,"[-DOCSTART-, RT, @KPRC2, :, WATCH, LIVE, :, ht...","[O, O, O, O, O, O, O, O, O, O]"
3963,"[-DOCSTART-, @ukuleledan, 59, feet, is, the, f...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3964,"[-DOCSTART-, This, is, some, interesting, pers...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3965,"[-DOCSTART-, Officials, are, urging, everyone,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [6]:
# Inspect the test split (columns: word, tag).
test_df

Unnamed: 0,word,tag
0,"[BREAKING, :, One, firefighter, injured, after...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-POIN..."
1,"[-DOCSTART-, @Robert1288, Main, St, near, Med,...","[O, O, B-POINT, I-POINT, I-POINT, I-POINT, I-P..."
2,"[-DOCSTART-, Important, Message, from, Spring,...","[O, O, O, O, B-AREA, I-AREA, O, O, O, O, B-ARE..."
3,"[-DOCSTART-, RT, @kiii3news, :, Harvey, will, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[-DOCSTART-, RT, @KHOU, :, JUST, IN, :, Fort, ...","[O, O, O, O, O, O, O, B-AREA, I-AREA, I-AREA, ..."
...,...,...
1298,"[-DOCSTART-, Shutting, down, my, Twitter, for,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1299,"[-DOCSTART-, Si, necesita, rescate, en, el, co...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1300,"[-DOCSTART-, @XBrittanyDukeX, @taavi_rautavirt...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1301,"[-DOCSTART-, RT, @KPRC2, :, WATCH, LIVE, :, ht...","[O, O, O, O, O, O, O, O, O, O]"


In [7]:
# Inspect the validation split (columns: word, tag).
valid_df

Unnamed: 0,word,tag
0,"[Food, Town, at, the, corner, of, Richey, St, ...","[O, O, O, O, B-POINT, I-POINT, I-POINT, I-POIN..."
1,"[-DOCSTART-, Conroe, calls, for, evacuations, ...","[O, B-AREA, O, O, O, O, O, O, O, O, O, B-AREA,..."
2,"[-DOCSTART-, I-10, and, Westside, ., Houston, ...","[O, O, O, O, O, O, O, O, O]"
3,"[-DOCSTART-, Neighborhoods, along, the, San, J...","[O, O, O, O, B-RIVER, I-RIVER, I-RIVER, O, O, ..."
4,"[-DOCSTART-, Water, Rescue, -, E065, -, Kings,...","[O, O, O, O, O, O, B-POINT, I-POINT, I-POINT, ..."
...,...,...
1296,"[-DOCSTART-, RT, @radioguycliff, :, Can, confi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1297,"[-DOCSTART-, Water, Rescue, -, E034, -, Hirsch...","[O, O, O, O, O, O, B-POINT, I-POINT, I-POINT, ..."
1298,"[-DOCSTART-, @XBrittanyDukeX, @taavi_rautavirt...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1299,"[-DOCSTART-, .@KPRCJonathan, just, reported, l...","[O, O, O, O, O, O, B-POINT, I-POINT, I-POINT, ..."


In [8]:
def collect_label(df_list):
    """Return the sorted entity categories found in the ``tag`` column of the frames.

    Each tag is either ``"O"`` (ignored) or carries a ``B-`` / ``I-`` prefix;
    the category is what follows that two-character prefix.

    Args:
        df_list: Iterable of DataFrames whose ``tag`` column holds lists of tags.

    Returns:
        Alphabetically sorted list of distinct category names.

    Raises:
        AssertionError: If a tag other than ``"O"`` lacks a ``B-``/``I-`` prefix.
    """
    categories = set()
    for frame in df_list:
        for sentence_tags in frame['tag']:
            for tag in sentence_tags:
                if tag != "O":
                    assert tag.startswith(("B-", "I-"))
                    categories.add(tag[2:])
    return sorted(categories)

# Build the label vocabulary: "O" gets id 0, then each category contributes
# its B- tag followed by its I- tag, with categories in sorted order.
ner_category = collect_label([train_df, valid_df, test_df])
label_list = ['O'] + [prefix + category for category in ner_category for prefix in ("B-", "I-")]
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for idx, label in enumerate(label_list)}
print(f"ner category {ner_category} .\n\nlabel list {label_list} .\n\nlabel2id {label2id} .\n\nid2label {id2label}\n\n")

ner category ['AREA', 'POINT', 'RIVER', 'ROAD'] .

label list ['O', 'B-AREA', 'I-AREA', 'B-POINT', 'I-POINT', 'B-RIVER', 'I-RIVER', 'B-ROAD', 'I-ROAD'] .

label2id {'O': 0, 'B-AREA': 1, 'I-AREA': 2, 'B-POINT': 3, 'I-POINT': 4, 'B-RIVER': 5, 'I-RIVER': 6, 'B-ROAD': 7, 'I-ROAD': 8} .

id2label {0: 'O', 1: 'B-AREA', 2: 'I-AREA', 3: 'B-POINT', 4: 'I-POINT', 5: 'B-RIVER', 6: 'I-RIVER', 7: 'B-ROAD', 8: 'I-ROAD'}




# Import DeBERTa Model (Tokenizer)

In [9]:
# Load the tokenizer that belongs to the checkpoint named in Config.model_name.
# NOTE(review): add_prefix_space=True is presumably needed because the tokenizer is
# later called on pre-split words (is_split_into_words=True) — confirm in the tokenizer docs.
tokenizer = transformers.AutoTokenizer.from_pretrained(Config.model_name, add_prefix_space=True)
# Later cells call .word_ids() on the encoding; presumably that needs a fast tokenizer,
# hence this check — TODO confirm.
print(tokenizer.is_fast)

True


## Tokenize and Build Labels

In [10]:
def align(tag, word_ids):
    """Expand word-level tags into one tag per sub-word token.

    Args:
        tag: List of tags, indexed by word position.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list as long as ``word_ids``. Tokens without a word get ``None``.
        The first token of a ``B-X`` word keeps ``B-X``; the directly
        following tokens of that same word become ``I-X``. Every other token
        copies its word's tag unchanged.
    """
    aligned_tag = []
    previous_word = None
    for word_index in word_ids:
        if word_index is None:
            aligned_tag.append(None)
        else:
            word_tag = tag[word_index]
            # Only continuation pieces of a B- word are rewritten to I-.
            if word_tag.startswith("B-") and word_index == previous_word:
                aligned_tag.append("I-" + word_tag[2:])
            else:
                aligned_tag.append(word_tag)
        previous_word = word_index
    return aligned_tag

In [11]:
# Small hand-made example to check align(); a real row could be used instead,
# e.g. train_df.iloc[2]["word"] with train_df.iloc[2]["tag"].
words = ['I', '1996-08-22', '1996-08-22', 'I']
tag = ["O", "B-LOC", "B-ORG", "O"]
print(words, tag)
s = tokenizer(words, truncation=True, is_split_into_words=True)
word_ids = s.word_ids()
# Show each token next to its aligned tag and the index of the word it came from.
tokens = tokenizer.convert_ids_to_tokens(s['input_ids'])
tags = align(tag, s.word_ids())
print(pd.DataFrame(list(zip(tokens, tags, word_ids)), columns=['tokens', 'tags', 'word-index']))

['I', '1996-08-22', '1996-08-22', 'I'] ['O', 'B-LOC', 'B-ORG', 'O']
   tokens   tags  word-index
0   [CLS]   None         NaN
1      ĠI      O         0.0
2   Ġ1996  B-LOC         1.0
3       -  I-LOC         1.0
4      08  I-LOC         1.0
5       -  I-LOC         1.0
6      22  I-LOC         1.0
7   Ġ1996  B-ORG         2.0
8       -  I-ORG         2.0
9      08  I-ORG         2.0
10      -  I-ORG         2.0
11     22  I-ORG         2.0
12     ĠI      O         3.0
13  [SEP]   None         NaN


In [12]:
def preprocess(x):
    """Tokenise one row and derive its per-token labels.

    Args:
        x: Row (or mapping) with a ``word`` list and a parallel ``tag`` list.

    Returns:
        Tuple ``(tokens, aligned_tags, input_ids, label_ids, word_ids)``.
        ``label_ids`` maps each aligned tag through ``label2id`` and uses
        ``-100`` where the aligned tag is ``None``.
    """
    encoding = tokenizer(x['word'], truncation=True, is_split_into_words=True)
    word_ids = encoding.word_ids()
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
    align_label = align(x['tag'], word_ids)
    label_ids = [-100 if name is None else label2id[name] for name in align_label]
    return tokens, align_label, encoding['input_ids'], label_ids, word_ids


In [13]:
# Tokenise every split and attach the five per-token columns returned by preprocess().
_feature_columns = ['token', 'label', 'id', 'label_id', 'word_ids']
for _split_df in (train_df, valid_df, test_df):
    _split_df[_feature_columns] = _split_df.apply(lambda row: pd.Series(preprocess(row)), axis=1)

In [14]:
# Inspect the test split again, now with the token/label columns added above.
test_df

Unnamed: 0,word,tag,token,label,id,label_id,word_ids
0,"[BREAKING, :, One, firefighter, injured, after...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-POIN...","[[CLS], ĠBRE, AKING, Ġ:, ĠOne, Ġfirefighter, Ġ...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 13530, 16371, 4832, 509, 15788, 1710, 71, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11..."
1,"[-DOCSTART-, @Robert1288, Main, St, near, Med,...","[O, O, B-POINT, I-POINT, I-POINT, I-POINT, I-P...","[[CLS], Ġ-, DOC, ST, ART, -, Ġ@, Robert, 12, 8...","[None, O, O, O, O, O, O, O, O, O, B-POINT, I-P...","[1, 111, 46570, 4014, 11328, 12, 787, 25244, 1...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, ...","[None, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 3, 4, 5, ..."
2,"[-DOCSTART-, Important, Message, from, Spring,...","[O, O, O, O, B-AREA, I-AREA, O, O, O, O, B-ARE...","[[CLS], Ġ-, DOC, ST, ART, -, ĠImportant, ĠMess...","[None, O, O, O, O, O, O, O, O, B-AREA, I-AREA,...","[1, 111, 46570, 4014, 11328, 12, 28997, 32236,...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 5, 6, 7, ..."
3,"[-DOCSTART-, RT, @kiii3news, :, Harvey, will, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[[CLS], Ġ-, DOC, ST, ART, -, ĠRT, Ġ@, ki, ii, ...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 111, 46570, 4014, 11328, 12, 10541, 787, 3...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 3, 4, ..."
4,"[-DOCSTART-, RT, @KHOU, :, JUST, IN, :, Fort, ...","[O, O, O, O, O, O, O, B-AREA, I-AREA, I-AREA, ...","[[CLS], Ġ-, DOC, ST, ART, -, ĠRT, Ġ@, K, HOU, ...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 111, 46570, 4014, 11328, 12, 10541, 787, 5...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 4, 5, 6, ..."
...,...,...,...,...,...,...,...
1298,"[-DOCSTART-, Shutting, down, my, Twitter, for,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[[CLS], Ġ-, DOC, ST, ART, -, ĠShut, ting, Ġdow...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 111, 46570, 4014, 11328, 12, 36707, 2577, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, ..."
1299,"[-DOCSTART-, Si, necesita, rescate, en, el, co...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[[CLS], Ġ-, DOC, ST, ART, -, ĠSi, Ġne, ces, it...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 111, 46570, 4014, 11328, 12, 11065, 3087, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 2, 2, 2, 3, 3, 4, 5, ..."
1300,"[-DOCSTART-, @XBrittanyDukeX, @taavi_rautavirt...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[[CLS], Ġ-, DOC, ST, ART, -, Ġ@, X, Br, itt, a...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 111, 46570, 4014, 11328, 12, 787, 1000, 16...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1301,"[-DOCSTART-, RT, @KPRC2, :, WATCH, LIVE, :, ht...","[O, O, O, O, O, O, O, O, O, O]","[[CLS], Ġ-, DOC, ST, ART, -, ĠRT, Ġ@, K, PR, C...","[None, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 111, 46570, 4014, 11328, 12, 10541, 787, 5...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 3, 4, ..."


# Building Dataloader

In [15]:
class NerDataset(Dataset):
    """Dataset over the rows of a preprocessed frame, with a padding collate function."""

    def __init__(self, df, device):
        # Tensors built by collate_to_max_length are moved to this device.
        self.device = device
        # One dict per row, keyed by column name.
        self.data = df.to_dict(orient='records')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    @staticmethod
    def _right_pad(values, width, fill):
        """Return `values` extended on the right with `fill` up to `width` items."""
        return values + [fill] * (width - len(values))

    def collate_to_max_length(self, batch):
        """Pad a list of rows to the longest sequence in the batch.

        Rows are ordered from longest to shortest. Input ids are padded with 0
        and label ids with -100.

        Returns:
            Dict with ``id`` and ``label_id`` tensors (on ``self.device``),
            ``seq_length`` (unpadded lengths, left on the CPU) and ``sample``
            (the sorted rows themselves).
        """
        ordered = sorted(batch, key=lambda row: -len(row['id']))
        lengths = [len(row['id']) for row in ordered]
        width = max(lengths)
        padded_ids = [self._right_pad(row["id"], width, 0) for row in ordered]
        padded_labels = [self._right_pad(row["label_id"], width, -100) for row in ordered]
        return {
            "id": torch.tensor(padded_ids).to(self.device),
            "label_id": torch.tensor(padded_labels).to(self.device),
            'seq_length': torch.tensor(lengths),
            "sample": ordered,
        }


def _make_loader(dataset):
    """Wrap `dataset` in a randomly sampled DataLoader that pads each batch."""
    return DataLoader(dataset,
                      sampler=RandomSampler(dataset),
                      batch_size=Config.batch_size,
                      drop_last=False,
                      collate_fn=dataset.collate_to_max_length)


# One dataset and one loader per split; all three use the same batch size.
dataset_train = NerDataset(train_df, Config.device)
train_dataloader = _make_loader(dataset_train)

dataset_valid = NerDataset(valid_df, Config.device)
valid_dataloader = _make_loader(dataset_valid)

dataset_test = NerDataset(test_df, Config.device)
test_dataloader = _make_loader(dataset_test)

# Building Custom loss functions

In [16]:
class L1_Loss:
    """Mean absolute error between raw logits and one-hot labels."""

    def __init__(self):
        self.l1_loss = nn.L1Loss()

    def loss(self, target, logit, label_num):
        """Return the L1 loss over all positions whose target is not -100.

        Args:
            target: Integer label tensor; -100 marks positions to ignore.
            logit: Score tensor whose last dimension has ``label_num`` entries.
            label_num: Number of classes.
        """
        flat_target = target.view(-1)
        flat_logit = logit.view(-1, label_num)

        # Keep only the rows that carry a real label.
        keep = flat_target.ne(-100).to(flat_logit.device)
        kept_logit = flat_logit[keep]
        kept_target = torch.masked_select(flat_target, keep)

        one_hot = F.one_hot(kept_target, num_classes=label_num).float()
        return self.l1_loss(kept_logit, one_hot)


class L2_Loss:
    """Mean squared error between raw logits and one-hot labels."""

    def __init__(self):
        self.mse_loss = nn.MSELoss()

    def loss(self, target, logit,label_num):
        """Return the MSE over all positions whose target is not -100.

        Args:
            target: Integer label tensor; -100 marks positions to ignore.
            logit: Score tensor whose last dimension has ``label_num`` entries.
            label_num: Number of classes.
        """
        labels = target.view(-1)
        scores = logit.view(-1, label_num)

        # Drop the ignored positions from both tensors.
        active = labels.ne(-100).to(scores.device)
        scores = scores[active]
        labels = torch.masked_select(labels, active)

        return self.mse_loss(scores, F.one_hot(labels, num_classes=label_num).float())

class CE_Loss:
    """Token-level cross entropy that skips positions labelled -100."""

    def __init__(self):
        # `reduction` is the supported keyword; the legacy boolean `reduce`
        # argument is deprecated and was never meant to receive a string.
        self.ce_loss = nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')

    def loss(self, target, logit, label_num):
        """Return the mean cross entropy over the non-ignored positions.

        Args:
            target: Integer label tensor; -100 marks positions to ignore.
            logit: Score tensor whose last dimension has ``label_num`` entries.
            label_num: Number of classes.
        """
        return self.ce_loss(logit.reshape(-1, label_num), target.reshape(-1) )

class KLDivergenceLoss:
    """KL divergence from the one-hot label distribution to the softmax of the logits."""

    def __init__(self):
        pass

    def loss(self, target, logit, label_num):
        """Return the mean KL(one_hot(target) || softmax(logit)) over non-ignored positions.

        Args:
            target: Integer label tensor; -100 marks positions to ignore.
            logit: Score tensor whose last dimension has ``label_num`` entries.
            label_num: Number of classes.
        """
        target = target.view(-1)
        logit = logit.view(-1, label_num)

        mask = target.ne(-100).to(logit.device)
        logit = torch.masked_select(logit, mask.unsqueeze(-1).expand_as(logit)).reshape(-1, label_num)
        target = torch.masked_select(target, mask)

        # log_softmax stays finite where softmax would underflow to exactly 0,
        # which would otherwise make log(true / prob) infinite.
        log_probs = F.log_softmax(logit, dim=-1)

        # One-hot encode the targets to get true probabilities
        true_probs = F.one_hot(target, num_classes=label_num).float()

        # Only classes with non-zero true probability contribute to the sum.
        mask_true_probs = true_probs > 0

        kl_values = torch.zeros_like(log_probs)
        kl_values[mask_true_probs] = true_probs[mask_true_probs] * (
            torch.log(true_probs[mask_true_probs]) - log_probs[mask_true_probs])

        # Sum over all classes and average over the kept positions
        loss = kl_values.sum(dim=-1).mean()

        return loss

# DLITE Loss function
class DLITELoss:
    """DLITE loss between the softmax of the logits and the one-hot labels."""

    def __init__(self):
        super(DLITELoss, self).__init__()

    def loss(self, targets, logits, label_num, epsilon=1e-10):
        """Return the mean DLITE value over the non-ignored positions.

        Args:
            targets: Integer label tensor; -100 marks positions to ignore.
            logits: Score tensor whose last dimension has ``label_num`` entries.
            label_num: Number of classes.
            epsilon: Small constant added inside the logarithms and used as the
                lower bound of the delta_h denominator.
        """
        targets = targets.view(-1)
        logits = logits.view(-1, label_num)

        mask = targets.ne(-100).to(logits.device)
        logits = torch.masked_select(logits, mask.unsqueeze(-1).expand_as(logits)).reshape(-1, label_num)
        targets = torch.masked_select(targets, mask)

        # Convert logits to probabilities using softmax
        probs = F.softmax(logits, dim=-1)

        # One-hot encode the targets to get true probabilities
        true_probs = F.one_hot(targets, num_classes=probs.size(-1)).float()

        # Define the g function
        g_values = torch.abs(probs * (1 - torch.log(probs + epsilon)) - true_probs * (1 - torch.log(true_probs + epsilon)))

        # Define the delta_h function. When a softmax probability underflows to
        # exactly 0 for a class whose true probability is also 0, the plain
        # denominator is 0 and 0/0 would turn the whole loss into NaN; bounding
        # it from below makes that term 0 instead.
        denominator = torch.clamp(2 * (probs + true_probs), min=epsilon)
        delta_h_values = torch.abs(probs**2 * (1 - 2 * torch.log(probs + epsilon)) - true_probs**2 * (1 - 2 * torch.log(true_probs + epsilon))) / denominator

        # Compute DLITE loss for each class
        dl_values = g_values - delta_h_values

        # Sum over all classes and average over the kept positions
        loss = dl_values.sum(dim=-1).mean()

        return loss

# Adding more Custom Function

# Adding Custom Layer

In [17]:
class LSTMEncoder(nn.Module):
    """LSTM applied to padded, variable-length sequences of hidden states."""

    def __init__(self, config):
        super().__init__()
        # Input and hidden widths are both config.hidden_size; depth and
        # direction come from config.lstm_layer_num and config.bi_lstm.
        self.lstm = nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=config.hidden_size,
            num_layers=config.lstm_layer_num,
            bidirectional=config.bi_lstm,
            batch_first=True,
        )

    def forward(self, hidden_state, seq_length):
        """Run the LSTM over the unpadded part of every sequence.

        Args:
            hidden_state: Batch-first tensor of padded sequences.
            seq_length: True length of each sequence. It is passed to
                pack_padded_sequence with default arguments, so the batch is
                expected to be ordered from longest to shortest.

        Returns:
            Batch-first tensor of LSTM outputs, padded again after the LSTM.
        """
        packed_input = pack_padded_sequence(hidden_state, seq_length, batch_first=True)
        packed_output, _final_states = self.lstm(packed_input)
        padded_output, _lengths = pad_packed_sequence(packed_output, batch_first=True)
        return padded_output



class Ner_Model(nn.Module):
    """Token classifier: pretrained encoder, optional LSTM, linear layer, selectable loss."""

    def __init__(self,config, label_num, loss_name):
        """Build the model.

        Args:
            config: Settings object providing ``model_name``, ``hidden_size``,
                ``lstm_layer_num`` and ``bi_lstm``.
            label_num: Number of output classes.
            loss_name: One of ``'ce'``, ``'l1'``, ``'l2'``, ``'kl'``, ``'dlite'``.

        Raises:
            ValueError: If ``loss_name`` is not one of the supported names.
        """
        super(Ner_Model, self).__init__()
        loss_classes = {
            'ce': CE_Loss,
            'l1': L1_Loss,
            'l2': L2_Loss,
            'kl': KLDivergenceLoss,
            'dlite': DLITELoss,
        }
        # Reject a bad name before the pretrained weights are loaded.
        if loss_name not in loss_classes:
            raise ValueError(
                f"unknown loss_name {loss_name!r}; expected one of {sorted(loss_classes)}")

        self.config = config
        # DeBERTa encoder (or whichever checkpoint config.model_name points to)
        self.model = transformers.AutoModel.from_pretrained(config.model_name)
        # Optional LSTM on top of the encoder output
        if config.lstm_layer_num > 0:
            self.lstm = LSTMEncoder(config)

        self.label_num = label_num

        # A bidirectional LSTM doubles the feature width fed to the classifier.
        if config.bi_lstm and config.lstm_layer_num > 0:
            self.classifier = nn.Linear(config.hidden_size * 2 , label_num)
        else:
            self.classifier = nn.Linear(config.hidden_size  , label_num)

        self.loss_func = loss_classes[loss_name]()

    def forward(self, input_ids, seq_length, attention_mask, labels):
        """Return ``(loss, logit)`` for one batch.

        Args:
            input_ids: Padded token ids.
            seq_length: Unpadded sequence lengths; only used when the LSTM is enabled.
            attention_mask: Mask passed to the encoder alongside ``input_ids``.
            labels: Target label ids, passed to the loss together with the logits.
        """
        output = self.model(input_ids, attention_mask)
        sequence_output = output[0]
        if self.config.lstm_layer_num > 0:
            sequence_output = self.lstm(sequence_output, seq_length)
        logit = self.classifier(sequence_output)
        # Use the class count this model was built with, not a notebook global.
        loss = self.loss_func.loss(labels, logit, self.label_num)
        return loss, logit

In [18]:
# Building Optimizer
def get_optimizer(model, config):
    """Create an AdamW optimizer with two parameter groups.

    Parameters whose name contains ``bias``, ``LayerNorm.bias`` or
    ``LayerNorm.weight`` get no weight decay; all others use a decay of 0.1.

    Args:
        model: Module whose named parameters are optimised.
        config: Settings object providing the learning rate as ``lr``.

    Returns:
        The configured ``AdamW`` instance (betas 0.9 / 0.98).
    """
    no_decay_markers = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decayed, undecayed = [], []
    for name, parameter in model.named_parameters():
        if any(marker in name for marker in no_decay_markers):
            undecayed.append(parameter)
        else:
            decayed.append(parameter)

    parameter_groups = [
        {'params': decayed, 'weight_decay': 0.1},
        {'params': undecayed, 'weight_decay': 0.0},
    ]
    return AdamW(parameter_groups, betas=(0.9, 0.98), lr=config.lr)

# Define the training functions

In [19]:
def evaluate(model, data_loader, mode="Validation"):
    """Score `model` on every batch of `data_loader` and print a report.

    Gradients are disabled for the whole pass. The function does not switch
    the model to eval mode itself; the caller is expected to do so.

    Args:
        model: Callable returning ``(loss, logit)`` for
            ``(ids, seq_length, attention_mask=..., labels=...)``.
        data_loader: Iterable of batches with ``id``, ``label_id`` and
            ``seq_length`` entries.
        mode: Prefix used in the printed messages.

    Returns:
        Tuple ``(eval_loss, p_r_f1, p_r_f1_each_label)``: the loss averaged
        over batches, a DataFrame of micro/macro/weighted precision, recall
        and F1, and the output of ``classification_report``.

    Raises:
        ValueError: If `data_loader` yields no batch.
    """
    eval_loss, eval_accuracy = 0, 0
    num_steps = 0
    eval_preds, eval_labels = [], []
    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            # Id 0 is the value the collate function pads input ids with.
            attention_mask = batch["id"].ne(0)
            targets = batch['label_id']
            loss, logit = model(batch["id"], batch['seq_length'], attention_mask=attention_mask,
                                             labels=targets)
            eval_loss += loss.cpu().item()
            num_steps = step + 1
            if num_steps % 100==0:
                loss_step = eval_loss / num_steps
                print(f"{mode} loss per 100 evaluation steps: {loss_step}")

            # Compute the accuracy of this batch over the labelled tokens only.
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = logit.view(-1, len(label2id)) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            active_accuracy = flattened_targets.ne(-100) # True where the token has a real label
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

            eval_preds.extend(predictions.tolist())
            eval_labels.extend(targets.tolist())

    # Without this guard an empty loader would fail below with an unbound `step`.
    if num_steps == 0:
        raise ValueError(f"{mode} data loader yielded no batches; nothing to evaluate")

    # Both values are averages of per-batch numbers, not per-token averages.
    eval_loss = eval_loss / num_steps
    eval_accuracy = eval_accuracy / num_steps

    eval_labels,eval_preds = [id2label[i] for i in eval_labels], [id2label[i] for i in eval_preds]


    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(eval_labels, eval_preds, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(eval_labels, eval_preds, average='macro')
    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(eval_labels, eval_preds,average='weighted')

    p_r_f1 = [[round(precision_micro,4), round(recall_micro,4), round(f1_micro,4)],
              [round(precision_macro,4), round(recall_macro,4), round(f1_macro,4)],
              [round(precision_weighted,4), round(recall_weighted,4), round(f1_weighted,4)]]

    p_r_f1 = pd.DataFrame(p_r_f1, columns=['precision', 'recall', 'f1'], index=['micro', 'macro', 'weighted'])

    print(f"{mode} Loss: {eval_loss}")
    print(f"{mode} Accuracy: {eval_accuracy}")

    p_r_f1_each_label = classification_report(eval_labels, eval_preds)
    print(f"{mode} P-R-F1 for each label: \n{p_r_f1_each_label}")
    print(f"{mode} P-R-F1 tor all label: \n{p_r_f1}")
    print(f"{mode} steps: {num_steps}")
    return eval_loss, p_r_f1, p_r_f1_each_label

In [20]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and undefined-metric warnings.
warnings.filterwarnings('ignore')
# Names of the loss functions to compare; each must be a name Ner_Model accepts.
loss_list = ['l1', 'l2', 'ce', 'kl', 'dlite']

# Running under 5 custom loss functions

In [21]:
from sklearn.metrics import accuracy_score
def train(config,loss_name):
    """Train a fresh model with one loss function and evaluate it after every epoch.

    Uses the notebook-level ``train_dataloader``, ``valid_dataloader`` and
    ``test_dataloader``.

    Args:
        config: Settings object (device, epochs, learning rate, model options).
        loss_name: Name of the loss function, forwarded to ``Ner_Model``.

    Returns:
        Dict of per-epoch lists: ``valid_loss_list``, ``test_loss_list``,
        ``valid_p_r_f1_list``, ``valid_each_label_p_r_f1_list``,
        ``test_p_r_f1_list`` and ``test_each_label_p_r_f1_list``.

    Raises:
        ValueError: If the training loader yields no batch in an epoch.
    """
    print("=" * 100)
    print(f"loss_name: {loss_name}")
    model = Ner_Model(config, len(label2id), loss_name).to(config.device)
    optimizer = get_optimizer(model, config)

    valid_each_label_p_r_f1_list = []
    valid_p_r_f1_list = []
    test_each_label_p_r_f1_list = []
    test_p_r_f1_list = []

    valid_loss_list = []
    test_loss_list = []

    model.train()
    interval = 100
    for epoch in range(config.epochs):
        print(f"Training epoch: {epoch + 1}")
        total_loss = 0.0
        tr_accuracy = 0.0
        num_steps = 0
        for step, batch in enumerate(train_dataloader):
            # Id 0 is the value the collate function pads input ids with.
            attention_mask = batch["id"].ne(0)
            targets = batch['label_id']
            loss, logit= model(batch["id"], batch['seq_length'], attention_mask=attention_mask,
                                             labels=targets)
            num_steps = step + 1

            # Compute the accuracy of this batch over the labelled tokens only.
            # Predictions are not accumulated across steps: nothing reads them,
            # and keeping them would hold device tensors for the whole epoch.
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = logit.view(-1, len(label2id)) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            active_accuracy = flattened_targets.ne(-100) # True where the token has a real label
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            tr_accuracy += tmp_tr_accuracy

            total_loss += loss.item()
            if num_steps % interval == 0:
                print(f"Training loss per 100 training steps: {total_loss / num_steps}")

            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

        # Without this guard an empty loader would fail below with an unbound `step`.
        if num_steps == 0:
            raise ValueError("train_dataloader yielded no batches; nothing to train on")

        print(f"Training loss epoch: {total_loss / num_steps}")
        print(f"Training accuracy epoch: {tr_accuracy / num_steps}")
        print(f"Training steps: {num_steps}")
        print("\n\n")
        model.eval()


        valid_loss, valid_p_r_f1,  valid_each_label_p_r_f1 = evaluate(model,valid_dataloader, "Validation")
        valid_loss_list.append(valid_loss)
        valid_p_r_f1_list.append(valid_p_r_f1)
        valid_each_label_p_r_f1_list.append(valid_each_label_p_r_f1)

        print("\n\n")
        test_loss, test_p_r_f1,test_each_label_p_r_f1  = evaluate(model,test_dataloader, "Test")
        test_loss_list.append(test_loss)
        test_p_r_f1_list.append(test_p_r_f1)
        test_each_label_p_r_f1_list.append(test_each_label_p_r_f1)

        # Back to training mode for the next epoch.
        model.train()
    return   {
              "valid_loss_list":valid_loss_list,
              "test_loss_list":test_loss_list,
              "valid_p_r_f1_list":valid_p_r_f1_list,
              "valid_each_label_p_r_f1_list":valid_each_label_p_r_f1_list,

              "test_p_r_f1_list":test_p_r_f1_list,
              "test_each_label_p_r_f1_list": test_each_label_p_r_f1_list}


# Train one fresh model per loss function and keep its metrics keyed by loss
# name. Iterating loss_list keeps this in step with the reporting cells below.
result = {}
for loss_name in loss_list:
    r = train(Config, loss_name)
    result[loss_name] = r

loss_name: l1
Training epoch: 1
Training loss per 100 training steps: 0.14019331697374582
Training loss per 100 training steps: 0.09047539689578116
Training loss per 100 training steps: 0.07123233652363221
Training loss per 100 training steps: 0.061272875699214635
Training loss per 100 training steps: 0.05376829232275486
Training loss per 100 training steps: 0.048991816507962846
Training loss per 100 training steps: 0.04550060167243438
Training loss per 100 training steps: 0.042346934363013136
Training loss per 100 training steps: 0.03998903840159376
Training loss epoch: 0.03816235883841141
Training accuracy epoch: 0.9532297103797043
Training steps: 992



Validation loss per 100 evaluation steps: 0.01085593380499631
Validation loss per 100 evaluation steps: 0.010502557061845436
Validation loss per 100 evaluation steps: 0.010914422939531505
Validation Loss: 0.010800059584820746
Validation Accuracy: 0.9612090411365917
Validation P-R-F1 for each label: 
              precision    recall 

In [22]:
import pickle
# Persist the collected metrics to result.pkl.
with open("result.pkl", mode="wb") as result_file:
    pickle.dump(result, result_file)

# Result Comparison after cleaning

### Overall Result

In [23]:
# For each averaging mode, print one table with the last-epoch test scores of every loss.
columns = ['loss', 'precision', 'recall', 'f1']
for t in ['micro', 'macro', 'weighted']:
    df = [
        {'loss': loss_name,
         **{metric: result[loss_name]['test_p_r_f1_list'][-1].loc[t, metric]
            for metric in columns[1:]}}
        for loss_name in loss_list
    ]
    print("="*100)
    print(t)
    print(pd.DataFrame(df))

micro
    loss  precision  recall      f1
0     l1     0.7659  0.7659  0.7659
1     l2     0.7659  0.7659  0.7659
2     ce     0.8442  0.8442  0.8442
3     kl     0.7693  0.7693  0.7693
4  dlite     0.7831  0.7831  0.7831
macro
    loss  precision  recall      f1
0     l1     0.0851  0.1111  0.0964
1     l2     0.1962  0.1111  0.0964
2     ce     0.2874  0.3402  0.2967
3     kl     0.1271  0.1368  0.1297
4  dlite     0.1251  0.1724  0.1434
weighted
    loss  precision  recall      f1
0     l1     0.5866  0.7659  0.6644
1     l2     0.6486  0.7659  0.6644
2     ce     0.7934  0.8442  0.8131
3     kl     0.6255  0.7693  0.6884
4  dlite     0.6485  0.7831  0.7086


### Each Label Result

In [24]:
# Print the last-epoch per-label test report of every loss function.
print("test dataset")
for loss_name in loss_list:
    print("-"*50, loss_name, result[loss_name]['test_each_label_p_r_f1_list'][-1], sep="\n")

test dataset
--------------------------------------------------
l1
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00      1668
      B-MISC       0.00      0.00      0.00       702
       B-ORG       0.00      0.00      0.00      1661
       B-PER       0.00      0.00      0.00      1617
       I-LOC       0.00      0.00      0.00      1394
      I-MISC       0.00      0.00      0.00       736
       I-ORG       0.00      0.00      0.00      2804
       I-PER       0.00      0.00      0.00      3810
           O       0.77      1.00      0.87     47094

    accuracy                           0.77     61486
   macro avg       0.09      0.11      0.10     61486
weighted avg       0.59      0.77      0.66     61486

--------------------------------------------------
l2
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00      1668
      B-MISC       0.00      0.00      0.00       702
       B-OR