In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
import random ,json
import numpy as np


import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, RandomSampler, Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from torch.optim import AdamW
from tqdm import tqdm

## Setting Basic Parameters


In [None]:
class Config:
    """Experiment settings for the whole notebook.

    Every setting is a class attribute, so the class is used directly
    (e.g. ``Config.batch_size``) and is never instantiated.
    """

    # ---- optimisation ----
    batch_size = 4
    epochs = 1
    lr = 2e-5
    seed = 123

    # ---- extra head between encoder and classifier ----
    # Exactly one of the four ``only_use_*`` flags below must be True
    # (checked by the assert further down).

    # Head made of plain Linear + ReLU layers.
    only_use_standard_linear_layer = True
    # How many leading entries of ``linear_layer`` are used; 0 disables the extra head.
    linear_layer_num = 0
    # Output width of each candidate linear layer.
    linear_layer = [512, 512, 512, 512, 512, 512, 512, 512]

    # Head made of Linear + ReLU + Dropout layers; layer i uses ``dropout_prob[i]``.
    only_use_dropout = False
    # Dropout probabilities; the residual head reads entries 0 and 1.
    dropout_prob = [0.05, 0, 0.05]

    # Residual head without dropout.
    only_use_residual = False
    # Residual head with dropout applied on the skip connections.
    only_use_residual_and_dropout = False

    # Guard against selecting zero or several head variants at once.
    assert (int(bool(only_use_standard_linear_layer))
            + int(bool(only_use_dropout))
            + int(bool(only_use_residual))
            + int(bool(only_use_residual_and_dropout))) == 1

    # ---- optional LSTM on top of the head ----
    # 0 disables the LSTM; otherwise this is the number of stacked LSTM layers.
    lstm_layer_num = 0
    bi_lstm = True

    # ---- pretrained encoder (downloaded by name) ----
    # model_name = "microsoft/deberta-v3-base"
    model_name = "bert-base-uncased"
    # model_name = "deberta-base"

    # Width of the encoder output that the head / classifier consumes.
    hidden_size = 768
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Which dataset to load: "conll2003" or "ner_datasetreference".
    train_data_name = "ner_datasetreference"

    @classmethod
    def describe(cls):
        """Return the experiment settings as a pretty-printed JSON string."""
        settings = {"train_data_name": cls.train_data_name,
                    "encoder_name": cls.model_name}
        # Remaining entries are reported under their attribute name, in this order.
        for attr in ("batch_size", "epochs", "lr", "seed",
                     "bi_lstm", "lstm_layer_num",
                     "linear_layer", "linear_layer_num", "dropout_prob",
                     "only_use_standard_linear_layer", "only_use_dropout",
                     "only_use_residual", "only_use_residual_and_dropout"):
            settings[attr] = getattr(cls, attr)
        return json.dumps(settings, ensure_ascii=False, indent=2)

# Seed every random number generator used in this notebook with the same value
# (Python, NumPy, PyTorch CPU, and all CUDA devices) so runs are repeatable.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(Config.seed)



## Resulting Configuration

In [None]:
# Print the active experiment settings so they are recorded next to the results.
# NOTE(review): the saved output of this cell shows "lr": 1e-05 while Config.lr is 2e-5,
# so the output looks stale — re-run the notebook to refresh it.
print(Config.describe())

{
  "train_data_name": "ner_datasetreference",
  "encoder_name": "bert-base-uncased",
  "batch_size": 4,
  "epochs": 1,
  "lr": 1e-05,
  "seed": 123,
  "bi_lstm": true,
  "lstm_layer_num": 0,
  "linear_layer": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "linear_layer_num": 0,
  "dropout_prob": [
    0.05,
    0,
    0.05
  ],
  "only_use_standard_linear_layer": true,
  "only_use_dropout": false,
  "only_use_residual": false,
  "only_use_residual_and_dropout": false
}


## Import dataset

In [None]:
def read_conll2003(file_path):
    """Parse a CoNLL-2003 style file into one DataFrame row per sentence.

    The first line of the file is skipped, blank lines separate sentences, and
    every other line must hold exactly four whitespace-separated columns; the
    first column is taken as the word and the last one as its tag.

    Args:
        file_path: path of the text file to read.

    Returns:
        A DataFrame with the columns ``word`` and ``tag``; each cell is the
        list of words / tags of one sentence. An input without sentences
        yields an empty frame that still has both columns.

    Raises:
        AssertionError: if a non-blank line does not have four columns.
    """
    sentences = []
    current = []
    # The context manager closes the file even when a malformed line aborts parsing.
    with open(file_path) as handle:
        for idx, line in enumerate(handle):
            if idx == 0:
                # First line is a header, not a token.
                continue
            line = line.strip()
            if line == "":
                # Sentence boundary; consecutive blank lines add nothing.
                if current:
                    sentences.append(current)
                current = []
            else:
                columns = line.split()
                assert len(columns) == 4
                current.append([columns[0], columns[-1]])
    # The last sentence may not be followed by a blank line.
    if current:
        sentences.append(current)
    rows = [{"word": [pair[0] for pair in sentence], "tag": [pair[1] for pair in sentence]}
            for sentence in sentences]
    return pd.DataFrame(rows, columns=["word", "tag"])

# Load the dataset selected by Config.train_data_name ("conll2003" or "ner_datasetreference")
# and build train_df / valid_df / test_df, each holding list-valued "word" and "tag" columns.
if Config.train_data_name == "conll2003":
    # Each split is read from its own file inside the "conll2003" directory.
    train_path = os.path.join(Config.train_data_name, 'train.txt')
    dev_path = os.path.join(Config.train_data_name, 'valid.txt')
    test_path = os.path.join(Config.train_data_name, 'test.txt')
    train_df = read_conll2003(train_path)
    valid_df = read_conll2003(dev_path)
    test_df = read_conll2003(test_path)
    print(train_df.shape, valid_df.shape, test_df.shape)
elif Config.train_data_name == "ner_datasetreference":
    # One CSV row per token; the file is read from the current working directory.
    df = pd.read_csv("ner_datasetreference.csv", encoding='iso-8859-1')
    data = []
    word, tag = [], []
    for i,j,k in zip(df['Sentence #'], df['Word'], df['Tag']):
        # A non-null 'Sentence #' cell marks the first token of a new sentence:
        # store the sentence collected so far and start a fresh one.
        if not pd.isnull(i):
            assert i.startswith('Sentence')
            if len(word) > 0:
                data.append({"word":word, "tag":tag})
            word, tag = [], []
        # Rows whose word or tag is not a string (presumably missing values) are skipped.
        if isinstance(j, str) and isinstance(k, str):
            # Tokens tagged with the 'art', 'eve' or 'nat' categories are dropped from the
            # sentence entirely (word and tag), for better macro results.
            if any( t in k for t in ['art', 'eve', 'nat']):
                continue
            word.append(j)
            tag.append(k)
    # Store the final sentence, which has no following sentence marker.
    if len(word) > 0:
        data.append({"word":word, "tag":tag})
        word, tag = [], []
    print(data[0], data[-1])
    df = pd.DataFrame(data)
    # 60% train, then the remaining 40% is halved into validation and test (20% each).
    train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
    valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    print(df.shape, train_df.shape, valid_df.shape, test_df.shape)

{'word': ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], 'tag': ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']} {'word': ['Indian', 'forces', 'said', 'they', 'responded', 'to', 'the', 'attack'], 'tag': ['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
(47959, 2) (28775, 2) (9592, 2) (9592, 2)


In [None]:
train_df

Unnamed: 0,word,tag
28195,"[Egypt, agreed, two, years, ago, to, create, a...","[B-geo, O, B-tim, O, O, O, O, O, O, O, O, O, O..."
18202,"[The, new, tax, will, be, 33, percent, ,, up, ...","[O, O, O, O, O, B-tim, O, O, O, O, O, O, O]"
32420,"[At, least, one, person, was, killed, in, the,...","[O, O, O, O, O, O, O, O, B-gpe, O, O, B-geo, B..."
24529,"[Sri, Lankan, officials, say, eight, governmen...","[B-per, I-per, O, O, O, O, O, O, O, O, O, O, O..."
28005,"[Mr., Peres, said, the, statement, from, Presi...","[B-per, I-per, O, O, O, O, B-per, I-per, I-per..."
...,...,...
11284,"[However, ,, poverty, ,, illiteracy, ,, and, u...","[O, O, O, O, O, O, O, O, O, O, O, O]"
44732,"[The, report, was, published, two, days, after...","[O, O, O, O, O, O, O, O, O, O, B-org, I-org, O..."
38158,"[In, a, separate, incident, ,, kidnappers, rel...","[O, O, O, O, O, O, O, O, O, O, O, B-tim, O, O,..."
860,"[Those, groups, were, shut, down, by, U.S., of...","[O, O, O, O, O, O, B-org, O, O, O, B-tim, O]"


In [None]:
test_df

Unnamed: 0,word,tag
108,"[One, Afghan, soldier, was, killed, and, four,...","[O, B-gpe, O, O, O, O, O, O, O, O, O, B-gpe, O..."
15744,"[Indonesian, police, raided, a, suspected, mil...","[B-gpe, O, O, O, O, O, O, O, B-geo, O, O, B-ge..."
46327,"[No, government, soldiers, were, injured, .]","[O, O, O, O, O, O]"
22795,"[Suspected, Islamic, militants, fired, a, barr...","[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-..."
6753,"[Afghanistan, 's, election, commission, says, ...","[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
...,...,...
646,"[Earlier, Thursday, ,, Israeli, Prime, Ministe...","[O, B-tim, O, B-gpe, B-per, I-per, I-per, I-pe..."
9396,"[Russia, is, one, of, five, foreign, powers, w...","[B-geo, O, O, O, O, O, O, O, O, B-geo, I-geo, ..."
38920,"[Mr., Olmert, said, resuming, peace, talks, wi...","[B-per, I-per, O, O, O, O, O, O, O, B-geo, O, ..."
23748,"[On, Thursday, ,, at, least, 27, people, were,...","[O, B-tim, O, O, O, O, O, O, O, O, O, O, O, O,..."


In [None]:
valid_df

Unnamed: 0,word,tag
33705,"[The, APEC, officials, are, discussing, contin...","[O, B-org, O, O, O, O, O, O, O, O, O, O, O, O,..."
36195,"[The, local, deputy, police, chief, ,, Amanull...","[O, O, O, O, O, O, B-per, I-per, O, O, B-org, ..."
36377,"[The, French, news, agency, quotes, the, regio...","[O, B-gpe, O, O, O, O, O, O, O, O, O, O, O, O,..."
23094,"[Revenue, declined, 8, %, to, $, 85.7, million...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
16077,"[The, $, 500, billion, spending, package, woul...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
37165,"[The, president-elect, won, the, May, 10, auto...","[O, O, O, O, B-tim, I-tim, O, O, O, O, O, O, O..."
5173,"[Democrats, accuse, Mr., Bush, of, emphasizing...","[O, O, B-per, I-per, O, O, O, O, O, O, O, O, O..."
36524,"[Socialist, rule, was, brought, to, a, close, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
31122,"[A, border, war, with, Peru, that, flared, in,...","[O, O, O, O, B-geo, O, O, O, B-tim, O, O, O, B..."


In [None]:
def collect_label(df_list):
    """Return the sorted entity categories that occur in the given frames.

    Args:
        df_list: iterable of objects whose ``'tag'`` entry yields one list of
            tag strings per sentence.

    Returns:
        Sorted list of category names, i.e. the tags with their ``B-`` / ``I-``
        prefix removed; the ``"O"`` tag is ignored.

    Raises:
        AssertionError: if a tag other than ``"O"`` lacks a ``B-`` / ``I-`` prefix.
    """
    categories = set()
    for frame in df_list:
        for sentence_tags in frame['tag']:
            for tag_name in sentence_tags:
                if tag_name != "O":
                    assert tag_name.startswith(("B-", "I-"))
                    # Strip the two-character prefix to keep only the category.
                    categories.add(tag_name[2:])
    return sorted(categories)

# Entity categories that occur in any of the three splits.
ner_category = collect_label([train_df, valid_df, test_df])
# "O" comes first (id 0); each category then contributes its B- tag followed by its I- tag.
label_list = ['O'] + [prefix + category for category in ner_category for prefix in ("B-", "I-")]
# Lookup tables between tag names and the integer ids used as training targets.
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = {idx: name for idx, name in enumerate(label_list)}
print(f"ner category {ner_category} .\n\nlabel list {label_list} .\n\nlabel2id {label2id} .\n\nid2label {id2label}\n\n")

ner category ['geo', 'gpe', 'org', 'per', 'tim'] .

label list ['O', 'B-geo', 'I-geo', 'B-gpe', 'I-gpe', 'B-org', 'I-org', 'B-per', 'I-per', 'B-tim', 'I-tim'] .

label2id {'O': 0, 'B-geo': 1, 'I-geo': 2, 'B-gpe': 3, 'I-gpe': 4, 'B-org': 5, 'I-org': 6, 'B-per': 7, 'I-per': 8, 'B-tim': 9, 'I-tim': 10} .

id2label {0: 'O', 1: 'B-geo', 2: 'I-geo', 3: 'B-gpe', 4: 'I-gpe', 5: 'B-org', 6: 'I-org', 7: 'B-per', 8: 'I-per', 9: 'B-tim', 10: 'I-tim'}




## Load the Pretrained Tokenizer

In [None]:
# Load the tokenizer that matches the encoder named in Config.model_name.
# NOTE(review): add_prefix_space is an option of byte-level BPE tokenizers (RoBERTa-style);
# presumably it has no effect for the "bert-base-uncased" tokenizer — confirm.
tokenizer = transformers.AutoTokenizer.from_pretrained(Config.model_name, add_prefix_space=True)
# The label alignment relies on word_ids(), which needs a fast tokenizer, so this should print True.
print(tokenizer.is_fast)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

True




## Tokenize and Build Labels

In [None]:
def align(tag, word_ids):
    """Spread word-level tags over sub-word tokens.

    Args:
        tag: list of tag strings, one per word.
        word_ids: for every token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list with one entry per token: ``None`` where the word id is
        ``None``; otherwise the tag of the token's word, except that the
        second and later tokens of a word tagged ``B-X`` receive ``I-X``.
    """
    aligned_tag = []
    for position, word_index in enumerate(word_ids):
        if word_index is None:
            aligned_tag.append(None)
            continue
        word_tag = tag[word_index]
        # A token continues a word when the token right before it has the same word id.
        continues_word = position > 0 and word_ids[position - 1] == word_index
        if continues_word and word_tag.startswith("B-"):
            # Only the first piece of a word may open an entity.
            aligned_tag.append("I-" + word_tag[2:])
        else:
            aligned_tag.append(word_tag)
    return aligned_tag


In [None]:
# Sanity check of align() on a hand-made sentence whose middle words are split into
# several sub-word tokens. To inspect real data instead, take the words and tags
# from a row of train_df.
words = ['I', '1996-08-22', '1996-08-22', 'I']
tag = ["O", "B-LOC", "B-ORG", "O"]
print(words, tag)
# is_split_into_words=True tells the tokenizer the input is already a list of words.
s = tokenizer(words, truncation=True, is_split_into_words=True)
# For every token: index of the word it came from (None for tokens that belong to no word).
word_ids = s.word_ids()
# Show tokens, aligned tags and word indices side by side.
tokens = tokenizer.convert_ids_to_tokens(s['input_ids'])
tags = align(tag, s.word_ids())
print(pd.DataFrame(list(zip(tokens, tags, word_ids)), columns=['tokens', 'tags', 'word-index']))


['I', '1996-08-22', '1996-08-22', 'I'] ['O', 'B-LOC', 'B-ORG', 'O']
   tokens   tags  word-index
0   [CLS]   None         NaN
1       i      O         0.0
2    1996  B-LOC         1.0
3       -  I-LOC         1.0
4      08  I-LOC         1.0
5       -  I-LOC         1.0
6      22  I-LOC         1.0
7    1996  B-ORG         2.0
8       -  I-ORG         2.0
9      08  I-ORG         2.0
10      -  I-ORG         2.0
11     22  I-ORG         2.0
12      i      O         3.0
13  [SEP]   None         NaN


In [None]:
def preprocess(x):
    """Tokenize one sentence and align its tags to the resulting tokens.

    Args:
        x: mapping (e.g. a DataFrame row) with ``'word'`` (list of words) and
            ``'tag'`` (list of tags, one per word).

    Returns:
        A 5-tuple ``(tokens, aligned_tags, input_ids, label_ids, word_ids)``.
        ``label_ids`` holds ``label2id`` values, with ``-100`` wherever the
        aligned tag is ``None``.
    """
    encoding = tokenizer(x['word'], truncation=True, is_split_into_words=True)
    input_ids = encoding['input_ids']
    word_ids = encoding.word_ids()
    token_strings = tokenizer.convert_ids_to_tokens(input_ids)
    aligned_tags = align(x['tag'], word_ids)
    # -100 marks positions without a label so that the loss functions can skip them.
    label_ids = [-100 if name is None else label2id[name] for name in aligned_tags]
    return token_strings, aligned_tags, input_ids, label_ids, word_ids


In [None]:
# Add the tokenised / aligned columns returned by preprocess() to each of the three splits.
_preprocessed_columns = ['token', 'label', 'id', 'label_id', 'word_ids']
for _split_df in (train_df, valid_df, test_df):
    _split_df[_preprocessed_columns] = _split_df.apply(lambda x: pd.Series(preprocess(x)), axis=1)

In [None]:
test_df

Unnamed: 0,word,tag,token,label,id,label_id,word_ids
108,"[One, Afghan, soldier, was, killed, and, four,...","[O, B-gpe, O, O, O, O, O, O, O, O, O, B-gpe, O...","[[CLS], one, afghan, soldier, was, killed, and...","[None, O, B-gpe, O, O, O, O, O, O, O, O, O, B-...","[101, 2028, 12632, 5268, 2001, 2730, 1998, 217...","[-100, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1..."
15744,"[Indonesian, police, raided, a, suspected, mil...","[B-gpe, O, O, O, O, O, O, O, B-geo, O, O, B-ge...","[[CLS], indonesian, police, raided, a, suspect...","[None, B-gpe, O, O, O, O, O, O, O, B-geo, I-ge...","[101, 9003, 2610, 18784, 1037, 6878, 16830, 29...","[-100, 3, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11..."
46327,"[No, government, soldiers, were, injured, .]","[O, O, O, O, O, O]","[[CLS], no, government, soldiers, were, injure...","[None, O, O, O, O, O, O, None]","[101, 2053, 2231, 3548, 2020, 5229, 1012, 102]","[-100, 0, 0, 0, 0, 0, 0, -100]","[None, 0, 1, 2, 3, 4, 5, None]"
22795,"[Suspected, Islamic, militants, fired, a, barr...","[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-...","[[CLS], suspected, islamic, militants, fired, ...","[None, O, O, O, O, O, O, O, O, O, O, O, O, B-g...","[101, 6878, 5499, 17671, 5045, 1037, 19359, 19...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1..."
6753,"[Afghanistan, 's, election, commission, says, ...","[B-geo, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[[CLS], afghanistan, ', s, election, commissio...","[None, B-geo, O, O, O, O, O, O, O, O, O, O, O,...","[101, 7041, 1005, 1055, 2602, 3222, 2758, 2009...","[-100, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11..."
...,...,...,...,...,...,...,...
646,"[Earlier, Thursday, ,, Israeli, Prime, Ministe...","[O, B-tim, O, B-gpe, B-per, I-per, I-per, I-pe...","[[CLS], earlier, thursday, ,, israeli, prime, ...","[None, O, B-tim, O, B-gpe, B-per, I-per, I-per...","[101, 3041, 9432, 1010, 5611, 3539, 2704, 1612...","[-100, 0, 9, 0, 3, 7, 8, 8, 8, 0, 0, 0, 0, 0, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1..."
9396,"[Russia, is, one, of, five, foreign, powers, w...","[B-geo, O, O, O, O, O, O, O, O, B-geo, I-geo, ...","[[CLS], russia, is, one, of, five, foreign, po...","[None, B-geo, O, O, O, O, O, O, O, O, B-geo, I...","[101, 3607, 2003, 2028, 1997, 2274, 3097, 4204...","[-100, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1..."
38920,"[Mr., Olmert, said, resuming, peace, talks, wi...","[B-per, I-per, O, O, O, O, O, O, O, B-geo, O, ...","[[CLS], mr, ., ol, ##mer, ##t, said, res, ##um...","[None, B-per, I-per, I-per, I-per, I-per, O, O...","[101, 2720, 1012, 19330, 5017, 2102, 2056, 245...","[-100, 7, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 0, 1, 1, 1, 2, 3, 3, 4, 5, 6, 7, 8, ..."
23748,"[On, Thursday, ,, at, least, 27, people, were,...","[O, B-tim, O, O, O, O, O, O, O, O, O, O, O, O,...","[[CLS], on, thursday, ,, at, least, 27, people...","[None, O, B-tim, O, O, O, O, O, O, O, O, O, O,...","[101, 2006, 9432, 1010, 2012, 2560, 2676, 2111...","[-100, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1..."


## Building Dataloader

In [None]:

class NerDataset(Dataset):
    """Map-style dataset over the rows of a preprocessed split.

    Each item is the plain dict of one DataFrame row; batching and padding
    happen in ``collate_to_max_length``, which is meant to be passed to a
    ``DataLoader`` as ``collate_fn``.
    """

    def __init__(self, df, device):
        """Store the rows of ``df`` as dicts and remember the target ``device``."""
        self.device = device
        self.data = df.to_dict(orient='records')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]

    def collate_to_max_length(self, batch):
        """Pad a list of samples to the longest one and build batch tensors.

        Samples are ordered from longest to shortest (ties keep their incoming
        order). Token ids are padded with 0 and label ids with -100.

        Args:
            batch: list of sample dicts with the list-valued keys ``'id'``
                and ``'label_id'``.

        Returns:
            dict with ``'id'`` and ``'label_id'`` (padded tensors moved to
            ``self.device``), ``'seq_length'`` (tensor of the unpadded lengths,
            not moved to the device) and ``'sample'`` (the reordered samples).
        """
        ordered = sorted(batch, key=lambda sample: len(sample['id']), reverse=True)
        lengths = [len(sample['id']) for sample in ordered]
        longest = max(lengths)
        padded_ids = [sample['id'] + [0] * (longest - len(sample['id'])) for sample in ordered]
        padded_labels = [sample['label_id'] + [-100] * (longest - len(sample['label_id']))
                         for sample in ordered]
        return {
            "id": torch.tensor(padded_ids).to(self.device),
            "label_id": torch.tensor(padded_labels).to(self.device),
            'seq_length': torch.tensor(lengths),
            "sample": ordered,
        }


def _make_loader(split_df):
    """Wrap one preprocessed split in a NerDataset and a randomly sampled DataLoader."""
    dataset = NerDataset(split_df, Config.device)
    loader = DataLoader(dataset,
                        sampler=RandomSampler(dataset),
                        batch_size=Config.batch_size,
                        drop_last=False,
                        collate_fn=dataset.collate_to_max_length)
    return dataset, loader


# All three splits share the same batching settings.
dataset_train, train_dataloader = _make_loader(train_df)
dataset_valid, valid_dataloader = _make_loader(valid_df)
dataset_test, test_dataloader = _make_loader(test_df)

## Building Custom loss functions

In [None]:

class L1_Loss:
    """Mean absolute error between the raw logits and one-hot targets."""

    def __init__(self):
        self.l1_loss = nn.L1Loss()

    def loss(self, target, logit, label_num):
        """Compute the loss over the positions whose target is not -100.

        Args:
            target: integer tensor of label ids; -100 marks positions to skip.
            logit: tensor of scores with ``label_num`` entries per position.
            label_num: number of classes.

        Returns:
            Scalar tensor: mean absolute difference between the selected logits
            and the one-hot encoding of their targets.
        """
        flat_target = target.view(-1)
        flat_logit = logit.view(-1, label_num)
        keep = flat_target.ne(-100).to(flat_logit.device)
        # Drop every row whose target is the ignore value.
        kept_logit = flat_logit.masked_select(keep.unsqueeze(-1).expand_as(flat_logit)).reshape(-1, label_num)
        kept_target = flat_target.masked_select(keep)
        one_hot = F.one_hot(kept_target, num_classes=label_num).float()
        return self.l1_loss(kept_logit, one_hot)


class L2_Loss:
    """Mean squared error between the raw logits and one-hot targets."""

    def __init__(self):
        self.mse_loss = nn.MSELoss()

    def loss(self, target, logit,label_num):
        """Compute the loss over the positions whose target is not -100.

        Args:
            target: integer tensor of label ids; -100 marks positions to skip.
            logit: tensor of scores with ``label_num`` entries per position.
            label_num: number of classes.

        Returns:
            Scalar tensor: mean squared difference between the selected logits
            and the one-hot encoding of their targets.
        """
        flat_target = target.view(-1)
        flat_logit = logit.view(-1, label_num)
        keep = flat_target.ne(-100).to(flat_logit.device)
        # Drop every row whose target is the ignore value.
        kept_logit = flat_logit.masked_select(keep.unsqueeze(-1).expand_as(flat_logit)).reshape(-1, label_num)
        kept_target = flat_target.masked_select(keep)
        one_hot = F.one_hot(kept_target, num_classes=label_num).float()
        return self.mse_loss(kept_logit, one_hot)

class CE_Loss:
    """Cross-entropy over the positions whose target is not -100."""

    def __init__(self):
        # ``reduction`` is the supported way to request averaging; the former
        # ``reduce='mean'`` passed a string to a deprecated boolean flag and only
        # produced the mean through the legacy fallback (with a deprecation warning).
        self.ce_loss = nn.CrossEntropyLoss(ignore_index=-100, reduction='mean')

    def loss(self, target, logit, label_num):
        """Return the mean cross-entropy between ``logit`` and ``target``.

        Args:
            target: integer tensor of label ids; -100 marks positions to skip.
            logit: tensor of scores with ``label_num`` entries per position.
            label_num: number of classes.

        Returns:
            Scalar tensor averaged over the positions that are not skipped.
        """
        return self.ce_loss(logit.reshape(-1, label_num), target.reshape(-1))

class KLDivergenceLoss:
    """KL divergence between one-hot targets and the predicted distribution."""

    def __init__(self):
        pass

    def loss(self, target, logit, label_num):
        """Return the mean KL divergence over the positions whose target is not -100.

        For a one-hot target distribution the divergence of a position reduces
        to ``-log q[target]``, where ``q`` is the softmax of its logits.

        Args:
            target: integer tensor of label ids; -100 marks positions to skip.
            logit: tensor of scores with ``label_num`` entries per position.
            label_num: number of classes.

        Returns:
            Scalar tensor averaged over the positions that are not skipped.
        """
        target = target.view(-1)
        logit = logit.view(-1, label_num)

        # Keep only the rows that carry a real label.
        mask = target.ne(-100).to(logit.device)
        logit = torch.masked_select(logit, mask.unsqueeze(-1).expand_as(logit)).reshape(-1, label_num)
        target = torch.masked_select(target, mask)

        # log_softmax stays finite where softmax followed by log would underflow
        # to log(1 / 0) = inf for a confidently wrong prediction.
        log_probs = F.log_softmax(logit, dim=-1)

        # Only the target class has a non-zero true probability (1.0), so the
        # sum over classes of p * log(p / q) is just -log q[target].
        per_position = -log_probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)

        return per_position.mean()

# DLITE Loss function
class DLITELoss:
    """DLITE loss between one-hot targets and the predicted distribution."""

    def __init__(self):
        super(DLITELoss, self).__init__()

    def loss(self, targets, logits, label_num, epsilon=1e-10):
        """Return the mean DLITE loss over the positions whose target is not -100.

        Args:
            targets: integer tensor of label ids; -100 marks positions to skip.
            logits: tensor of scores with ``label_num`` entries per position.
            label_num: number of classes.
            epsilon: small constant added inside the logarithms and used as the
                lower bound of the ``delta_h`` denominator.

        Returns:
            Scalar tensor: per-class ``g - delta_h`` summed over the classes and
            averaged over the positions that are not skipped.
        """
        targets = targets.view(-1)
        logits = logits.view(-1, label_num)

        # Keep only the rows that carry a real label.
        mask = targets.ne(-100).to(logits.device)
        logits = torch.masked_select(logits, mask.unsqueeze(-1).expand_as(logits)).reshape(-1, label_num)
        targets = torch.masked_select(targets, mask)

        # Convert logits to probabilities using softmax
        probs = F.softmax(logits, dim=-1)

        # One-hot encode the targets to get true probabilities
        true_probs = F.one_hot(targets, num_classes=probs.size(-1)).float()

        # Define the g function
        g_values = torch.abs(probs * (1 - torch.log(probs + epsilon))
                             - true_probs * (1 - torch.log(true_probs + epsilon)))

        # Define the delta_h function. When a predicted probability underflows to 0
        # for a non-target class, numerator and denominator are both 0; clamping the
        # denominator turns that 0 / 0 (NaN) into 0 and leaves all other entries
        # untouched.
        delta_h_numerator = torch.abs(probs**2 * (1 - 2 * torch.log(probs + epsilon))
                                      - true_probs**2 * (1 - 2 * torch.log(true_probs + epsilon)))
        delta_h_denominator = torch.clamp(2 * (probs + true_probs), min=epsilon)
        delta_h_values = delta_h_numerator / delta_h_denominator

        # Compute DLITE loss for each class
        dl_values = g_values - delta_h_values

        # Sum over all classes and average over batch size
        loss = dl_values.sum(dim=-1).mean()

        return loss



## Adding Custom Layer

In [None]:

class LSTMEncoder(nn.Module):
    """LSTM run over padded token representations.

    The LSTM keeps the feature width (``hidden_size`` in, ``hidden_size`` out
    per direction); depth and directionality come from ``config.lstm_layer_num``
    and ``config.bi_lstm``.
    """

    def __init__(self, config, hidden_size):
        super().__init__()
        self.lstm = torch.nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=config.lstm_layer_num,
            bidirectional=config.bi_lstm,
            batch_first=True,
        )

    def forward(self, hidden_state, seq_length):
        """Encode a padded batch while skipping the padded positions.

        Args:
            hidden_state: batch-first tensor of token representations.
            seq_length: unpadded length of every sequence. Packing uses the
                default ``enforce_sorted=True``, so the lengths must be in
                descending order.

        Returns:
            Batch-first tensor of LSTM outputs, re-padded after unpacking.
        """
        packed_input = pack_padded_sequence(hidden_state, seq_length, batch_first=True)
        packed_output, _final_states = self.lstm(packed_input)
        padded_output, _lengths = pad_packed_sequence(packed_output, batch_first=True)
        return padded_output







class LinearResidualLayer(nn.Module):
    """Three linear layers with skip connections and optional dropout.

    The first layer projects ``hidden_size`` to ``output_dim`` and is followed
    by ReLU and LayerNorm; the second and third layers keep the width and are
    added onto the earlier activations. The variant is chosen by
    ``config.only_use_residual`` (no dropout) or
    ``config.only_use_residual_and_dropout`` (dropout on the skip paths, using
    ``config.dropout_prob[0]`` and ``config.dropout_prob[1]``).
    """

    def __init__(self, config, hidden_size, output_dim):
        super(LinearResidualLayer, self).__init__()
        self.config = config
        self.linear_layer1 = nn.Linear(in_features=hidden_size, out_features=output_dim)
        self.linear_layer2 = nn.Linear(in_features=output_dim, out_features=output_dim)
        self.linear_layer3 = nn.Linear(in_features=output_dim, out_features=output_dim)
        self.act_func = nn.ReLU()
        # Dropout modules are only created when the variant is not "residual only".
        if not self.config.only_use_residual:
            self.dropout1 = nn.Dropout(config.dropout_prob[0])
        self.ln_1 = nn.LayerNorm(output_dim)
        if not self.config.only_use_residual:
            self.dropout2 = nn.Dropout(config.dropout_prob[1])

    def forward(self, x):
        """Apply the residual block to ``x``.

        Args:
            x: tensor whose last dimension equals the layer's ``hidden_size``.

        Returns:
            Tensor with the same leading dimensions and ``output_dim`` features.

        Raises:
            ValueError: if neither residual variant is enabled in the config.
        """
        use_residual = self.config.only_use_residual
        use_residual_and_dropout = self.config.only_use_residual_and_dropout
        if not (use_residual or use_residual_and_dropout):
            # The previous ``assert ValueError(...)`` could never fail (an exception
            # instance is truthy), so a bad config surfaced later as an
            # UnboundLocalError on the return value.
            raise ValueError("config error")

        x = self.act_func(self.linear_layer1(x))
        x = self.ln_1(x)

        if use_residual:
            y = x + self.act_func(self.linear_layer2(x))
            z = x + y + self.act_func(self.linear_layer3(y))
        else:
            y = self.dropout1(x) + self.act_func(self.linear_layer2(x))
            z = self.dropout2(x) + self.dropout1(y) + self.act_func(self.linear_layer3(y))
        return z




class Ner_Model(nn.Module):
    """Token classifier built on a pretrained transformer encoder.

    Pipeline: encoder -> optional linear head -> optional LSTM -> linear
    classifier. The head variant, LSTM depth and encoder name are read from
    ``config``; the loss is selected by ``loss_name``.
    """

    def __init__(self,config, label_num, loss_name):
        """Build the model.

        Args:
            config: settings object providing ``model_name``, ``hidden_size``,
                ``linear_layer``, ``linear_layer_num``, ``dropout_prob``,
                ``lstm_layer_num``, ``bi_lstm`` and the ``only_use_*`` flags.
            label_num: number of output classes.
            loss_name: one of ``'ce'``, ``'l1'``, ``'l2'``, ``'kl'``, ``'dlite'``.

        Raises:
            ValueError: if ``loss_name`` is not one of the supported names.
        """
        super(Ner_Model, self).__init__()

        # Validate the loss name before loading the encoder so that a typo fails
        # immediately. A bare ``assert`` would be stripped under ``python -O`` and
        # leave the model without a loss function.
        loss_classes = {'ce': CE_Loss,
                        'l1': L1_Loss,
                        'l2': L2_Loss,
                        'kl': KLDivergenceLoss,
                        'dlite': DLITELoss}
        if loss_name not in loss_classes:
            raise ValueError(f"unknown loss_name {loss_name!r}; expected one of {sorted(loss_classes)}")

        self.config = config
        # Pretrained encoder selected by name.
        self.model = transformers.AutoModel.from_pretrained(config.model_name)

        # Feature width flowing through the head; updated as layers are added.
        hidden_size = config.hidden_size

        # Optional head between encoder and classifier.
        if config.linear_layer_num > 0:
            used_dims = config.linear_layer[0:config.linear_layer_num]
            if config.only_use_standard_linear_layer:
                # Plain Linear + ReLU stack.
                layers = []
                for out_dim in used_dims:
                    layers.append(nn.Linear(in_features=hidden_size, out_features=out_dim))
                    layers.append(nn.ReLU())
                    hidden_size = out_dim
                self.linear_model = nn.Sequential(*layers)
            elif config.only_use_dropout:
                # Linear + ReLU + Dropout stack; layer i uses dropout_prob[i].
                layers = []
                for i, out_dim in enumerate(used_dims):
                    layers.append(nn.Linear(in_features=hidden_size, out_features=out_dim))
                    layers.append(nn.ReLU())
                    layers.append(nn.Dropout(config.dropout_prob[i]))
                    hidden_size = out_dim
                self.linear_model = nn.Sequential(*layers)
            else:
                # Residual head: three equally wide linear layers and two dropout rates.
                assert config.linear_layer[0] == config.linear_layer[1] == config.linear_layer[2]
                # The residual layer only reads dropout_prob[0] and dropout_prob[1], so
                # extra entries are allowed; demanding exactly two rejected configs
                # whose dropout_prob list has more entries.
                assert len(config.dropout_prob) >= 2
                self.linear_model = LinearResidualLayer(config, hidden_size,config.linear_layer[0])
                hidden_size = config.linear_layer[0]

        # Optional LSTM on top of the head.
        if config.lstm_layer_num > 0:
            self.lstm = LSTMEncoder(config,hidden_size)
            # A bidirectional LSTM concatenates both directions.
            if config.bi_lstm:
                hidden_size = hidden_size * 2

        self.label_num = label_num
        self.classifier = nn.Linear(hidden_size, label_num)
        self.loss_func = loss_classes[loss_name]()

        print("model configuration")
        print("%" * 20)
        print(self)
        print("%" * 20)

    def forward(self, input_ids, seq_length, attention_mask, labels):
        """Run the model and compute the loss.

        Args:
            input_ids: padded token ids.
            seq_length: unpadded length of every sequence (used by the LSTM).
            attention_mask: mask passed to the encoder.
            labels: target label ids, with -100 at positions to skip.

        Returns:
            Tuple ``(loss, logit)``, where ``logit`` holds ``label_num`` scores
            per token.
        """
        output = self.model(input_ids, attention_mask=attention_mask)
        sequence_output = output[0]
        if self.config.linear_layer_num > 0:
            sequence_output = self.linear_model(sequence_output)

        if self.config.lstm_layer_num > 0:
            sequence_output = self.lstm(sequence_output, seq_length)

        logit = self.classifier(sequence_output)
        # Use the class count the classifier was built with rather than a notebook global.
        loss = self.loss_func.loss(labels, logit, self.label_num)
        return loss, logit

In [None]:
# Building optimizer
def get_optimizer(model, config):
    """Create the AdamW optimizer with two parameter groups.

    Parameters whose name contains ``'bias'``, ``'LayerNorm.bias'`` or
    ``'LayerNorm.weight'`` get no weight decay; all others use 0.1.

    Args:
        model: module whose ``named_parameters()`` are optimised.
        config: settings object providing the learning rate ``lr``.

    Returns:
        An ``AdamW`` instance with betas ``(0.9, 0.98)``.
    """
    no_decay_markers = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decayed, not_decayed = [], []
    for name, parameter in model.named_parameters():
        if any(marker in name for marker in no_decay_markers):
            not_decayed.append(parameter)
        else:
            decayed.append(parameter)

    parameter_groups = [
        {'params': decayed, 'weight_decay': 0.1},
        {'params': not_decayed, 'weight_decay': 0.0},
    ]
    return AdamW(parameter_groups, betas=(0.9, 0.98), lr=config.lr)


## Defining the Evaluation and Training Functions

In [None]:

def evaluate(model, data_loader, mode="Validation"):
    """Evaluate ``model`` on ``data_loader`` and print a metric report.

    The model is switched to evaluation mode for the duration of the call, so
    that dropout is inactive, and its previous train/eval mode is restored
    afterwards. Relies on the notebook globals ``label2id`` / ``id2label`` and
    on ``accuracy_score`` being imported before the first call.

    Args:
        model: module called as ``model(ids, seq_length, attention_mask=..., labels=...)``
            and returning ``(loss, logit)``.
        data_loader: iterable of batches with the keys ``'id'``, ``'label_id'``
            and ``'seq_length'``.
        mode: label used as a prefix in the printed report.

    Returns:
        Tuple ``(eval_loss, p_r_f1, p_r_f1_each_label)``: the mean batch loss,
        a DataFrame with micro / macro / weighted precision, recall and F1,
        and the text of the per-label classification report.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    eval_loss, eval_accuracy = 0, 0
    num_batches = 0
    eval_preds, eval_labels = [], []

    # Remember the current mode so a surrounding training loop is left as it was.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for step, batch in enumerate(data_loader):
                # Token id 0 is the padding id used when the batch was collated.
                attention_mask = batch["id"].ne(0)
                targets = batch['label_id']
                loss, logit = model(batch["id"], batch['seq_length'], attention_mask=attention_mask,
                                    labels=targets)
                eval_loss += loss.cpu().item()
                num_batches = step + 1
                if num_batches % 100 == 0:
                    loss_step = eval_loss / num_batches
                    print(f"{mode} loss per 100 evaluation steps: {loss_step}")

                # Compare predictions and targets on the labelled positions only.
                flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
                flattened_predictions = torch.argmax(logit, dim=-1).reshape(-1)  # shape (batch_size * seq_len,)
                labelled = flattened_targets.ne(-100)  # True where a real label exists
                targets = torch.masked_select(flattened_targets, labelled)
                predictions = torch.masked_select(flattened_predictions, labelled)
                eval_accuracy += accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())

                eval_preds.extend(predictions.tolist())
                eval_labels.extend(targets.tolist())
    finally:
        model.train(was_training)

    if num_batches == 0:
        raise ValueError("data_loader produced no batches; nothing to evaluate")

    # Both values are averages over batches, not over tokens.
    eval_loss = eval_loss / num_batches
    eval_accuracy = eval_accuracy / num_batches
    eval_labels = [id2label[i] for i in eval_labels]
    eval_preds = [id2label[i] for i in eval_preds]

    summary_rows = []
    for average in ('micro', 'macro', 'weighted'):
        precision, recall, f1, _ = precision_recall_fscore_support(eval_labels, eval_preds, average=average)
        summary_rows.append([round(precision, 4), round(recall, 4), round(f1, 4)])
    p_r_f1 = pd.DataFrame(summary_rows, columns=['precision', 'recall', 'f1'],
                          index=['micro', 'macro', 'weighted'])

    print(f"{mode} Loss: {eval_loss}")
    print(f"{mode} Accuracy: {eval_accuracy}")

    p_r_f1_each_label = classification_report(eval_labels, eval_preds)
    print(f"{mode} P-R-F1 for each label: \n{p_r_f1_each_label}")
    print(f"{mode} P-R-F1 tor all label: \n{p_r_f1}")
    print(f"{mode} steps: {num_batches}")
    return eval_loss, p_r_f1, p_r_f1_each_label



## Running under 5 custom loss functions

In [None]:
import warnings
# Silence all Python warnings from here on to keep the training logs readable.
# NOTE(review): this also hides deprecation warnings raised by library calls.
warnings.filterwarnings('ignore')
# Short names of the loss functions to compare; each is a name that Ner_Model accepts.
loss_list = ['l1', 'l2', 'ce', 'kl', 'dlite']

In [None]:
from sklearn.metrics import accuracy_score
def train(config,loss_name):
    """Train a fresh ``Ner_Model`` with one loss function, evaluating after every epoch.

    The model is built from ``config`` and ``loss_name``, trained on the
    module-level ``train_dataloader`` for ``config.epochs`` epochs, and after
    each epoch evaluated on ``valid_dataloader`` and ``test_dataloader`` via
    ``evaluate``.

    Args:
        config: Settings object. This function reads ``config.device`` and
            ``config.epochs`` and forwards the whole object to ``Ner_Model``
            and ``get_optimizer``.
        loss_name: Name of the loss function; forwarded to ``Ner_Model``.

    Returns:
        dict: Six lists, each with one entry per epoch, under the keys
        ``valid_loss_list``, ``test_loss_list``, ``valid_p_r_f1_list``,
        ``valid_each_label_p_r_f1_list``, ``test_p_r_f1_list`` and
        ``test_each_label_p_r_f1_list``. The entries are the three values
        returned by ``evaluate`` for the respective split.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches in an epoch.
    """
    print("=" * 100)
    print(f"loss_name: {loss_name}")
    num_labels = len(label2id)
    model = Ner_Model(config, num_labels, loss_name).to(config.device)
    optimizer = get_optimizer(model, config)

    valid_each_label_p_r_f1_list = []
    valid_p_r_f1_list = []
    test_each_label_p_r_f1_list = []
    test_p_r_f1_list = []

    valid_loss_list = []
    test_loss_list = []

    # Report the running mean loss every `interval` optimisation steps.
    interval = 100
    for epoch in range(config.epochs):
        print(f"Training epoch: {epoch + 1}")
        # evaluate() is run with the model in eval mode at the end of each
        # epoch, so training mode is (re-)enabled at the start of every epoch.
        model.train()
        total_loss = 0.0
        tr_accuracy = 0.0
        num_steps = 0
        for num_steps, batch in enumerate(train_dataloader, start=1):
            # Attend to every position whose input id is non-zero
            # (presumably 0 is the padding id -- confirm against the tokenizer).
            attention_mask = batch["id"].ne(0)
            labels = batch['label_id']
            loss, logit = model(batch["id"], batch['seq_length'],
                                attention_mask=attention_mask, labels=labels)

            # Training accuracy is bookkeeping only, so keep it out of autograd.
            with torch.no_grad():
                flat_labels = labels.view(-1)
                flat_predictions = torch.argmax(logit.view(-1, num_labels), dim=1)
                # Positions labelled -100 are excluded from the accuracy.
                scored = flat_labels.ne(-100)
                scored_labels = torch.masked_select(flat_labels, scored)
                scored_predictions = torch.masked_select(flat_predictions, scored)
            tr_accuracy += accuracy_score(scored_labels.cpu().numpy(),
                                          scored_predictions.cpu().numpy())

            total_loss += loss.item()
            if num_steps % interval == 0:
                print(f"Training loss per {interval} training steps: {total_loss / num_steps}")

            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

        if num_steps == 0:
            # Without this guard the epoch summary below would divide by zero.
            raise ValueError("train_dataloader yielded no batches; nothing to train on")

        # Both figures are unweighted means over batches.
        print(f"Training loss epoch: {total_loss / num_steps}")
        print(f"Training accuracy epoch: {tr_accuracy / num_steps}")
        print(f"Training steps: {num_steps}")
        print("\n\n")
        model.eval()

        valid_loss, valid_p_r_f1, valid_each_label_p_r_f1 = evaluate(model, valid_dataloader, "Validation")
        valid_loss_list.append(valid_loss)
        valid_p_r_f1_list.append(valid_p_r_f1)
        valid_each_label_p_r_f1_list.append(valid_each_label_p_r_f1)

        print("\n\n")
        test_loss, test_p_r_f1, test_each_label_p_r_f1 = evaluate(model, test_dataloader, "Test")
        test_loss_list.append(test_loss)
        test_p_r_f1_list.append(test_p_r_f1)
        test_each_label_p_r_f1_list.append(test_each_label_p_r_f1)

    return   {
              "valid_loss_list":valid_loss_list,
              "test_loss_list":test_loss_list,

              "valid_p_r_f1_list":valid_p_r_f1_list,
              "valid_each_label_p_r_f1_list":valid_each_label_p_r_f1_list,

              "test_p_r_f1_list":test_p_r_f1_list,
              "test_each_label_p_r_f1_list": test_each_label_p_r_f1_list}


# Train one model per loss function and keep each run's per-epoch metrics,
# keyed by loss name. Iterating `loss_list` (rather than a second copy of the
# literal) keeps these keys in sync with the result cells, which index
# `result` by the same list.
result = {}
for loss_name in loss_list:
    result[loss_name] = train(Config, loss_name)

loss_name: l1


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

model configuration
%%%%%%%%%%%%%%%%%%%%
Ner_Model(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

## Result Comparison after cleaning

In [None]:
import pickle

# Serialize the `result` dictionary to result.pkl in the working directory.
with open("result.pkl", mode="wb") as result_file:
    pickle.dump(result, result_file)

#### Overall Result

In [None]:
# Column order of each summary table: the loss name followed by the metrics.
columns = ['loss', 'precision', 'recall', 'f1']
for t in ['micro', 'macro', 'weighted']:
    # One table per averaging scheme `t`; `df` collects one record per loss.
    df = []
    for loss_name in loss_list:
        # Metrics table of the last recorded epoch on the test split.
        final_test_scores = result[loss_name]['test_p_r_f1_list'][-1]
        row = {'loss': loss_name}
        for metric in columns[1:]:
            row[metric] = final_test_scores.loc[t, metric]
        df.append(row)
    print("=" * 100)
    print(t)
    print(pd.DataFrame(df))

micro
    loss  precision  recall      f1
0     l1     0.9377  0.9377  0.9377
1     l2     0.9572  0.9572  0.9572
2     ce     0.9590  0.9590  0.9590
3     kl     0.9585  0.9585  0.9585
4  dlite     0.9557  0.9557  0.9557
macro
    loss  precision  recall      f1
0     l1     0.7105  0.6362  0.6261
1     l2     0.8340  0.8156  0.8230
2     ce     0.8603  0.8086  0.8312
3     kl     0.8505  0.8166  0.8308
4  dlite     0.8376  0.7789  0.8024
weighted
    loss  precision  recall      f1
0     l1     0.9365  0.9377  0.9317
1     l2     0.9570  0.9572  0.9567
2     ce     0.9579  0.9590  0.9579
3     kl     0.9575  0.9585  0.9575
4  dlite     0.9546  0.9557  0.9548


#### Per-label Results

In [None]:

# Print, for every loss function, the per-label classification report of the
# last recorded epoch on the test split.
print("test dataset")
separator = "-" * 50
for loss_name in loss_list:
    final_report = result[loss_name]['test_each_label_p_r_f1_list'][-1]
    print(separator, loss_name, final_report, sep="\n")


test dataset
--------------------------------------------------
l1
              precision    recall  f1-score   support

       B-geo       0.77      0.92      0.84      7493
       B-gpe       0.86      0.89      0.88      3224
       B-org       0.79      0.30      0.43      3968
       B-per       0.73      0.82      0.77      3433
       B-tim       0.75      0.76      0.76      4121
       I-geo       0.69      0.82      0.75      5542
       I-gpe       0.00      0.00      0.00       320
       I-org       0.58      0.51      0.55      6599
       I-per       0.79      0.96      0.87      8347
       I-tim       0.85      0.03      0.06      1720
           O       0.98      0.99      0.98    189671

    accuracy                           0.94    234438
   macro avg       0.71      0.64      0.63    234438
weighted avg       0.94      0.94      0.93    234438

--------------------------------------------------
l2
              precision    recall  f1-score   support

       B-ge