In [115]:
import torch
import numpy as np
import pandas as pd
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel


In [116]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [117]:
df = pd.read_csv("../input/entity-annotated-corpus/ner.csv", encoding = "ISO-8859-1", error_bad_lines=False)
dataset = df[['pos','sentence_idx','word','tag']]
dataset.head(50)
dataset = dataset.drop_duplicates()


b'Skipping line 281837: expected 25 fields, saw 34\n'


In [118]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w,p, t) for w,p, t in zip(s["word"].values.tolist(),
                                                       s['pos'].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

getter = SentenceGetter(dataset)

In [119]:
print(len(getter.sentences[:1000]))
considered_len = 1000

1000


In [120]:

tags_vals = list(set(dataset["tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
sentences = [' '.join([s[0] for s in sent]) for sent in getter.sentences]
labels = [[s[2] for s in sent] for sent in getter.sentences]
labels = [[tag2idx.get(l) for l in lab] for lab in labels]

In [121]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 2e-05
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')

In [122]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, sentences, labels, max_len):
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        sentence = str(self.sentences[index])
        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        label = self.labels[index]
        label.extend([4]*200)
        label=label[:200]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'tags': torch.tensor(label, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [123]:
# Creating the dataset and dataloader for the neural network

train_percent = 0.8
train_size = int(train_percent*len(sentences))
# train_dataset=df.sample(frac=train_size,random_state=200).reset_index(drop=True)
# test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_sentences = sentences[0:train_size]
train_labels = labels[0:train_size]

test_sentences = sentences[train_size:]
test_labels = labels[train_size:]

print("FULL Dataset: {}".format(len(sentences)))
print("TRAIN Dataset: {}".format(len(train_sentences)))
print("TEST Dataset: {}".format(len(test_sentences)))

training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)
testing_set = CustomDataset(tokenizer, test_sentences, test_labels, MAX_LEN)

FULL Dataset: 35177
TRAIN Dataset: 28141
TEST Dataset: 7036


In [124]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [125]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=18)
        # self.l2 = torch.nn.Dropout(0.3)
        # self.l3 = torch.nn.Linear(768, 200)
    
    def forward(self, ids, mask, labels):
        output_1= self.l1(ids, mask, labels = labels)
        # output_2 = self.l2(output_1[0])
        # output = self.l3(output_2)
        return output_1

In [126]:
model = BERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForTokenClassification: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn

BERTClass(
  (l1): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
       

In [127]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [128]:
#from seqeval.metrics import f1_score
from sklearn.metrics import f1_score
def flat_accuracy(preds, labels):
    flat_preds = np.argmax(preds, axis=2).flatten()
    flat_labels = labels.flatten()
    return np.sum(flat_preds == flat_labels)/len(flat_labels)

In [129]:
def train_eval(epoch):
    model.train()
    train_loss = 0
    total_loss = 0
    train_accuracy = 0
    nb_train_steps, nb_train_examples = 0, 0
    predictions , true_labels = [], []
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['tags'].to(device, dtype = torch.long)

        #loss = model(ids, mask, labels = targets)[0]
        #total_loss +=loss.item()
        # Accuracy
        output = model(ids, mask, labels=targets)
        #print(output)
        loss = output[0]
        logits = output[1]
        logits = logits.detach().cpu().numpy()
        label_ids = targets.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        accuracy = flat_accuracy(logits, label_ids)
        train_loss += loss.mean().item()
        train_accuracy += accuracy
        nb_train_examples += ids.size(0)
        nb_train_steps += 1
        train_loss = train_loss/nb_train_steps
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"****** Epoch {epoch+1} ******")
    print(f"\nTraining loss: {train_loss}")
    print(f"Training Accuracy: {train_accuracy/nb_train_steps}")
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    actual_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Training F1-Score: {}".format(f1_score(actual_tags, pred_tags, average='weighted')))
    
    model.eval()
    eval_loss = 0; eval_accuracy = 0
    n_correct = 0; n_wrong = 0; total = 0
    predictions , true_labels = [], []
    nb_eval_steps, nb_eval_examples = 0, 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['tags'].to(device, dtype = torch.long)

            output = model(ids, mask, labels=targets)
            #print(output)
            #loss, logits = output[:2]
            loss = output[0]
            logits = output[1]
            logits = logits.detach().cpu().numpy()
            label_ids = targets.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)
            accuracy = flat_accuracy(logits, label_ids)
            eval_loss += loss.mean().item()
            eval_accuracy += accuracy
            nb_eval_examples += ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss/nb_eval_steps
        print("\nValidation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
        pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
        print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags,average='weighted')))

In [130]:
for epoch in range(2):
    train_eval(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


****** Epoch 1 ******

Training loss: 0.0007159462428962515
Training Accuracy: 0.6656405293924825
Training F1-Score: 0.7345428306115956





Validation loss: 0.567285154895349
Validation Accuracy: 0.8196647727272726
Validation F1-Score: 0.851524375992452




****** Epoch 2 ******

Training loss: 0.0006558471907217491
Training Accuracy: 0.7902754179414334
Training F1-Score: 0.8312478430310759





Validation loss: 0.37828629247166895
Validation Accuracy: 0.8554299242424243
Validation F1-Score: 0.8798236313123742


In [131]:
def valid(model, testing_loader):
    model.eval()
    eval_loss = 0; eval_accuracy = 0
    n_correct = 0; n_wrong = 0; total = 0
    predictions , true_labels = [], []
    nb_eval_steps, nb_eval_examples = 0, 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['tags'].to(device, dtype = torch.long)

            output = model(ids, mask, labels=targets)
            #print(output)
            #loss, logits = output[:2]
            loss = output[0]
            logits = output[1]
            logits = logits.detach().cpu().numpy()
            label_ids = targets.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)
            accuracy = flat_accuracy(logits, label_ids)
            eval_loss += loss.mean().item()
            eval_accuracy += accuracy
            nb_eval_examples += ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss/nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
        pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
        print("F1-Score: {}".format(f1_score(pred_tags, valid_tags,average='weighted')))

In [132]:
# valid(model, testing_loader)

In [133]:
torch.save(model,"ner_bert.pt")

In [134]:
class Prediction():
    
    def __init__(self,tokenizer,model):
        self.device = "cpu"
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        
    def predict_(self,sentences):
#         tokenized_sentence = tokenizer.encode(sentences)
#         inputs = torch.tensor(tokenized_sentence).cuda()
        
        ''''''
        inputs = self.tokenizer.encode_plus(
            sentences,
            None,
            add_special_tokens=True,
            max_length=200,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        label = []
        label.extend([4]*200)
        label=label[:200]

        ids = torch.tensor(ids, dtype=torch.long)
        ids = torch.reshape(ids,(1,ids.shape[0]))
        print(ids.shape)
        mask = torch.tensor(mask, dtype=torch.long)
        mask = torch.reshape(mask,(1,mask.shape[0]))
        labels = torch.tensor(label, dtype=torch.long)
        labels = torch.reshape(labels,(1,labels.shape[0]))
        with torch.no_grad():
            output = self.model(ids,mask,labels)
        logits = output[1]
        logits = logits.detach().cpu().numpy()
        preds = list(p for p in np.argmax(logits, axis=2))
        labels = []
        for item in preds[0]:
            labels.append(tags_vals[item])
        
        
        tokens = self.tokenizer.convert_ids_to_tokens(ids.to('cpu').numpy()[0])
        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens[1:], preds[0]):
            if token == "[SEP]":
                break
            if token.startswith("##"):
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(tags_vals[label_idx])
                new_tokens.append(token)
        for token, label in zip(new_tokens, new_labels):
            print("{}\t{}".format(label, token))
        return labels

In [135]:
pred = Prediction(tokenizer,model)

In [136]:
test_sentences[0]

'President Yushchenko and Tymoshenko were once allies , but their relationship deteriorated because of bitter political infighting a year after the 2004 Orange Revolution that swept Mr. Yushchenko to power .'

In [137]:
res = pred.predict_(test_sentences[0])



torch.Size([1, 200])
B-per	president
I-per	yushchenko
O	and
O	tymoshenko
O	were
O	once
O	allies
O	,
O	but
O	their
O	relationship
O	deteriorated
O	because
O	of
O	bitter
O	political
O	infighting
O	a
O	year
O	after
O	the
O	2004
O	orange
O	revolution
O	that
O	swept
B-art	mr
O	.
B-art	yushchenko
B-art	to
B-art	power
B-art	.
