## Following the guide for BERT as practice
https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [None]:
suc3 = pd.read_pickle("../data/sucFrame")
suc3.head(10)

In [None]:
#Grab the categories the example use
data = suc3[['sentence_index', 'word_name', 'word_pos', 'word_type']]

# The example uses O instaed of Nan, so we follow them
data[['word_type']] = data[['word_type']].replace(np.nan,'O')
data.head(10)

In [None]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["word_name"].values.tolist(),
                                                           s["word_pos"].values.tolist(),
                                                           s["word_type"].values.tolist())]
        self.grouped = self.data.groupby("sentence_index").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
        
getter = SentenceGetter(data)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
#sentences[0]

In [None]:
sentences[0]

In [None]:
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

In [None]:
tags_vals = list(set(data["word_type"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [None]:
tag2idx

# Prepare Sentence and Labels

In [None]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
#from pytorch_pretrained_bert import BertTokenizer, BertConfig
#from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

In [None]:
from transformers import BertTokenizer, BertConfig(hidden_size=768/2, num_hidden_layers=6)
#from transformers import BertForTokenClassification, BertAdam
from transformers import BertForTokenClassification

In [11]:
#Control sequence length
MAX_LEN = 75

#batch size
bs = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [12]:
torch.cuda.get_device_name(0) 
from ipywidgets import IntProgress


In [13]:
#The Bert implementation comes with a pretrained tokenizer. This leverages general language understanding.
#And is better than rule-based approaches (add refs)
#Select the one most suited for your use case. Probably case-based swedish.

#TODO how is the pretraining carried out?
#TODO exists for swedish?

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [14]:
#Python list comprehension. Just tokenize each sentence and put the tokenized sentence in a lsit
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

['i', 'sin', 'for', '##sta', 're', '##ak', '##tion', 'pa', 'so', '##v', '##jet', '##led', '##are', '##ns', 'var', '##ning', '##ar', 'de', '##kla', '##rera', '##de', 'lit', '##au', '##ens', 'president', 'v', '##yt', '##au', '##tas', 'lands', '##berg', '##is', 'at', '##t', '"', 'nu', 'av', '##vis', '##ar', 'go', '##rba', '##t', '##jo', '##v', 'var', 'ut', '##stra', '##ck', '##ta', 'hand', 'med', 'ex', '##tre', '##mt', 'ska', '##rp', '##a', 'och', 'ham', '##nd', '##ly', '##st', '##na', 'or', '##d', '"', '.']


In [15]:
#Each sentence is a list of words/tokens
#Replace each word/token by an id

tokens_to_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]

#If the number of tokens < maxlen, pad with 0
#If the number of tokens > maxlen, cut
#This ensures that all vectors have the same length
input_ids = pad_sequences(tokens_to_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [16]:
print(tokenized_texts[0])
print(input_ids[0])

['i', 'sin', 'for', '##sta', 're', '##ak', '##tion', 'pa', 'so', '##v', '##jet', '##led', '##are', '##ns', 'var', '##ning', '##ar', 'de', '##kla', '##rera', '##de', 'lit', '##au', '##ens', 'president', 'v', '##yt', '##au', '##tas', 'lands', '##berg', '##is', 'at', '##t', '"', 'nu', 'av', '##vis', '##ar', 'go', '##rba', '##t', '##jo', '##v', 'var', 'ut', '##stra', '##ck', '##ta', 'hand', 'med', 'ex', '##tre', '##mt', 'ska', '##rp', '##a', 'och', 'ham', '##nd', '##ly', '##st', '##na', 'or', '##d', '"', '.']
[ 1045  8254  2005  9153  2128  4817  3508  6643  2061  2615 15759  3709
 12069  3619 13075  5582  2906  2139 26086 24068  3207  5507  4887  6132
  2343  1058 22123  4887 10230  4915  4059  2483  2012  2102  1000 16371
 20704 11365  2906  2175 28483  2102  5558  2615 13075 21183 20528  3600
  2696  2192 19960  4654  7913 20492 24053 14536  2050 28166 10654  4859
  2135  3367  2532  2030  2094  1000  1012     0     0     0     0     0
     0     0     0]


In [17]:
#We have a corresponding list of tags, and do the same thing, converting names to numbers
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")


In [18]:
print(labels[0])
print(tags[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'PRS', 'O', 'O', 'O', 'O', 'PRS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[19 19 19 19 19 19 19 19  7 19 21 19 19 19 19 21 19 19 19 19 19 19 19 19
 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
 19 19 19]


In [19]:
#Attention masks ensures that padded elements are ignored in the sequences
#TODO how. why?
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [20]:
#Split to only use 10% of data
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [21]:
print(tr_inputs)
#Convert everythoing to torch tensors
#How does this affect things? TODO the matrices look the same

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
print(tr_inputs)

[[ 4479  1011  6925 ...     0     0     0]
 [ 2104  9807  1011 ... 24876  2063  4372]
 [28166 20014  2063 ...     0     0     0]
 ...
 [ 6229  8945  7520 ...     0     0     0]
 [ 6229  5199  5017 ...     0     0     0]
 [ 2061  2213  4372 ...     0     0     0]]
tensor([[ 4479,  1011,  6925,  ...,     0,     0,     0],
        [ 2104,  9807,  1011,  ..., 24876,  2063,  4372],
        [28166, 20014,  2063,  ...,     0,     0,     0],
        ...,
        [ 6229,  8945,  7520,  ...,     0,     0,     0],
        [ 6229,  5199,  5017,  ...,     0,     0,     0],
        [ 2061,  2213,  4372,  ...,     0,     0,     0]])


In [22]:
#Convert it all into a tensordataset 

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [23]:
# Wrap the brevious model with a token-level classifier. It's an additional linear layer that takes as input the last hidden state of the sequence

model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

In [24]:
model.cuda();

In [25]:
#This is just for deciding hyperparameters and such

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)


In [26]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [27]:

epochs = 5
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss, WHAT = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/5 [00:01<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.95 GiB total capacity; 2.94 GiB already allocated; 7.88 MiB free; 3.20 GiB reserved in total by PyTorch)

In [None]:
del.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))