## Following the guide for BERT as practice
https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
suc3 = pd.read_pickle("../data/sucFrame")
suc3.head(10)

Unnamed: 0,text_id,text_lix,text_nk,text_ovix,text_swefn,text_index,sentence__geocontext,sentence_id,sentence_index,word_blingbring,...,word_swefn,word_ex,word_name,word_subtype,word_type,word_index,word_tag,word_sentiment,word_sentimentclass,word__overlap
0,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,,...,,,I,,,1,w,,,
1,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,,...,,,sin,,,2,w,,,
2,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,,...,,,första,,,3,w,,,
3,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,|gengäld|gensträvighet|hinder|motstånd|motverk...,...,,,reaktion,,,4,w,-0.247931,neutral,
4,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,,...,,,på,,,5,w,,,
5,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,,...,,,Sovjetledarens,,,6,w,,,
6,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,|anvisning|befallning|bestraffning|betänklighe...,...,|Warning|,,varningar,,,7,w,-0.5145,negative,
7,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,|bekräftelse|svar|,...,|Statement|,,deklarerade,,,8,w,,,
8,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,,...,,ENAMEX,Litauens,PPL,LOC,9,ne,,,
9,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,|ledare|myndighet|överordnad|,...,|Leadership|,,president,,,10,w,-0.0656,neutral,


In [3]:
#Grab the categories the example use
data = suc3[['sentence_index', 'word_name', 'word_pos', 'word_type']]

# The example uses O instaed of Nan, so we follow them
data[['word_type']] = data[['word_type']].replace(np.nan,'O')
data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,sentence_index,word_name,word_pos,word_type
0,1,I,PP,O
1,1,sin,PS,O
2,1,första,RO,O
3,1,reaktion,NN,O
4,1,på,PP,O
5,1,Sovjetledarens,NN,O
6,1,varningar,NN,O
7,1,deklarerade,VB,O
8,1,Litauens,,LOC
9,1,president,NN,O


In [4]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["word_name"].values.tolist(),
                                                           s["word_pos"].values.tolist(),
                                                           s["word_type"].values.tolist())]
        self.grouped = self.data.groupby("sentence_index").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
        
getter = SentenceGetter(data)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
#sentences[0]

In [5]:
sentences[0]

'I sin första reaktion på Sovjetledarens varningar deklarerade Litauens president Vytautas Landsbergis att " nu avvisar Gorbatjov vår utsträckta hand med extremt skarpa och hämndlystna ord " .'

In [6]:
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'PRS', 'O', 'O', 'O', 'O', 'PRS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [7]:
tags_vals = list(set(data["word_type"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [8]:
tag2idx

{'inst': 0,
 'animal': 1,
 'PRS/WRK': 2,
 'place': 3,
 'other': 4,
 'LOC/PRS': 5,
 'ORG/PRS': 6,
 'EVN': 7,
 'LOC': 8,
 'person': 9,
 'TME': 10,
 'LOC/LOC': 11,
 'myth': 12,
 'product': 13,
 'PRS': 14,
 'ORG': 15,
 'WRK': 16,
 'O': 17,
 'MSR': 18,
 'OBJ/ORG': 19,
 'work': 20,
 'OBJ': 21,
 'LOC/ORG': 22,
 'event': 23}

# Prepare Sentence and Labels

In [9]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
#from pytorch_pretrained_bert import BertTokenizer, BertConfig
#from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


In [10]:
from transformers import BertTokenizer, BertConfig
#from transformers import BertForTokenClassification, BertAdam
from transformers import BertForTokenClassification

In [11]:
#Control sequence length
MAX_LEN = 16

#batch size
bs = 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [12]:
torch.cuda.get_device_name(0) 
from ipywidgets import IntProgress


In [13]:
#The Bert implementation comes with a pretrained tokenizer. This leverages general language understanding.
#And is better than rule-based approaches (add refs)
#Select the one most suited for your use case. Probably case-based swedish.

#TODO how is the pretraining carried out?
#TODO exists for swedish?

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [14]:
#Python list comprehension. Just tokenize each sentence and put the tokenized sentence in a lsit
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

['i', 'sin', 'for', '##sta', 're', '##ak', '##tion', 'pa', 'so', '##v', '##jet', '##led', '##are', '##ns', 'var', '##ning', '##ar', 'de', '##kla', '##rera', '##de', 'lit', '##au', '##ens', 'president', 'v', '##yt', '##au', '##tas', 'lands', '##berg', '##is', 'at', '##t', '"', 'nu', 'av', '##vis', '##ar', 'go', '##rba', '##t', '##jo', '##v', 'var', 'ut', '##stra', '##ck', '##ta', 'hand', 'med', 'ex', '##tre', '##mt', 'ska', '##rp', '##a', 'och', 'ham', '##nd', '##ly', '##st', '##na', 'or', '##d', '"', '.']


In [15]:
#Each sentence is a list of words/tokens
#Replace each word/token by an id

tokens_to_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]

#If the number of tokens < maxlen, pad with 0
#If the number of tokens > maxlen, cut
#This ensures that all vectors have the same length
input_ids = pad_sequences(tokens_to_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [16]:
print(tokenized_texts[0])
print(input_ids[0])

['i', 'sin', 'for', '##sta', 're', '##ak', '##tion', 'pa', 'so', '##v', '##jet', '##led', '##are', '##ns', 'var', '##ning', '##ar', 'de', '##kla', '##rera', '##de', 'lit', '##au', '##ens', 'president', 'v', '##yt', '##au', '##tas', 'lands', '##berg', '##is', 'at', '##t', '"', 'nu', 'av', '##vis', '##ar', 'go', '##rba', '##t', '##jo', '##v', 'var', 'ut', '##stra', '##ck', '##ta', 'hand', 'med', 'ex', '##tre', '##mt', 'ska', '##rp', '##a', 'och', 'ham', '##nd', '##ly', '##st', '##na', 'or', '##d', '"', '.']
[ 1045  8254  2005  9153  2128  4817  3508  6643  2061  2615 15759  3709
 12069  3619 13075  5582]


In [17]:
#We have a corresponding list of tags, and do the same thing, converting names to numbers
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")


In [18]:
print(labels[0])
print(tags[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'PRS', 'O', 'O', 'O', 'O', 'PRS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[17 17 17 17 17 17 17 17  8 17 14 17 17 17 17 14]


In [19]:
#Attention masks ensures that padded elements are ignored in the sequences
#TODO how. why?
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [20]:
#Split to only use 10% of data
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [21]:
print(tr_inputs)
#Convert everythoing to torch tensors
#How does this affect things? TODO the matrices look the same

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
print(tr_inputs)

[[ 4479  1011  6925 ... 14383 26455  4215]
 [ 2104  9807  1011 ...  2389 19330  2912]
 [28166 20014  2063 ...  7412  2063  6655]
 ...
 [ 6229  8945  7520 ...  6392  8585 14482]
 [ 6229  5199  5017 ...  5558  8024 28166]
 [ 2061  2213  4372 ...  2099  1012     0]]
tensor([[ 4479,  1011,  6925,  ..., 14383, 26455,  4215],
        [ 2104,  9807,  1011,  ...,  2389, 19330,  2912],
        [28166, 20014,  2063,  ...,  7412,  2063,  6655],
        ...,
        [ 6229,  8945,  7520,  ...,  6392,  8585, 14482],
        [ 6229,  5199,  5017,  ...,  5558,  8024, 28166],
        [ 2061,  2213,  4372,  ...,  2099,  1012,     0]])


In [22]:
#Convert it all into a tensordataset 

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [23]:
# Wrap the previous model with a token-level classifier. It's an additional linear layer that takes as input the last hidden state of the sequence

model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

In [24]:
from transformers import BertModel, BertConfig

# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()

In [25]:
# Initializing a model from the bert-base-uncased style configuration
#model = BertModel(configuration)

# Accessing the model configuration
#configuration = model.config
#configuration

In [26]:
model.cuda();

In [27]:
#This is just for deciding hyperparameters and such

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)


In [28]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [29]:

epochs = 5
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss, WHAT = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
del.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

# Bert was too big, let's try another example with distilBERT

In [None]:
#https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb

import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [None]:
df.head()
batch_1 = df[:2000]

In [None]:
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [None]:
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Tokenize and process all sentences together as a batch
# That is, replace each unique word with a corresponding id to get a list of lists
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
#Pad to ensure the same length

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

np.array(padded).shape

In [None]:
#Now that input is all the same, tell the model to ignore the padding when it sees it.
#This is attention
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

In [None]:
#Convert to tensors
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

In [None]:
#The results of the processing ends up in last_hidden_states.
#TODO what does torch.no_grad() do?
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
#Let's have a look at the final output

"""
We get a 2000 x 59 x 768 tensor

The first dimension is the sentence
The second dimension is the word
The third dimension is the hidden state

last_hidden_states[0][:,0,:]
All rows (sentences), The first word(the prepended CLS token), all hidden states)
"""


last_hidden_states[0].shape

In [None]:
# Bert classifies sentences by generating a [CLS] (classification) token and prepending it to the output sentences.
# This token is an embedding for the entire sentence
features = last_hidden_states[0][:,0,:].numpy()
features.shape

In [None]:
#So we have an embedding for a sentence.
#And we have a classification for each sentence.
#Then we're back to familiar territory
labels = batch_1[1]
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [None]:
#Let's just use basic logistic regression
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
lr_clf.score(test_features, test_labels)
