In [14]:
import os
from nltk.tokenize import sent_tokenize
import pickle

import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

from seqeval.metrics import f1_score
import json

In [15]:
# Load the pickled corpus.  Element 0 is used as the sentences and element 1
# as the matching label sequences.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open('biogen_xstopwordsx_latest.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
sentences, labels = corpus[0], corpus[1]

In [16]:
# Turn every sentence into one space-joined string with its final character
# removed.
# The list is rebuilt from `corpus[0]` instead of overwriting the entries of
# `sentences` in place: that keeps the cell idempotent (re-running it no longer
# re-joins already-joined strings or strips one more character) and leaves the
# loaded corpus untouched.
# NOTE(review): presumably the dropped last character is trailing punctuation
# or whitespace — confirm against the corpus.
sentences = [' '.join(tokens)[:-1] for tokens in corpus[0]]

## Model

In [17]:
# MAX_LEN: length every token/tag sequence is padded or truncated to.
# bs: batch size handed to the DataLoaders.
MAX_LEN, bs = 75, 32

In [18]:
# Tag vocabulary.  The position of a tag in `tags_vals` is its integer id;
# `tag2idx` is the reverse lookup derived from that list so the two stay in sync.
tags_vals = ['B', 'I', 'O']
tag2idx = {tag: idx for idx, tag in enumerate(tags_vals)}

In [19]:
# Number of visible CUDA devices, and the device used for tensors:
# CUDA when it is available, otherwise the CPU.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [20]:
# Tokenizer for the 'bert-base-cased' checkpoint; lower-casing is switched off
# so the input casing is preserved.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)

In [21]:
# Run the BERT tokenizer over every sentence string.
tokenized_texts = list(map(tokenizer.tokenize, sentences))

In [22]:
# Vocabulary ids for each tokenized sentence, padded/truncated at the end so
# every row has exactly MAX_LEN entries.
input_ids = pad_sequences(
    list(map(tokenizer.convert_tokens_to_ids, tokenized_texts)),
    maxlen=MAX_LEN, dtype="long", truncating="post", padding="post",
)
# Integer tag ids for each sentence, padded at the end with the id of "O".
# NOTE(review): `labels` presumably holds one tag per original word, while the
# tokenizer may split a word into several pieces — in that case tags and
# tokens drift out of alignment.  Confirm and, if so, expand the labels per
# sub-token before padding.
# NOTE(review): `tag2idx.get` yields None for a label outside B/I/O — verify
# the corpus contains no other labels.
tags = pad_sequences(
    [[tag2idx.get(tag) for tag in sentence_tags] for sentence_tags in labels],
    maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
    dtype="long", truncating="post",
)

In [23]:
# Attention mask as nested lists of floats: 1.0 where the token id is positive,
# 0.0 where it is not (the padded positions).
attention_masks = (np.asarray(input_ids) > 0).astype(float).tolist()

In [24]:
# 90/10 train/validation split.  Ids, tags and masks go through a single call
# so that all three are partitioned with the same shuffled indices.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.1)

In [25]:
# Convert every split (ids, tags, masks; train and validation) to a torch tensor.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = map(
    torch.tensor,
    (tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks),
)

In [26]:
# Each dataset item is an (input ids, attention mask, tags) triple.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)

# Training batches are drawn in random order, validation batches in dataset order.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)

train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=bs,
)
valid_dataloader = DataLoader(
    valid_data,
    sampler=valid_sampler,
    batch_size=bs,
)

In [27]:
# Pretrained 'bert-base-cased' with a token-classification head that has one
# output label per entry of `tag2idx`.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
)

In [28]:
# Move the model to the device selected earlier.  Using `device` (instead of an
# unconditional `.cuda()`) keeps the model on the same device the batches are
# sent to and lets the notebook run on CPU-only machines.
model = model.to(device)

In [29]:
# Build the optimizer.
# FULL_FINETUNING=True  -> update every model parameter; weight decay is applied
#                          to all parameters except those whose name matches
#                          an entry of `no_decay`.
# FULL_FINETUNING=False -> update only the parameters of `model.classifier`.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Name fragments of parameters that must not be decayed.  'gamma'/'beta'
    # and 'LayerNorm.weight' are both listed so layer-norm scale parameters are
    # excluded regardless of which of the two naming schemes the installed
    # library version uses for them.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # torch.optim.Adam reads the per-group option `weight_decay`; the
        # BertAdam-style key `weight_decay_rate` is not an Adam option and was
        # silently ignored, leaving every group at the default decay of 0.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [30]:
def flat_accuracy(preds, labels):
    """Return the fraction of positions whose highest-scoring class equals the label.

    preds:  array of class scores; the classes are indexed along axis 2.
    labels: array of integer class ids with one entry per scored position.

    Every position is counted, so any padded positions present in the inputs
    contribute to the result as well.
    """
    predicted = np.argmax(preds, axis=2).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [31]:
# Fine-tuning loop.  For every epoch: train on `train_dataloader`, evaluate on
# `valid_dataloader`, print loss / accuracy / F1, then write the model and its
# validation metrics to files named after the epoch number.
epochs = 6
max_grad_norm = 1.0  # ceiling for the gradient norm used by clipping

count = 0  # 1-based epoch number used in the output file names
for _ in trange(epochs, desc="Epoch"):
    count += 1

    # ---- TRAIN loop ----
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for batch in train_dataloader:
        # move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass; called with labels, the model output is used as the loss
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters, then clear the gradients for the next step
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---- VALIDATION on validation set ----
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # The model is called twice: with labels its output is used as the
            # loss, without labels its output is used as the logits.
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    # Average the per-batch sums exactly once; the same values are printed and
    # written to disk (previously the saved loss was divided by the number of
    # batches a second time).
    eval_loss = eval_loss/nb_eval_steps
    avg_eval_accuracy = eval_accuracy/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(avg_eval_accuracy))

    # Map integer ids back to tag strings, one list per sentence.
    # NOTE(review): padded positions are included in both lists (and in the
    # accuracy above) — consider masking them with the attention mask.
    pred_tags = [[tags_vals[p_i] for p_i in pred_set] for pred_set in predictions]
    valid_tags = [[tags_vals[v_i] for v_i in sent]
                  for page in true_labels for sent in page]

    # seqeval expects (y_true, y_pred); computed once and reused below.
    epoch_f1 = f1_score(valid_tags, pred_tags)
    print("F1-Score: {}".format(epoch_f1))

    torch.save(model, 'epoch_{}_model.pt'.format(count))

    temp_eval_score = [
        "Validation loss: {}".format(eval_loss),
        "Validation Accuracy: {}".format(avg_eval_accuracy),
        "Validation F1-Score: {}".format(epoch_f1)
    ]
    with open('epoch_{}_model_eval_metrics.txt'.format(count), 'w') as filehandle:
        json.dump(temp_eval_score, filehandle)

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.80 GiB total capacity; 926.99 MiB already allocated; 28.31 MiB free; 1.02 GiB reserved in total by PyTorch)

In [None]:
# Evaluate the current `model` on the validation set.  Leaves the summed loss
# and accuracy (`eval_loss`, `eval_accuracy`), the batch count
# (`nb_eval_steps`) and the tag sequences (`pred_tags`, `valid_tags`) defined
# for the cells that follow.
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        # With labels the model output is used as the loss, without labels as
        # the logits, hence the two calls.
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)

    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])

    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

# Build the tag sequences once, after all batches have been collected.  This
# used to sit inside the batch loop, which rebuilt both lists for every batch
# and left them undefined when the loader yielded nothing.
# NOTE(review): padded positions are included in both lists — consider masking.
pred_tags = [[tags_vals[p_i] for p_i in pred_set] for pred_set in predictions]
valid_tags = [[tags_vals[v_i] for v_i in sent]
              for page in true_labels for sent in page]

print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
# seqeval expects (y_true, y_pred)
print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))

In [None]:
# Serialize the whole model object to 'best_model.pt'.
# NOTE(review): this stores whatever `model` currently holds (the state after
# the last executed training step); nothing here selects the best epoch.
torch.save(obj=model, f='best_model.pt')

In [None]:
# Write the validation metrics of the preceding evaluation cell to disk as a
# JSON list of formatted strings.  `eval_loss` and `eval_accuracy` are sums
# over batches at this point, so both are divided by the number of batches.
eval_scores = [
    "Validation loss: {}".format(eval_loss/nb_eval_steps),
    "Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps),
    # seqeval expects (y_true, y_pred); the arguments used to be swapped.
    "Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags))
    ]
with open('best_model_eval_metrics.txt', 'w') as filehandle:
    json.dump(eval_scores, filehandle)

### Get keywords from sentence

In [None]:
# def keywordextract(sentence):
#     text = sentence
#     tkns = tokenizer.tokenize(text)
#     indexed_tokens = tokenizer.convert_tokens_to_ids(tkns)
#     segments_ids = [0] * len(tkns)
#     tokens_tensor = torch.tensor([indexed_tokens]).to(device)
#     segments_tensors = torch.tensor([segments_ids]).to(device)
#     model.eval()
#     prediction = []
#     logit = model(tokens_tensor, token_type_ids=None,
#                                   attention_mask=segments_tensors)
#     logit = logit.detach().cpu().numpy()
#     prediction.extend([list(p) for p in np.argmax(logit, axis=2)])
#     for k, j in enumerate(prediction[0]):
#         if j==1 or j==0:
#             print(tokenizer.convert_ids_to_tokens(tokens_tensor[0].to('cpu').numpy())[k], j)

In [None]:
# text = "The solution is based upon an abstract representation of the mobile object system."

In [None]:
# keywordextract(text)