# Imports

In [1]:
import sys
sys.path.append('../')

In [2]:
import os

In [3]:
from tqdm import tqdm_notebook as tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter

from transformers.tokenization_bert import BertTokenizer
from transformers.modeling_bert import BertForTokenClassification, BertConfig, BertModel

In [24]:
from mlpack.datasets.conll2003 import get_conll2003, get_conll2003_features, convert_examples_to_features_masked
from mlpack.datasets.conll2003 import CoNLL2003Dataset
from mlpack.bert.ner.model import BertForMaskedNERClassification
from mlpack.bert.ner.train import train
from mlpack.bert.ner.utils import to_fp16

# Tokenizer

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Data

In [6]:
examples, labels = get_conll2003('../datasets/CoNLL2003/')

In [7]:
label_map = {i:l for i, l in enumerate(labels, 0)}
label_map

{0: '[PAD]',
 1: 'O',
 2: 'B-MISC',
 3: 'I-MISC',
 4: 'B-PER',
 5: 'I-PER',
 6: 'B-ORG',
 7: 'I-ORG',
 8: 'B-LOC',
 9: 'I-LOC',
 10: '[CLS]',
 11: '[SEP]',
 12: 'X'}

In [8]:
# features_train = convert_examples_to_features(examples['train'], labels, 128, tokenizer, sep_tag='X')

In [9]:
features_valid = convert_examples_to_features_masked(examples['valid'][:5], labels, 128, tokenizer)

In [10]:
# features = get_conll2003_features(examples, labels, 128, tokenizer, sep_tag='same')

In [11]:
# features.keys()

# Checking

In [10]:
idx = 0
ex, feat = examples['valid'][idx], features_valid[idx]

In [11]:
zipped = zip(tokenizer.convert_ids_to_tokens(feat.input_ids), feat.label_mask,
            feat.input_mask)
for tok, lm, im in zipped:
    print(f'{tok:10} {lm} {im}')

[CLS]      0 1
[MASK]     1 1
-          0 1
L          0 1
##EI       0 1
##CE       0 1
##ST       0 1
##ER       0 1
##S        0 1
##H        0 1
##IR       0 1
##E        0 1
T          0 1
##A        0 1
##KE       0 1
O          0 1
##VE       0 1
##R        0 1
AT         0 1
TO         0 1
##P        0 1
A          0 1
##FT       0 1
##ER       0 1
IN         0 1
##NI       0 1
##NG       0 1
##S        0 1
VI         0 1
##CT       0 1
##OR       0 1
##Y        0 1
.          0 1
[SEP]      0 1
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]      0 0
[PAD]     

# Dataset

In [12]:
# ds_train = NERDataset(features_train)
ds_valid = CoNLL2003Dataset(features_valid)

In [13]:
# dl_train = DataLoader(ds_train, batch_size=1, pin_memory=True, shuffle=True,  num_workers=0)
dl_valid = DataLoader(ds_valid, batch_size=1, pin_memory=True, shuffle=False, num_workers=0)

In [14]:
input_ids, input_mask, label_ids, label_mask = next(iter(dl_valid))

# Evaluating

In [15]:
def evaluate_fn(model, dataloader):
    model.eval()
    losses, accs = [], []
    y_trues, y_preds = [], []
    for input_ids, input_mask, label_ids, label_mask in tqdm(dataloader, desc='Evaluating', leave=False):
        input_ids, input_mask, label_ids, label_mask = to_device(input_ids, input_mask, label_ids,
                                                                 label_mask, device=device)
        with torch.no_grad():
            loss, active_logits, active_labels = model(
                input_ids, input_mask, label_ids, label_mask)
            
        losses.append(loss.item())
        
        active_logits = active_logits.argmax(dim=1).cpu().numpy()
        active_labels = active_labels.cpu().numpy()
        accs = (1 * (active_logits == active_labels)).tolist()
        
        # transforming
#         ts, ps = remap(input_ids, input_mask, label_ids, label_mask, active_logits, active_labels)
#         y_preds += ps
#         y_trues += ts
        
#     print(y_preds, y_trues)
#     print(classification_report(y_trues, y_preds))
            
    return np.array(losses).mean(), np.array(accs).mean()

# Bert Model

Lets do this with the X tag for training and evaluation

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

In [17]:
LABELS = [
    l for l in labels if l not in ['[PAD]', '[CLS]', '[SEP]', 'X']
]
LABELS

['O', 'B-MISC', 'I-MISC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [18]:
config = BertConfig.from_pretrained('bert-base-cased', num_labels=len(LABELS), output_hidden_states=True)

In [19]:
model = BertForMaskedNERClassification(config)

In [20]:
model(input_ids, input_mask, label_ids, label_mask)

(tensor(2.4884, grad_fn=<NllLossBackward>),
 tensor([[-0.0427,  0.0519,  0.5949,  0.7669,  0.5186, -0.0769, -0.4152,  0.0637,
           0.8644]], grad_fn=<IndexBackward>),
 tensor([1]))

In [27]:
model.to(device)

BertForNERClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [21]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5, weight_decay=0)

# Training

In [22]:
scheduler = None

In [23]:
class Args:
    device = device
    fp16 = True
    num_epochs = 10
    ckp_path = 'bertner.ckp'
    grad_steps = 1
    max_grad_norm = 1.
    load_state_dict = False
    n_iter = 0
    best_acc = None
    writer = SummaryWriter('nerbert')
args = Args()

In [26]:
if args.fp16 and args.n_iter == 0:
    model, optimizer = to_fp16(model, optimizer)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


RuntimeError: Found param bert.embeddings.word_embeddings.weight with type torch.FloatTensor, expected torch.cuda.FloatTensor.
When using amp.initialize, you need to provide a model with parameters
located on a CUDA device before passing it no matter what optimization level
you chose. Use model.to('cuda') to use the default device.

In [27]:
if args.load_state_dict:
    if os.path.exists(args.ckp_path):
        print(model.load_state_dict(torch.load(args.ckp_path)))
#     if os.path.exists(args.ckp_path.replace('.ckp', '_optimizer.ckp')):
#         optimizer.load_state_dict(torch.load(args.ckp_path.replace('.ckp', '_optimizer.ckp'), map_location='cpu'))

In [34]:
train(ars, models, dl_train, dl_valid, optimizer, evaluate_fn=evaluate_fn)