In [None]:
!pip install fastNLP==1.0.1 -q
!pip install transformers -q
!pip install sparse==0.13.0 -q
!pip install torch==1.11.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 -q
!pip install llvmlite==0.38.1 --user -q
!pip install numba==0.55.2 --user -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cached-path 1.1.6 requires rich<13.0,>=12.1, but you have rich 11.2.0 which is incompatible.[0m[31m
[0m

In [None]:
import torch
import fastNLP
import transformers
import sparse
import numba
import llvmlite

# Print the installed version of every dependency, one per line, in the order
# fastNLP, transformers, sparse, torch, numba, llvmlite.
for _module in (fastNLP, transformers, sparse, torch, numba, llvmlite):
    print(_module.__version__)

  warn(f"Failed to load image Python extension: {e}")


1.0.1
4.20.1
0.13.0
1.11.0+cu113
0.55.2
0.38.1


In [None]:
# Print the version banner of the CUDA compiler (nvcc) found on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


In [None]:
!pip install torch-scatter --no-index -f https://data.pyg.org/whl/torch-{torch.__version__}.html 

Looking in links: https://data.pyg.org/whl/torch-1.11.0+cu113.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
[0m

In [None]:
import json
import os
import warnings
import argparse

# Optional GPU selection: when the environment variable `p` is set, its value is
# copied verbatim into CUDA_VISIBLE_DEVICES.
gpu_selection = os.environ.get('p')
if gpu_selection is not None:
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_selection

# Fixed process-wide settings for the tokenizers library and MKL.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    'MKL_THREADING_LAYER': 'GNU',
})

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

import numpy as np
import fastNLP
from fastNLP import cache_results, prepare_torch_dataloader
from fastNLP import print
from fastNLP import Trainer, Evaluator
from fastNLP import TorchGradClipCallback, MoreEvaluateCallback, LoadBestModelCallback
from fastNLP import FitlogCallback
from fastNLP import SortedSampler, BucketedBatchSampler
from fastNLP import TorchWarmupCallback


from model.model import CNNNer
from model.metrics import NERMetric
from preprocessing.ner_pipe import SpanNerPipe
from preprocessing.padder import Torch3DMatrixPadder

# One seed shared by every random number generator used in this notebook.
seed_value = 42

# Exported as a string through the environment for fastNLP.
os.environ['FASTNLP_GLOBAL_SEED'] = f'{seed_value}'

# NumPy global RNG, torch CPU RNG, and the RNG of every CUDA device.
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

### Data statistics

In [None]:
import argparse
import json

# Directory that holds the `train.jsonl`, `dev.jsonl` and `test.jsonl` files analysed below.
folder = 'datasets/outputs/nerel_common_labels'

def calculate(path):
    """Print corpus statistics for one JSON-lines NER file.

    Every non-blank line of ``path`` must be a JSON object with

    * ``tokens``: a list whose length is the sentence length, and
    * ``entity_mentions``: a list of objects with integer ``start`` and ``end``
      token offsets, where ``end`` is exclusive.

    The function prints the number of sentences, the average and maximum sentence
    length, the number of entities, the average and maximum entity length, the
    number of entities that share at least one token with another entity
    (reported as "nested"), and the total token count.

    The file is always read as UTF-8, blank lines are skipped, and the averages
    are reported as ``0.0`` when there are no sentences or no entities instead of
    raising ``ZeroDivisionError``.

    Args:
        path: Path to the ``.jsonl`` file.

    Returns:
        None. The statistics are printed.
    """
    max_sent_len = 0
    total_lengths = 0
    total_ent_length = 0
    max_ent_length = 0
    num_ents = 0
    overlapped_ent_num = 0
    num_sent = 0
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            data = json.loads(line)
            mentions = data['entity_mentions']
            sent_len = len(data['tokens'])

            num_sent += 1
            total_lengths += sent_len
            max_sent_len = max(max_sent_len, sent_len)

            # flags[i] counts how many entity mentions cover token i.
            flags = [0] * sent_len
            for ent in mentions:
                start, end = ent['start'], ent['end']
                ent_len = end - start
                num_ents += 1
                total_ent_length += ent_len
                max_ent_length = max(max_ent_length, ent_len)
                for i in range(start, end):
                    flags[i] += 1

            # A mention counts as overlapping when any of its tokens is covered more than once.
            for ent in mentions:
                if any(flags[i] > 1 for i in range(ent['start'], ent['end'])):
                    overlapped_ent_num += 1

    avg_sent_len = total_lengths / num_sent if num_sent else 0.0
    avg_ent_len = total_ent_length / num_ents if num_ents else 0.0

    print(f"For {path}")
    print("total sentence ", num_sent)
    print("average sentence length ", avg_sent_len)
    print("max sentence length ", max_sent_len)

    print('num_entities ', num_ents)
    print('average entity length  ', avg_ent_len)
    print('max entity length  ', max_ent_length)
    print("Number of nested entity ", overlapped_ent_num)

    print("Number of tokens ", total_lengths)
    print()


# Report the statistics of each dataset split stored under `folder`.
for name in ('train', 'dev', 'test'):
    path = '{}/{}.jsonl'.format(folder, name)
    calculate(path)

### Training

In [None]:
###### HYPERPARAMS ######
lr = 5e-6  # base learning rate; used unchanged for the 'pretrain_model' parameter groups
batch_size = 4  # batch size of every dataloader, also passed to the matrix padder
n_epochs = 10  # passed to Trainer(n_epochs=...)
warmup = 0.1  # passed to TorchWarmupCallback(warmup=...)
dataset_name = 'nerel_common_labels'  # sub-directory of datasets/outputs/ that is loaded
model_name = 'DeepPavlov/rubert-base-cased'  # pretrained checkpoint given to SpanNerPipe and CNNNer
# model_name = 'cointegrated/rubert-tiny2'
cnn_depth = 3  # passed to CNNNer(cnn_depth=...)
cnn_dim = 100  # passed to CNNNer(cnn_dim=...)
logit_drop = 0  # passed to CNNNer(logit_drop=...)
biaffine_size = 200  # passed to CNNNer(biaffine_size=...)
n_head = 4  # passed to CNNNer(n_head=...)
accumulation_steps = 1  # passed to Trainer(accumulation_steps=...)
########################
non_ptm_lr_ratio = 100  # parameters outside 'pretrain_model' are trained with lr * non_ptm_lr_ratio
schedule = 'linear'  # passed to TorchWarmupCallback(schedule=...)
weight_decay = 1e-2  # applied to parameter groups whose names contain neither 'norm' nor 'bias'
size_embed_dim = 25  # passed to CNNNer(size_embed_dim=...)
ent_thres = 0.5  # passed to NERMetric(ent_thres=...)
kernel_size = 3  # passed to CNNNer(kernel_size=...)
########################



@cache_results('caches/ner_caches.pkl', _refresh=False)
def get_data(dataset_name, model_name):
    """Run the span-NER preprocessing pipe on one dataset directory.

    Args:
        dataset_name: Name of the sub-directory of ``datasets/outputs/`` to process.
        model_name: Pretrained model name handed to ``SpanNerPipe``.

    Returns:
        A tuple ``(data_bundle, matrix_segs)``: the object returned by
        ``SpanNerPipe.process_from_file`` and the pipe's ``matrix_segs`` attribute.

    NOTE(review): the ``cache_results`` decorator presumably stores the return value
    under ``caches/ner_caches.pkl`` and reuses it while ``_refresh`` is False --
    confirm against the fastNLP documentation before relying on it.
    """
    ner_pipe = SpanNerPipe(model_name=model_name)
    data_bundle = ner_pipe.process_from_file(f'datasets/outputs/{dataset_name}')
    return data_bundle, ner_pipe.matrix_segs


# Preprocess the configured dataset with the tokenizer of the configured pretrained model.
dl, matrix_segs = get_data(dataset_name, model_name)

def densify(x):
    """Return the dense ``float32`` version of a sparse matrix.

    Args:
        x: Any object exposing ``todense()`` whose result supports ``astype``.

    Returns:
        The result of ``x.todense()`` cast to ``np.float32``.
    """
    dense = x.todense()
    return dense.astype(np.float32)


# Overwrite every 'matrix' field with the output of `densify` (same field name).
dl.apply_field(densify, field_name='matrix', new_field_name='matrix', progress_bar='Densify')

print(dl)

# The label mapping is read from `ner_vocab` when the bundle has it, otherwise from `label2idx`.
label_attr = 'ner_vocab' if hasattr(dl, 'ner_vocab') else 'label2idx'
label2idx = getattr(dl, label_attr)
print(f"{len(label2idx)} labels: {label2idx}, matrix_segs:{matrix_segs}")


# Build one dataloader per dataset split, keyed by split name.
dls = {}
for name, ds in dl.iter_datasets():
    # Custom padder for the 'matrix' field; it reuses the pad value already registered
    # for that field in the dataset's collator.
    matrix_padder = Torch3DMatrixPadder(pad_val=ds.collator.input_fields['matrix']['pad_val'],
                                        num_class=matrix_segs['ent'],
                                        batch_size=batch_size)
    ds.set_pad('matrix', pad_fn=matrix_padder)

    if name == 'train':
        # Training split: batches come from a BucketedBatchSampler keyed on 'input_ids'.
        sampler_kwargs = dict(batch_sampler=BucketedBatchSampler(ds, 'input_ids',
                                                                 batch_size=batch_size,
                                                                 num_batch_per_bucket=30),
                              shuffle=True)
    else:
        # Every other split: a SortedSampler keyed on 'input_ids', without shuffling.
        sampler_kwargs = dict(sampler=SortedSampler(ds, 'input_ids'), shuffle=False)

    _dl = prepare_torch_dataloader(ds, batch_size=batch_size, num_workers=0,
                                   pin_memory=True, **sampler_kwargs)
    dls[name] = _dl

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

                                                            

In [None]:
# Instantiate the CNNNer model from the pretrained checkpoint `model_name`; the number of
# NER tags is taken from the preprocessing result (matrix_segs['ent']), everything else
# from the hyperparameter cell.
model = CNNNer(model_name, num_ner_tag=matrix_segs['ent'], cnn_dim=cnn_dim, biaffine_size=biaffine_size,
               size_embed_dim=size_embed_dim, logit_drop=logit_drop,
               kernel_size=kernel_size, n_head=n_head, cnn_depth=cnn_depth)


# optimizer
parameters = []
ln_params = []
non_ln_params = []
non_pretrain_params = []
non_pretrain_ln_params = []

import collections

# Number of parameter elements per top-level sub-module of the model.
counter = collections.Counter()
for name, param in model.named_parameters():
    counter[name.partition('.')[0]] += param.numel()
print(counter)
print("Total param ", sum(counter.values()))

# Trainable parameters are split along two axes:
#   * whether the (lower-cased) name contains 'pretrain_model', and
#   * whether it contains 'norm' or 'bias' (those groups get no weight decay).
param_groups_by_kind = {
    (True, True): ln_params,
    (True, False): non_ln_params,
    (False, True): non_pretrain_ln_params,
    (False, False): non_pretrain_params,
}
for name, param in model.named_parameters():
    name = name.lower()
    if not param.requires_grad:
        continue
    in_pretrained = 'pretrain_model' in name
    skip_decay = 'norm' in name or 'bias' in name
    param_groups_by_kind[(in_pretrained, skip_decay)].append(param)

# Parameters outside the pretrained model train with a scaled-up learning rate.
non_ptm_lr = lr * non_ptm_lr_ratio
optimizer = torch.optim.AdamW([
    {'params': non_ln_params, 'lr': lr, 'weight_decay': weight_decay},
    {'params': ln_params, 'lr': lr, 'weight_decay': 0},
    {'params': non_pretrain_ln_params, 'lr': non_ptm_lr, 'weight_decay': 0},
    {'params': non_pretrain_params, 'lr': non_ptm_lr, 'weight_decay': weight_decay},
])
# callbacks: gradient clipping at 5, the warm-up / LR schedule configured in the
# hyperparameter cell, and LoadBestModelCallback.
callbacks = [
    TorchGradClipCallback(clip_value=5),
    TorchWarmupCallback(warmup=warmup, schedule=schedule),
    LoadBestModelCallback(),
]

# Evaluate on whichever of the 'dev' and 'test' dataloaders exist (in that order).
evaluate_dls = {split: dls[split] for split in ('dev', 'test') if split in dls}

# Single metric registered under the key 'f'; that key is the first component of the
# Trainer's `monitor` string below.
allow_nested = True
metrics = {'f': NERMetric(matrix_segs=matrix_segs, ent_thres=ent_thres, allow_nested=allow_nested)}

# Assemble the fastNLP Trainer from the objects built above.
# NOTE(review): the per-argument remarks below follow the fastNLP 1.0 Trainer
# documentation as far as recalled -- confirm against the installed version.
trainer = Trainer(model=model,
                  driver='torch',
                  train_dataloader=dls.get('train'),  # None if no 'train' split was loaded
                  evaluate_dataloaders=evaluate_dls,
                  optimizers=optimizer,
                  callbacks=callbacks,
                  overfit_batches=0,
                  device=0,
                  n_epochs=n_epochs,
                  metrics=metrics,
                  monitor='f#f#dev',  # presumably '<indicator>#<metric key>#<dataloader name>'
                  evaluate_every=-1,  # presumably: negative value = evaluate every |n| epochs
                  evaluate_use_dist_sampler=True,
                  accumulation_steps=accumulation_steps,
                  fp16=False,
                  progress_bar='tqdm')


# Create the output directory BEFORE training: torch.save does not create missing
# parent directories, so without this a missing `weights/` folder would only surface
# as an error after the complete training run has finished.
model_save_path = 'weights/model_best.pkl'
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# -1 batches per epoch / per evaluation dataloader, and a 1-batch evaluation sanity check.
trainer.run(num_train_batch_per_epoch=-1, num_eval_batch_per_dl=-1, num_eval_sanity_batch=1)

# NOTE: this pickles the whole module object rather than a state_dict, so loading the
# file later requires the `model` package of this project to be importable.
torch.save(model, model_save_path)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Eval. on dev Batch:   0%|          | 0/1 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch:0:   0%|          | [00:00<?, ?it/s, ]

Batch:   0%|          | 0/996 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]

Eval. on dev Batch:   0%|          | 0/119 [00:00<?, ?it/s]

Eval. on test Batch:   0%|          | 0/118 [00:00<?, ?it/s]