In [1]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np

from model_new import BertForTokenClassification
import utils.NERutils as nu
from torch.utils.data import SubsetRandomSampler

from transformers import AutoConfig, AutoTokenizer

#### Define tokenizer

In [2]:
bert_model_name = "bert-base-multilingual-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

#### Load Datasets

In [3]:
train_path = "data/train.parquet"
dev_path = "data/dev.parquet"
test_path = "data/test.parquet"

In [4]:
#train_dataset = nu.NERdataset(dataset_path=train_path, tokenizer=bert_tokenizer)
#dev_dataset = nu.NERdataset(dataset_path=dev_path, tokenizer=bert_tokenizer)
train_dataset = nu.NERdataset(dataset_path=test_path, tokenizer=bert_tokenizer)

#### Get pretrained model

In [5]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Config
bert_model_name = "bert-base-multilingual-cased"
bert_config = AutoConfig.from_pretrained(
    bert_model_name, 
    num_labels=len(train_dataset.tags), 
    id2label=train_dataset.index2tag, 
    label2id=train_dataset.tag2index,
    # Documentation for choice of dropout rate
    # https://link.springer.com/chapter/10.1007/978-3-030-76508-8_11?fbclid=IwAR38oKZ3d1Acz_1CNUoREB_RaLxtWkubFz3fqDiF6sx7FinAHV5YCjmsjuE_aem_AdkEEeEHsMn0sKCFV8Dbd_TmM2axzkTqA3mSvxLfqS_wxh8aSumAfKgDi1ciNn8o05-GDI_VC42loqlrs-HKi8Ui
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.5
)

model = BertForTokenClassification.from_pretrained(bert_model_name, config=bert_config, tags=train_dataset.tags).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Finetune

In [11]:
train_params = {'batch_size': 4,
                #'shuffle': True,
                'num_workers': 0,
                'sampler': SubsetRandomSampler([i for i in range(100)])
                }

data_loader = torch.utils.data.DataLoader(train_dataset, **train_params)

In [12]:
len(data_loader.dataset)

1462

In [19]:
num_epochs = 1
learning_rate = 1e-05
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

model.fit(num_epochs, data_loader, data_loader, device, optimizer)

Train


  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [04:31<00:00, 10.86s/it]
  4%|▍         | 1/25 [05:13<2:05:18, 313.29s/it]

[tensor([[ 0.4185, -0.1953, -0.4587,  ...,  0.1215, -0.0791,  0.3905],
        [ 0.4023, -0.1668, -0.4081,  ...,  0.0319,  0.1861, -0.1684],
        [ 0.3278, -0.2758, -0.5233,  ...,  0.0880, -0.1980,  0.0895],
        ...,
        [ 0.4437, -0.3013, -0.5109,  ...,  0.0279, -0.0147,  0.1379],
        [ 0.4247, -0.3002, -0.4987,  ..., -0.0229,  0.0181,  0.1188],
        [ 0.3160, -0.2320, -0.4336,  ..., -0.0785,  0.0956,  0.0866]]), tensor([[ 0.2654, -0.2520, -0.4183,  ...,  0.1913, -0.2865,  0.2810],
        [-0.2498, -0.2477, -0.2435,  ...,  0.1696, -0.4233,  0.1498],
        [-0.5347, -0.2223, -0.0689,  ...,  0.2379, -0.1189, -0.0452],
        ...,
        [ 0.3620, -0.3987, -0.4511,  ...,  0.0840, -0.2531,  0.0631],
        [ 0.3712, -0.3884, -0.4515,  ...,  0.0832, -0.2584,  0.0668],
        [ 0.3591, -0.3651, -0.4493,  ...,  0.0888, -0.2505,  0.0882]]), tensor([[ 0.0584, -0.1744, -0.2378,  ...,  0.2644, -0.3350,  0.4764],
        [-0.4137, -0.2719, -0.4093,  ...,  0.3626, -0.0665,

 28%|██▊       | 7/25 [01:29<03:50, 12.79s/it]
  4%|▍         | 1/25 [07:07<2:50:52, 427.18s/it]


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\idawe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\idawe\AppData\Local\Temp\ipykernel_12476\1189977983.py", line 5, in <module>
    model.fit(num_epochs, data_loader, data_loader, device, optimizer)
  File "c:\Users\idawe\Desktop\GitHub\CrossNER-Active-Learning\model_new.py", line 295, in fit
    print("Train")
  File "c:\Users\idawe\Desktop\GitHub\CrossNER-Active-Learning\model_new.py", line 228, in train_loop_cartography
    print("Cartography calculations")
                             ^^^^^^^^^
  File "c:\Users\idawe\Desktop\GitHub\CrossNER-Active-Learning\model_new.py", line 342, in predict
  File "C:\Users\idawe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site

In [22]:
for batch in data_loader:
    print(batch["index"])

tensor([96, 85,  3, 58])
tensor([97, 64, 69, 16])
tensor([62,  1, 20, 89])
tensor([61, 12, 71, 80])
tensor([90, 79,  6, 74])
tensor([39, 91, 67, 41])
tensor([48, 40, 86, 57])
tensor([76, 26, 28, 70])
tensor([68, 43, 44, 10])
tensor([75, 36, 54, 53])
tensor([14, 94, 21, 59])
tensor([ 8, 35, 42, 17])
tensor([83, 18, 82, 29])
tensor([93,  7, 77,  9])
tensor([78, 65, 66,  4])
tensor([60,  2, 52, 98])
tensor([34, 15, 87, 47])
tensor([11, 32, 73, 25])
tensor([38, 50, 19, 13])
tensor([72, 63, 81, 45])
tensor([27, 51, 95, 23])
tensor([33, 31, 24, 92])
tensor([88, 30, 22, 99])
tensor([55, 56,  0, 84])
tensor([37, 49,  5, 46])


In [23]:
for item in data_loader:
    print(item)

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'input_ids': tensor([[  101,   148, 10774,  ...,     0,     0,     0],
        [  101, 18082, 10162,  ...,     0,     0,     0],
        [  101,   100, 19441,  ...,     0,     0,     0],
        [  101, 30797, 12754,  ...,     0,     0,     0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'index': tensor([ 8, 17, 74, 35])}
{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'input_ids': tensor([[  101,   144, 10451,  ...,     0,     0,     0],
        [  101,   145, 16678,

In [14]:
for batch in data_loader:

Unnamed: 0,text,ents,sents,tokens,spans,dagw_source,dagw_domain,dagw_source_full
0,Henrik Dahl: Feminister er rene og skære nasse...,"[{'start': 0, 'end': 11, 'label': 'PERSON'}, {...","[{'start': 0, 'end': 53}]","[{'id': 0, 'start': 0, 'end': 6}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
1,27 meters frit fald fra Operaens tag –,"[{'start': 0, 'end': 9, 'label': 'QUANTITY'}, ...","[{'start': 0, 'end': 38}]","[{'id': 0, 'start': 0, 'end': 2}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
2,det må\n,[],"[{'start': 0, 'end': 7}]","[{'id': 0, 'start': 0, 'end': 3}, {'id': 1, 's...",{'incorrect_spans': []},retspraksis,Legal,retspraksis (Danish legal information)
3,Taler 9: jeg er mest på 1,"[{'start': 6, 'end': 7, 'label': 'CARDINAL'}, ...","[{'start': 0, 'end': 25}]","[{'id': 0, 'start': 0, 'end': 5}, {'id': 1, 's...",{'incorrect_spans': []},spont,Conversation,Spontaneous speech
4,25/9:9:00 - 16:30,"[{'start': 12, 'end': 17, 'label': 'TIME'}]","[{'start': 0, 'end': 17}]","[{'id': 0, 'start': 0, 'end': 9}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
...,...,...,...,...,...,...,...,...
1457,september 2018 02,"[{'start': 0, 'end': 14, 'label': 'DATE'}, {'s...","[{'start': 0, 'end': 17}]","[{'id': 0, 'start': 0, 'end': 9}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
1458,Spillestedet er Kolind+ / Kolind Hallen,"[{'start': 16, 'end': 23, 'label': 'FACILITY'}...","[{'start': 0, 'end': 39}]","[{'id': 0, 'start': 0, 'end': 12}, {'id': 1, '...",{'incorrect_spans': []},cc,Web,Common Crawl
1459,Hæsinge Kirke med Karen Møller som prædikant o...,"[{'start': 0, 'end': 13, 'label': 'FACILITY'},...","[{'start': 0, 'end': 61}]","[{'id': 0, 'start': 0, 'end': 7}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
1460,"Gtx® 1169,00 Kr Gem Nyheder Allrounder by Meph...","[{'start': 5, 'end': 15, 'label': 'MONEY'}, {'...","[{'start': 0, 'end': 94}]","[{'id': 0, 'start': 0, 'end': 3}, {'id': 1, 's...",{'incorrect_spans': []},cc,Web,Common Crawl
