In [1]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np

from model_new import BertForTokenClassification
import utils.NERutils as nu

from transformers import AutoConfig, AutoTokenizer

#### Define tokenizer

In [2]:
bert_model_name = "bert-base-multilingual-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

#### Load Datasets

In [3]:
train_path = "data/train.parquet"
dev_path = "data/dev.parquet"
test_path = "data/test.parquet"

In [4]:
#train_dataset = nu.NERdataset(dataset_path=train_path, tokenizer=bert_tokenizer)
#dev_dataset = nu.NERdataset(dataset_path=dev_path, tokenizer=bert_tokenizer)
train_dataset = nu.NERdataset(dataset_path=test_path, tokenizer=bert_tokenizer)

#### Get pretrained model

In [6]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [15]:
# Config
bert_model_name = "bert-base-multilingual-cased"
bert_config = AutoConfig.from_pretrained(
    bert_model_name, 
    num_labels=len(train_dataset.tags), 
    id2label=train_dataset.index2tag, 
    label2id=train_dataset.tag2index,
    # Documentation for choice of dropout rate
    # https://link.springer.com/chapter/10.1007/978-3-030-76508-8_11?fbclid=IwAR38oKZ3d1Acz_1CNUoREB_RaLxtWkubFz3fqDiF6sx7FinAHV5YCjmsjuE_aem_AdkEEeEHsMn0sKCFV8Dbd_TmM2axzkTqA3mSvxLfqS_wxh8aSumAfKgDi1ciNn8o05-GDI_VC42loqlrs-HKi8Ui
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.5
)

model = BertForTokenClassification.from_pretrained(bert_model_name, config=bert_config, tags=train_dataset.tags).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Finetune

In [8]:
train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 0,
                }

data_loader = torch.utils.data.DataLoader(train_dataset, **train_params)

In [19]:
len(data_loader.dataset)

1462

In [12]:
num_epochs = 1
learning_rate = 1e-05
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

model.fit(num_epochs, data_loader, device, optimizer)

  0%|          | 0/183 [00:00<?, ?it/s]

  1%|          | 1/183 [01:44<5:18:09, 104.89s/it]

2.3854479789733887
92.0


  1%|          | 1/183 [03:37<11:00:25, 217.72s/it]


KeyboardInterrupt: 

In [13]:
model.training_acc

[]

In [14]:
model.training_loss

[]