In [1]:
import os
import sys

In [2]:
# Put the parent of the current working directory on the import path
# (presumably so the `src` package imported in the next cell can be found).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    # Only append once so re-running the cell does not grow sys.path.
    sys.path.append(nb_dir)

In [3]:
import datasets
from torch.utils.data import DataLoader
import torch

from functools import partial
import time

from hydra import compose, initialize
from omegaconf import OmegaConf

from src.dataset import RoBERTaDataset
from src.model import RoBERTaNER, RoBERTaAdapter, roberta_embedding_size
from src.train import RoBERTaTrainer

In [4]:
# Directory handed to hydra's `initialize` as `config_path`.
path_config = "../config"
# Location handed to `datasets.load_from_disk` for the processed dataset.
path_load_process_dataset = "../data/dataset/processed"

In [5]:
# Compose the "experiment" config through Hydra's Compose API.
# `initialize` is used as a context manager: called as a bare function it
# initializes Hydra globally, and running this cell a second time in the same
# kernel then fails because GlobalHydra is already initialized.
with initialize(version_base=None, config_path=path_config):
    cfg = compose(config_name="experiment")

# Dump the resolved config as YAML so the run's settings are visible in the notebook.
print(OmegaConf.to_yaml(cfg))

model:
  ner_lin_size: 256
  count_tags: 10
  dropout: 0.2
  hidden_size_adapter: 32
processing:
  data:
    max_length: 512
train:
  epoch: 3
  batch_size: 16
  lr: 1.0e-05
  print_step: 100
name: 1.0-RoBERTa-adapters
mlflow_server: http://127.0.0.1:5000
seed: 42





In [6]:
# Read the processed dataset back from disk; the bare name on the last line
# makes the notebook display its splits and features.
dataset = datasets.load_from_disk(dataset_path=path_load_process_dataset)
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'len'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'len'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'len'],
        num_rows: 3453
    })
})

In [7]:
# Shuffle the train and test splits with the seed declared in the experiment
# config (`cfg.seed`) instead of a duplicated literal, so changing the config
# seed changes every seeded step consistently.
train_dataset = dataset["train"].shuffle(seed=cfg.seed)
test_dataset = dataset["test"].shuffle(seed=cfg.seed)

In [8]:
# Wrap each shuffled split in the project's RoBERTaDataset, train first, then test.
train_data, test_data = (
    RoBERTaDataset(split) for split in (train_dataset, test_dataset)
)

In [9]:
# Batch size for both data loaders, taken from the `train` section of the config.
BATCH_SIZE = cfg["train"]["batch_size"]

In [10]:
# Training batches are reshuffled every epoch; the shuffle draws from a generator
# seeded with `cfg.seed` so the batch order is reproducible after a kernel restart.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(cfg.seed),
)
# The test loader is not shuffled and needs no generator.
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE)

In [11]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [12]:
# Tag count from the `model` section of the config; passed to the model constructor.
count_tags = cfg["model"]["count_tags"]

In [13]:
# Instantiate the NER model, passing the config's `ner_lin_size`, the tag count
# and the config's `dropout` value, in that positional order.
model = RoBERTaNER(cfg.model.ner_lin_size, count_tags, cfg.model.dropout)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Freeze every parameter whose name does not contain the substring "lin.".
# NOTE: this has to run before the adapters are attached. Adapter parameter names
# (e.g. "...output.lin0.weight") do not contain "lin.", so re-running this cell
# after the adapters exist would freeze them as well.
for name, param in model.named_parameters():
    if "lin." in name:
        continue
    param.requires_grad = False

In [15]:
# r: adapter hidden size from the config; h: embedding size exported by src.model.
r, h = cfg.model.hidden_size_adapter, roberta_embedding_size

# Factory that builds a RoBERTaAdapter with r and h pre-bound; the caller
# supplies only the remaining positional argument.
assign = partial(RoBERTaAdapter, h=h, r=r)

In [16]:
# Replace each encoder layer's `output` sub-module with an adapter built around it.
# The isinstance guard keeps the cell idempotent: without it, running the cell a
# second time would wrap the already-attached adapter in another adapter.
for layer in model.roberta.encoder.layer:
    if isinstance(layer.output, RoBERTaAdapter):
        continue
    layer.output = assign(layer.output)

In [17]:
# Display the module tree to inspect the model after the encoder layers'
# `output` modules have been replaced.
model

RoBERTaNER(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [18]:
# List the names of the parameters that will still receive gradients.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name)

roberta.encoder.layer.0.output.lin0.weight
roberta.encoder.layer.0.output.lin0.bias
roberta.encoder.layer.0.output.lin1.weight
roberta.encoder.layer.0.output.lin1.bias
roberta.encoder.layer.1.output.lin0.weight
roberta.encoder.layer.1.output.lin0.bias
roberta.encoder.layer.1.output.lin1.weight
roberta.encoder.layer.1.output.lin1.bias
roberta.encoder.layer.2.output.lin0.weight
roberta.encoder.layer.2.output.lin0.bias
roberta.encoder.layer.2.output.lin1.weight
roberta.encoder.layer.2.output.lin1.bias
roberta.encoder.layer.3.output.lin0.weight
roberta.encoder.layer.3.output.lin0.bias
roberta.encoder.layer.3.output.lin1.weight
roberta.encoder.layer.3.output.lin1.bias
roberta.encoder.layer.4.output.lin0.weight
roberta.encoder.layer.4.output.lin0.bias
roberta.encoder.layer.4.output.lin1.weight
roberta.encoder.layer.4.output.lin1.bias
roberta.encoder.layer.5.output.lin0.weight
roberta.encoder.layer.5.output.lin0.bias
roberta.encoder.layer.5.output.lin1.weight
roberta.encoder.layer.5.output.li

In [19]:
# Learning rate and epoch count, both read from the `train` section of the config.
lr, EPOCHS = cfg.train.lr, cfg.train.epoch

In [20]:
# Build the trainer from the adapter-equipped model and the configured learning rate.
# NOTE(review): `device` computed earlier is not passed here or used anywhere else
# in this notebook; confirm RoBERTaTrainer handles device placement itself.
trainer = RoBERTaTrainer(model, lr)

In [None]:
# Start training, passing the training loader and the configured epoch count.
trainer.train(train_loader, EPOCHS)

In [None]:
# Evaluate the trainer's model on the loader built from the test split.
# NOTE(review): this cell has no execution count and the saved output shows a
# KeyboardInterrupt, so presumably the run was stopped before results were
# produced; re-run to completion before sharing.
trainer.evaluate(test_loader)

KeyboardInterrupt: 