In [1]:
import os
import sys

In [2]:
# Put the parent of the current working directory on the import path
# (once), so modules located one level above the notebook can be imported.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [3]:
import datasets
from transformers import RobertaTokenizer

from hydra import compose, initialize
from omegaconf import OmegaConf

In [4]:
# Relative locations used by the cells below.
path_config = "../config"  # directory handed to Hydra's initialize()
path_save_raw_dataset = "../data/dataset/raw"  # dataset as downloaded
path_save_process_dataset = "../data/dataset/processed"  # dataset after tokenization

In [5]:
# Compose the "experiment" config with Hydra's Compose API.
# `initialize` is used as a context manager so Hydra's global state is
# restored when the block exits; a bare global `initialize(...)` call makes
# this cell fail on re-run with "GlobalHydra is already initialized".
with initialize(version_base=None, config_path=path_config):
    cfg = compose(config_name="experiment")

# Show the resolved configuration for the record.
print(OmegaConf.to_yaml(cfg))

model:
  ner_lin_size: 256
  count_tags: 10
  dropout: 0.2
  hidden_size_adapter: 32
processing:
  data:
    max_length: 512
train:
  epoch: 8
  batch_sizes:
  - 32
  - 64
  - 128
  - 256
  batch_milestones:
  - 2
  - 4
  - 7
  lr_0: 0.001
  lr_milestones:
  - 2
  - 4
  - 7
  gamma: 0.464159
  epoch_emb_requires_grad: 4
  print_step: 100
name: 1.0-RoBERTa-adapters
mlflow_server: http://127.0.0.1:5000
seed: 42





In [6]:
# Fetch CoNLL-2003 and store an untouched copy on disk before any processing.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to execute locally — confirm the source is trusted.
dataset = datasets.load_dataset(
    "eriktks/conll2003",
    trust_remote_code=True,
)
dataset.save_to_disk(path_save_raw_dataset)

Saving the dataset (0/1 shards):   0%|          | 0/14041 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
# Load the pretrained RoBERTa tokenizer.
# add_prefix_space=True: presumably so the first word of each text is encoded
# like the following (space-prefixed) words — see the RobertaTokenizer docs.
# NOTE(review): truncation is also requested explicitly at call time in
# tokenize_text; verify the keyword has any effect when passed here.
tokenizer = RobertaTokenizer.from_pretrained(
    "roberta-base",
    truncation=True,
    add_prefix_space=True,
)

In [8]:
# Fixed sequence length (taken from the Hydra config) used by tokenize_text
# for both tokenizer padding/truncation and the tag sequences.
MAX_LENGTH = cfg.processing.data.max_length

In [9]:
def tokenize_text(batch):
  """Tokenize a batch of examples and build fixed-length NER tag sequences.

  Intended to be used with `dataset.map(tokenize_text, batched=True)`.

  Args:
    batch: mapping with
      "tokens"   - list of examples, each a list of word strings;
      "ner_tags" - list of examples, each a list of integer tag ids.

  Returns:
    The tokenizer output for the space-joined words, extended with:
      "len"      - number of tags of each example before truncation/padding;
      "ner_tags" - tag ids shifted by +1 (so that 0 is reserved for padding),
                   truncated or right-padded with 0 to exactly MAX_LENGTH.
  """
  texts = [" ".join(words) for words in batch["tokens"]]
  encoded = tokenizer(texts,
                      truncation=True,
                      padding='max_length',
                      return_token_type_ids=True,
                      max_length=MAX_LENGTH)

  encoded["len"] = [len(tags) for tags in batch["ner_tags"]]

  # Shift every tag by one *before* padding: 0 then unambiguously means
  # "padding" and never collides with the original tag id 0.
  # NOTE(review): tags are per word while input_ids are per tokenizer token;
  # no alignment between the two is done here — confirm downstream handles it.
  padded_tags = []
  for tags in batch["ner_tags"]:
    shifted = [tag + 1 for tag in tags[:MAX_LENGTH]]
    padded_tags.append(shifted + [0] * (MAX_LENGTH - len(shifted)))
  encoded["ner_tags"] = padded_tags

  return encoded

In [10]:
# Apply tokenize_text batch-wise to every split of the DatasetDict.
# NOTE: `dataset` is rebound in place; tokenize_text reads the "tokens"
# column, so this cell cannot be re-run once that column has been removed.
dataset = dataset.map(tokenize_text, batched=True)

In [11]:
# Drop the source columns that are not kept in the processed dataset.
dataset = dataset.remove_columns(["id", "tokens", "chunk_tags", "pos_tags"])

In [12]:
# Display the splits with their remaining features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'len'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'len'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'len'],
        num_rows: 3453
    })
})

In [13]:
# Columns to expose as torch tensors when examples are read from the dataset.
columns = [
    "ner_tags",
    "input_ids",
    "attention_mask",
    "token_type_ids",
    "len",
]

dataset.set_format(type="torch", columns=columns)

In [15]:
# Persist the tokenized dataset (all splits) to the processed-data directory.
dataset.save_to_disk(path_save_process_dataset)

Saving the dataset (0/1 shards):   0%|          | 0/14041 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3453 [00:00<?, ? examples/s]