In [1]:
from custom_datasets.mlm_ft_dataset import MLMFTDataset
from transformers import (
    BertTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments,
    logging,
)

from readers import lenta_reader, ria_reader, tg_reader

import torch
import tqdm
import wandb

In [2]:
# Local directory of the pretrained RuBERT (cased) checkpoint; the tokenizer
# vocabulary and the model config/weights are both loaded from here.
model_path = '/home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/'

In [3]:
# Switch the transformers logger to INFO so file-loading and training
# progress messages are printed in the notebook.
logging.set_verbosity_info()

In [4]:
# Tokenizer for the cased checkpoint: keep the original casing and disable
# the basic pre-tokenization step.
tokenizer = BertTokenizer.from_pretrained(
    model_path,
    do_lower_case=False,
    do_basic_tokenize=False,
)

# Masked-LM model initialised from the same checkpoint directory.
model = AutoModelForMaskedLM.from_pretrained(model_path)

# Size the embedding matrix to the tokenizer's vocabulary; left as the last
# expression so the resulting Embedding is displayed.
model.resize_token_embeddings(len(tokenizer))

Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/added_tokens.json. We won't load it.
Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/special_tokens_map.json. We won't load it.
Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/tokenizer_config.json. We won't load it.
Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/tokenizer.json. We won't load it.
loading file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max

Embedding(119547, 768, padding_idx=0)

In [5]:
# Batch collator for masked language modelling with a 0.15 masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

## TG (Telegram news corpus)

In [6]:
# Read the first Telegram dump fully into memory; tqdm shows reading progress.
tg_records = list(
    tqdm.tqdm(tg_reader('/home/aobuhtijarov/datasets/telegram_news/ru_tg_1101_0510.jsonl'))
)

484037it [04:55, 1638.17it/s]


484037

In [8]:
# Append the second Telegram dump to the records already loaded.
# NOTE: this mutates tg_records in place, so re-running the cell appends the
# same records again.
tg_records += list(
    tqdm.tqdm(tg_reader('/home/aobuhtijarov/datasets/telegram_news/ru_tg_0511_0517.jsonl'))
)

120050it [01:14, 1604.76it/s]


In [9]:
# Build the MLM fine-tuning dataset from the 'text' field of every TG record.
full_dataset = MLMFTDataset(
    [record['text'] for record in tg_records],
    tokenizer,
)

# Display the dataset size.
len(full_dataset)

604087

## Lenta + RIA (alternative to the TG section — both assign `full_dataset`, so run only one of them)

In [6]:
# Load both news corpora into memory, with a tqdm progress bar for each.
lenta_records = list(
    tqdm.tqdm(lenta_reader('/home/aobuhtijarov/datasets/lenta/lenta-ru-news.val.csv'))
)
ria_records = list(
    tqdm.tqdm(ria_reader('/home/aobuhtijarov/datasets/ria/ria.shuffled.train.json'))
)

# Keep only Lenta records whose 'date' string starts with a year in 2010-2014.
lenta_records = [
    record
    for record in lenta_records
    if record['date'][:4] in ('2010', '2011', '2012', '2013', '2014')
]



75971it [00:03, 24049.66it/s]
858741it [10:59, 1302.21it/s]


In [7]:
# Build the MLM fine-tuning dataset from the 'text' field of the filtered
# Lenta records followed by the RIA records.
full_dataset = MLMFTDataset(
    [record['text'] for record in lenta_records + ria_records],
    tokenizer,
)

## Split and train

In [10]:
# Share of the corpus used for training; the remainder is divided between
# the test split and the eval split.
train_fraq = 0.97

total_size = len(full_dataset)
train_size = int(train_fraq * total_size)
test_size = int((1 - train_fraq) * 0.5 * total_size)
# The eval split takes whatever is left, so the three sizes always sum to
# total_size despite the int() truncation above.
eval_size = total_size - train_size - test_size

# Seed the split explicitly. Without a fixed generator every kernel restart
# draws a different partition, so held-out examples of one run can land in
# the training set of the next (e.g. when resuming from a saved checkpoint)
# and test/eval numbers are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)

train_dataset, test_dataset, eval_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size, eval_size],
    generator=split_generator,
)


In [11]:
# Authenticate with Weights & Biases, then open the run that collects this
# experiment's metrics. The trailing ';' suppresses the cell's output.
wandb.login()
wandb.init(
    name='RuBERT TG fine tuning on text',
    project='master-thesis',
);

wandb: Currently logged in as: leshanbog (use `wandb login --relogin` to force relogin)


In [12]:
# Hyperparameters of this run, recorded on the W&B run config so they are
# logged with the metrics and can be read back when building TrainingArguments.
hyperparameters = {
    'per_device_train_batch_size': 4,
    'gradient_accumulation_steps': 16,
    'learning_rate': 3e-5,
    'warmup_steps': 500,
    'logging_steps': 25,
    'eval_steps': 100,
    'save_steps': 100,
    'max_steps': 2000,
    'weight_decay': 0.01,
}

# Assign one attribute at a time, in the order listed above.
for option_name, option_value in hyperparameters.items():
    setattr(wandb.run.config, option_name, option_value)

In [13]:
# Shorthand for the hyperparameters stored on the active W&B run.
run_cfg = wandb.run.config

training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir='./bert_ft_on_tg_text',
    overwrite_output_dir=False,
    save_steps=run_cfg.save_steps,
    save_total_limit=1,
    # --- what to run ---
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=run_cfg.eval_steps,
    max_steps=run_cfg.max_steps,
    # --- batching ---
    per_device_train_batch_size=run_cfg.per_device_train_batch_size,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=run_cfg.gradient_accumulation_steps,
    # --- optimisation ---
    learning_rate=run_cfg.learning_rate,
    warmup_steps=run_cfg.warmup_steps,
    weight_decay=run_cfg.weight_decay,
    # --- reporting ---
    logging_steps=run_cfg.logging_steps,
    report_to='wandb',
)

PyTorch: setting up devices


In [14]:
# Wire the model, tokenizer, MLM collator and data splits into one Trainer.
# Periodic evaluation during training uses eval_dataset; test_dataset is
# deliberately not passed here.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Start fine-tuning; the number of optimisation steps is bounded by
# training_args.max_steps.
trainer.train()

***** Running training *****
  Num examples = 585964
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 2000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


In [None]:
# Evaluate the trained model on the held-out test split (passed explicitly,
# overriding the eval_dataset the Trainer was constructed with).
res = trainer.evaluate(eval_dataset=test_dataset)

In [None]:
# Store the test-split evaluation result in the W&B run summary under the
# 'Test eval' key.
wandb.summary.update({'Test eval': res})

In [None]:
# Mark the W&B run as finished.
wandb.finish()