In [1]:
import argparse
import json
import random
import tqdm
import torch
import wandb
import numpy as np
import os
from torch.utils.data import Dataset

from _jsonnet import evaluate_file as jsonnet_evaluate_file
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, logging

from readers import tg_reader, ria_reader, lenta_reader
from custom_datasets import AgencyTitleDatasetClassification, LentaRiaDatasetClassification
from utils.training_utils import get_separate_lr_optimizer, init_wandb

In [2]:
class LentaRiaPairedDatasetClassification(Dataset):
    """Title-pair ordering dataset built from a JSONL file of Lenta/RIA records.

    Every record yields two examples made of the same two titles joined by
    ' [SEP] ':

    * even index -> Lenta title first, RIA title second, label 0
    * odd index  -> RIA title first, Lenta title second, label 1
    """

    def __init__(
        self,
        path,
        tokenizer,
        cur_slice,
        max_tokens=100,
        expected_num_records=75362,
    ):
        """Load the records, validate their count and keep the requested slice.

        Args:
            path: JSONL file with one JSON object per line. Each object must
                provide the keys 'lenta_title', 'ria_title' and 'lenta_date'.
            tokenizer: Callable invoked with the joined title string plus the
                keyword arguments ``add_special_tokens``, ``max_length``,
                ``padding`` and ``truncation``; its result must expose
                "input_ids" and "attention_mask".
            cur_slice: ``slice`` applied to the full list of records to select
                this split.
            max_tokens: Length every example is padded/truncated to.
            expected_num_records: Number of records the file must contain
                before slicing. The split boundaries used by callers are
                absolute positions, so a file of a different size would
                silently shift the splits. Pass ``None`` to skip the check.

        Raises:
            AssertionError: If the file does not contain
                ``expected_num_records`` records.
        """
        # Titles are Cyrillic; pin the encoding instead of relying on the
        # platform default. Blank lines are skipped rather than fed to json.
        with open(path, 'r', encoding='utf-8') as f:
            records = [json.loads(line) for line in f if line.strip()]

        if expected_num_records is not None:
            assert len(records) == expected_num_records, (
                'expected %d records in %s, found %d'
                % (expected_num_records, path, len(records))
            )

        self.records = records[cur_slice]
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens

    def __len__(self):
        """Two examples (one per title order) are produced for each record."""
        return len(self.records) * 2

    def _ordered_pair(self, index):
        """Return ``(record, first_title, second_title, label)`` for ``index``."""
        record = self.records[index // 2]
        if index % 2 == 1:
            return record, record['ria_title'], record['lenta_title'], 1
        return record, record['lenta_title'], record['ria_title'], 0

    def __getitem__(self, index):
        """Tokenize the title pair for ``index`` and return model inputs.

        Returns:
            dict with tensors "input_ids" and "attention_mask" and an integer
            "labels" entry (0 = Lenta title first, 1 = RIA title first).
        """
        _, first, second, target = self._ordered_pair(index)

        inputs = self.tokenizer(
            ' [SEP] '.join([first, second]),
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation=True
        )

        return {
            "input_ids": torch.tensor(inputs["input_ids"]),
            "attention_mask": torch.tensor(inputs["attention_mask"]),
            "labels": target
        }

    def get_strings(self, index):
        """Return the raw joined title string and the Lenta date for ``index``."""
        record, first, second, _ = self._ordered_pair(index)
        return {
            'title': ' [SEP] '.join([first, second]),
            'date': record['lenta_date'],
        }


In [3]:
def compute_metrics(pred):
    """Compute classification accuracy for an evaluation result.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, argmax taken over the last
            axis).

    Returns:
        dict with a single key 'accuracy': the fraction of examples whose
        highest-scoring class equals the label.
    """
    predicted_classes = pred.predictions.argmax(-1)
    is_correct = pred.label_ids == predicted_classes
    return {'accuracy': np.mean(is_correct)}

In [4]:
# `logging` here is the transformers logging module (see imports), not the
# stdlib one; switch it to INFO so model/tokenizer loading steps are printed.
logging.set_verbosity_info()


In [5]:
# JSONL file that all three splits (train / val / test) are sliced from.
train_file = '../../datasets/full_lenta_ria.train.jsonl'
# Checkpoint the classifier is initialised from.
# NOTE(review): absolute, machine-specific path — adjust before re-running elsewhere.
model_path = '/home/aobuhtijarov/master-thesis/src/bert_ft_on_tg_text/checkpoint-2000/'
# Used both as the Trainer output directory and as the final save location.
output_model_path = '/home/aobuhtijarov/models/paired_clf_from_rubert'

In [6]:
# Run configuration as Jsonnet source; later cells write it to a file and
# evaluate it. It is Jsonnet rather than strict JSON (note the trailing comma
# after the last entry). With batch_size 4 and gradient_accumulation_steps 64
# the effective train batch size is 256.
c = '''{
    "tokenizer_model_path": "/home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/",
    "agency_list": ["РИА Новости", "lenta.ru"],
    "max_tokens_text": 250,
    "max_tokens_title": 48,
    "gradient_accumulation_steps": 64,
    "batch_size": 4,
    "eval_steps": 250,
    "save_steps": 500,
    "logging_steps": 100,
    "learning_rate": 0.00003,
    "num_warmup_steps": 1500,
    "max_steps": 5000,
}
'''

In [7]:
# Persist the Jsonnet config so it can be evaluated from a file.
# The config contains non-ASCII text (agency names), so the encoding is pinned
# to UTF-8 instead of being left to the platform default.
with open('temp_conf.jsonnet', 'w', encoding='utf-8') as f:
    f.write(c)

In [8]:
# Evaluate the Jsonnet file to a JSON string and parse it into a plain dict.
config = json.loads(jsonnet_evaluate_file('temp_conf.jsonnet'))

In [9]:
# init_wandb is a project helper from utils.training_utils; presumably it
# starts a W&B run with this name and logs `config` — confirm against the helper.
init_wandb('paired-discriminator-from-tg-text', config)

wandb: Currently logged in as: leshanbog (use `wandb login --relogin` to force relogin)


In [10]:
# Values read from the run configuration.
agency_list = config['agency_list']
print('Agency list:', agency_list)

max_tokens_text, max_tokens_title = (
    config["max_tokens_text"],
    config["max_tokens_title"],
)

# Cased vocabulary: keep the original casing and skip basic tokenization.
tokenizer_model_path = config["tokenizer_model_path"]
tokenizer = BertTokenizer.from_pretrained(
    tokenizer_model_path,
    do_lower_case=False,
    do_basic_tokenize=False,
)

Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/added_tokens.json. We won't load it.
Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/special_tokens_map.json. We won't load it.
Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/tokenizer_config.json. We won't load it.
Didn't find file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/tokenizer.json. We won't load it.
loading file /home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/vocab.txt
loading file None
loading file None
loading file None
loading file None


Agency list: ['РИА Новости', 'lenta.ru']


In [11]:
# Record boundaries of the three splits. They are defined once so the splits
# are contiguous and disjoint by construction. Previously the training slice
# ended at 70800 while the validation slice started at 70000, so 800 records
# were used for both training and validation. The training range is kept
# as-is and validation now starts where training ends.
TRAIN_END = 70800
VAL_END = 73801
TOTAL_RECORDS = 75362  # same value the dataset class checks the file against
MAX_PAIR_TOKENS = 100  # padded/truncated length of each tokenized title pair

train_dataset = LentaRiaPairedDatasetClassification(
    train_file, tokenizer, slice(0, TRAIN_END), MAX_PAIR_TOKENS
)

val_dataset = LentaRiaPairedDatasetClassification(
    train_file, tokenizer, slice(TRAIN_END, VAL_END), MAX_PAIR_TOKENS
)

test_dataset = LentaRiaPairedDatasetClassification(
    train_file, tokenizer, slice(VAL_END, TOTAL_RECORDS), MAX_PAIR_TOKENS
)

In [12]:
# Record the number of examples in each split in the W&B run summary.
wandb.summary.update(dict(zip(
    ('Train dataset size', 'Val dataset size', 'Test dataset size'),
    map(len, (train_dataset, val_dataset, test_dataset)),
)))

In [13]:
# Build a two-label sequence classifier from the checkpoint at `model_path`.
# NOTE(review): the checkpoint's config lists BertForMaskedLM as its
# architecture, so the classification head is presumably newly initialised
# rather than loaded — confirm from the loading log.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

loading configuration file /home/aobuhtijarov/master-thesis/src/bert_ft_on_tg_text/checkpoint-2000/config.json
Model config BertConfig {
  "_name_or_path": "/home/aobuhtijarov/models/rubert_cased_L-12_H-768_A-12_pt/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 119547
}

loading weights file /home

In [14]:
# Optimisation schedule.
lr = config["learning_rate"]
warmup_steps = config["num_warmup_steps"]
max_steps = config["max_steps"]

# Batching: examples per device per step and steps accumulated per update.
batch_size = config["batch_size"]
gradient_accumulation_steps = config["gradient_accumulation_steps"]

# How often (in optimisation steps) to log, evaluate and checkpoint.
logging_steps = config["logging_steps"]
eval_steps = config["eval_steps"]
save_steps = config["save_steps"]

In [15]:
training_args = TrainingArguments(
    # Where checkpoints go; an existing directory is not overwritten.
    output_dir=output_model_path,
    overwrite_output_dir=False,
    save_steps=save_steps,
    save_total_limit=1,
    # Train and evaluate, evaluating every `eval_steps` steps.
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    # Batching: effective train batch = batch_size * gradient_accumulation_steps.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Optimisation schedule.
    learning_rate=lr,
    weight_decay=0.01,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    # Reporting.
    logging_steps=logging_steps,
    report_to='wandb',
)

# Validation accuracy is computed by `compute_metrics` at every evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning with the arguments configured above (bounded by max_steps).
trainer.train()

***** Running training *****
  Num examples = 141600
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 64
  Total optimization steps = 5000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


In [None]:
# Evaluate on the held-out test split and store the resulting metrics dict
# in the W&B run summary.
wandb.summary.update({
    'Test Evaluation': trainer.evaluate(eval_dataset=test_dataset)
})
# Save the model to the same directory the Trainer used for checkpoints.
# Only the model is saved here; the tokenizer is not written alongside it.
model.save_pretrained(output_model_path)

In [None]:
# Close the W&B run so the summary values above are flushed.
wandb.finish()