In [1]:
import os
import numpy as np
import torch
import torch.nn as nn

from models.bert_classifier_module import BertClassifier
from transformers import BertTokenizer, BertForSequenceClassification
import utils.pytorch as pytorch_utils

In [2]:
# Name of the pretrained checkpoint whose vocabulary the tokenizer loads.
PRETRAINED_MODEL_NAME = "dkleczek/bert-base-polish-cased-v1"

# Module-level tokenizer; the DataLoader cell passes it to the collate function.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [3]:
class Dataset(torch.utils.data.Dataset):
    """Text-classification dataset backed by two line-aligned UTF-8 files.

    Line ``i`` of the text file is paired with line ``i`` of the label file.
    Surrounding whitespace is stripped from every line and each label line is
    parsed with ``int``.

    Args:
        texts_path: File with one example text per line. Defaults to
            ``data/training_set_clean_only_text.txt``.
        labels_path: File with one integer label per line. Defaults to
            ``data/training_set_clean_only_tags.txt``.

    Raises:
        ValueError: If a label line cannot be parsed as an integer.
        AssertionError: If the two files have a different number of lines.
    """

    def __init__(self, texts_path=None, labels_path=None):
        # Fall back to the original training-set locations so that a bare
        # ``Dataset()`` call keeps working unchanged.
        if texts_path is None:
            texts_path = os.path.join("data", "training_set_clean_only_text.txt")
        if labels_path is None:
            labels_path = os.path.join("data", "training_set_clean_only_tags.txt")

        # Iterate the file objects directly instead of materialising an extra
        # list of raw lines with readlines().
        with open(texts_path, encoding="utf-8") as f:
            self.texts = [line.strip() for line in f]
        with open(labels_path, encoding="utf-8") as f:
            self.labels = [int(line.strip()) for line in f]

        assert len(self.texts) == len(self.labels), (
            f"{len(self.texts)} texts but {len(self.labels)} labels"
        )

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)
    

In [4]:
class DataLoader(torch.utils.data.DataLoader):
    """Batched loader over the training ``Dataset``.

    Called with no arguments it behaves as before: it builds a fresh
    ``Dataset()``, prints its size and yields shuffled batches of 32 collated
    by ``_collate_fn(tokenizer)``.

    Args:
        dataset: Dataset to iterate; a new ``Dataset()`` is built when ``None``.
        batch_size: Number of examples per batch.
        num_workers: Number of loader worker processes.
        shuffle: Whether to reshuffle the examples at every epoch.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
    """

    def __init__(self, dataset=None, batch_size=32, num_workers=1,
                 shuffle=True, drop_last=False):
        if dataset is None:
            dataset = Dataset()
        print(len(dataset))
        # Zero-argument super() does not look the class up by its global name,
        # so instances created earlier keep working after this cell is re-run
        # and the name ``DataLoader`` is rebound to a new class object.
        super().__init__(
            dataset=dataset,
            # ``_collate_fn`` and ``tokenizer`` are module-level names that are
            # resolved when the loader is instantiated, not when the class is
            # defined.
            collate_fn=_collate_fn(tokenizer),
            batch_size=batch_size,
            num_workers=num_workers,
            shuffle=shuffle,
            drop_last=drop_last,
        )

In [5]:
def _collate_fn(tokenizer):
    def _make_batch(datapoints) -> dict:
        attention_masks = []
        input_ids = []
        labels = []
        for text, label in datapoints:
            encoding = tokenizer.encode_plus(
                  text,
                  add_special_tokens=True,
                  max_length=32,
                  return_token_type_ids=False,
                  padding="max_length",
                  truncation=True,
                  return_attention_mask=True,
                  return_tensors='pt',
                )
            attention_masks.append(encoding["attention_mask"])
            input_ids.append(encoding["input_ids"])
            labels.append(label)
        batch = {
            "input_ids": torch.cat(input_ids, axis=0),
            "attention_masks": torch.cat(attention_masks, axis=0),
            "targets": torch.from_numpy(np.array(labels))
        }
        return batch
    return _make_batch

In [6]:
# Build the training loader; its constructor creates the Dataset and prints
# the number of examples (the value shown in this cell's output).
dataloader = DataLoader()

10041


In [7]:
# Instantiate the project's classifier from models.bert_classifier_module.
# The log printed below shows that construction loads the
# dkleczek/bert-base-polish-cased-v1 checkpoint into a BertModel.
model = BertClassifier()

Some weights of the model checkpoint at dkleczek/bert-base-polish-cased-v1 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Optimizer hyper-parameters forwarded to the project helper in utils.pytorch.
optimizer_config = dict(
    optimizer_name="sgd",
    init_lr=1e-04,
    weight_decay=0,
)
optimizer = pytorch_utils.create_optimizer(
    params=model.parameters(),
    **optimizer_config,
)

# Learning-rate schedule settings.
# NOTE(review): the milestones look like fractions of num_iterations at which
# the learning rate is multiplied by gamma - confirm against utils.pytorch.
scheduler_config = dict(
    num_iterations=30000,
    gamma=1e-1,
    milestones=[0.4, 0.7, 0.9],
)
lr_scheduler = pytorch_utils.create_lr_scheduler(
    optimizer=optimizer,
    **scheduler_config,
)

In [9]:
# Single pass over the training data: forward, cross-entropy loss, backward,
# optimizer step, then one scheduler step per batch.
criterion = nn.CrossEntropyLoss()

# The backbone is loaded through ``from_pretrained`` (see the checkpoint log
# printed when the model was built), which returns modules in evaluation mode
# with dropout disabled. Switch to training mode explicitly before optimizing.
# Assumes ``model`` is an ``nn.Module`` (it already exposes ``parameters()``).
model.train()

for batch in dataloader:
    # Clear gradients left over from the previous step before the new forward
    # and backward pass.
    optimizer.zero_grad()
    logits = model(batch["input_ids"], batch["attention_masks"])
    loss = criterion(logits, batch["targets"])
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    print(loss)

tensor(1.1772, grad_fn=<NllLossBackward>)
tensor(1.1899, grad_fn=<NllLossBackward>)
tensor(1.0946, grad_fn=<NllLossBackward>)
tensor(1.1374, grad_fn=<NllLossBackward>)
tensor(1.0217, grad_fn=<NllLossBackward>)
tensor(0.9903, grad_fn=<NllLossBackward>)
tensor(1.0686, grad_fn=<NllLossBackward>)
tensor(1.0074, grad_fn=<NllLossBackward>)
tensor(1.0067, grad_fn=<NllLossBackward>)
tensor(0.9327, grad_fn=<NllLossBackward>)
tensor(0.9420, grad_fn=<NllLossBackward>)
tensor(0.9055, grad_fn=<NllLossBackward>)
tensor(0.9315, grad_fn=<NllLossBackward>)
tensor(0.8928, grad_fn=<NllLossBackward>)
tensor(0.8612, grad_fn=<NllLossBackward>)
tensor(0.8142, grad_fn=<NllLossBackward>)
tensor(0.7982, grad_fn=<NllLossBackward>)
tensor(0.7386, grad_fn=<NllLossBackward>)
tensor(0.7696, grad_fn=<NllLossBackward>)
tensor(0.8120, grad_fn=<NllLossBackward>)
tensor(0.6870, grad_fn=<NllLossBackward>)
tensor(0.7917, grad_fn=<NllLossBackward>)
tensor(0.6912, grad_fn=<NllLossBackward>)
tensor(0.7192, grad_fn=<NllLossBac

tensor(0.3723, grad_fn=<NllLossBackward>)
tensor(0.3814, grad_fn=<NllLossBackward>)
tensor(0.2629, grad_fn=<NllLossBackward>)
tensor(0.3024, grad_fn=<NllLossBackward>)
tensor(0.3144, grad_fn=<NllLossBackward>)
tensor(0.4525, grad_fn=<NllLossBackward>)
tensor(0.3006, grad_fn=<NllLossBackward>)
tensor(0.2998, grad_fn=<NllLossBackward>)
tensor(0.4685, grad_fn=<NllLossBackward>)
tensor(0.4205, grad_fn=<NllLossBackward>)
tensor(0.3987, grad_fn=<NllLossBackward>)
tensor(0.2178, grad_fn=<NllLossBackward>)
tensor(0.2945, grad_fn=<NllLossBackward>)
tensor(0.1466, grad_fn=<NllLossBackward>)
tensor(0.2347, grad_fn=<NllLossBackward>)
tensor(0.2957, grad_fn=<NllLossBackward>)
tensor(0.4613, grad_fn=<NllLossBackward>)
tensor(0.3639, grad_fn=<NllLossBackward>)
tensor(0.5054, grad_fn=<NllLossBackward>)
tensor(0.2741, grad_fn=<NllLossBackward>)
tensor(0.2166, grad_fn=<NllLossBackward>)
tensor(0.4018, grad_fn=<NllLossBackward>)
tensor(0.2107, grad_fn=<NllLossBackward>)
tensor(0.3629, grad_fn=<NllLossBac