In [9]:
import os
import numpy as np
import torch
import torch.nn as nn

from pipeline.bert_classifier_module import BertHateClassifier
from transformers import BertTokenizer, BertForSequenceClassification
import utils.pytorch as pytorch_utils

In [10]:
from utils.helpers import broadcast_list_to_type

In [11]:
# Quick check of the helper: convert a list of numeric strings to ints.
# The cell output further down shows the result is [3, 3].
a = ["3", "3"]
a = broadcast_list_to_type(a, int)

In [12]:
# Display the converted list as this cell's output.
a

[3, 3]

In [13]:
# Load the pretrained tokenizer for the Polish cased BERT checkpoint.
# It is read as a module-level global by the DataLoader subclass defined later.
tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-cased-v1")

In [18]:
class HateSpeechDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each training text with its integer label.

    Texts and labels are read from two parallel UTF-8 files, one example per
    line; line ``i`` of the label file is the label of line ``i`` of the text
    file. Surrounding whitespace is stripped from every line.

    Args:
        texts_path: Path to the text file. Defaults to
            ``datafiles/training_set_clean_only_text.txt``.
        labels_path: Path to the label file (one integer per line). Defaults
            to ``datafiles/training_set_clean_only_tags.txt``.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a label line is not a valid integer.
        AssertionError: If the two files have a different number of lines.
    """

    def __init__(self, texts_path=None, labels_path=None):
        if texts_path is None:
            texts_path = os.path.join("datafiles", "training_set_clean_only_text.txt")
        if labels_path is None:
            labels_path = os.path.join("datafiles", "training_set_clean_only_tags.txt")

        with open(texts_path, encoding="utf-8") as f:
            self.texts = [line.strip() for line in f]
        with open(labels_path, encoding="utf-8") as f:
            self.labels = [int(line.strip()) for line in f]

        # The files are aligned by line number, so a length mismatch means the
        # text/label pairing cannot be trusted.
        assert len(self.texts) == len(self.labels), (
            f"text/label count mismatch: {len(self.texts)} != {len(self.labels)}"
        )

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)
    

In [19]:
class DataLoader(torch.utils.data.DataLoader):
    """DataLoader preconfigured for the hate-speech training set.

    Batches are assembled by ``_collate_fn`` using the module-level
    ``tokenizer``. Called without arguments it behaves as before: a fresh
    ``HateSpeechDataset``, shuffled batches of 32, one worker process, and
    the last partial batch kept.

    Args:
        dataset: Dataset to iterate over. A new ``HateSpeechDataset`` is
            created when omitted.
        batch_size: Number of examples per batch.
        num_workers: Number of loader worker processes. Pass 0 to load in the
            main process (useful where the collate closure cannot be sent to
            worker processes).
        shuffle: Whether to reshuffle the data every epoch.
        drop_last: Whether to drop the final incomplete batch.
    """

    def __init__(self, dataset=None, batch_size=32, num_workers=1,
                 shuffle=True, drop_last=False):
        if dataset is None:
            dataset = HateSpeechDataset()
        # Report the dataset size as a quick sanity check on the loaded files.
        print(len(dataset))
        super(DataLoader, self).__init__(
            dataset=dataset,
            collate_fn=_collate_fn(tokenizer),
            batch_size=batch_size,
            num_workers=num_workers,
            shuffle=shuffle,
            drop_last=drop_last,
        )

In [20]:
def _collate_fn(tokenizer):
    """Build a collate function that tokenizes ``(text, label)`` pairs.

    Args:
        tokenizer: Object exposing ``encode_plus`` in the style of the
            HuggingFace tokenizers; with ``return_tensors='pt'`` its result
            must provide ``"input_ids"`` and ``"attention_mask"`` tensors.

    Returns:
        A callable taking an iterable of ``(text, label)`` pairs and returning
        a dict with keys ``"input_ids"``, ``"attention_masks"`` and
        ``"targets"``.
    """
    def _make_batch(datapoints) -> dict:
        attention_masks = []
        input_ids = []
        labels = []
        for text, label in datapoints:
            # Every text is padded/truncated to 512 tokens so the per-example
            # tensors can be concatenated along the batch dimension.
            encoding = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=512,
                return_token_type_ids=False,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            attention_masks.append(encoding["attention_mask"])
            input_ids.append(encoding["input_ids"])
            labels.append(label)
        return {
            "input_ids": torch.cat(input_ids, dim=0),
            "attention_masks": torch.cat(attention_masks, dim=0),
            # Force int64: class-index targets for CrossEntropyLoss must be
            # long, and going through NumPy's default int dtype can yield
            # int32 on some platforms.
            "targets": torch.tensor(labels, dtype=torch.long),
        }
    return _make_batch

In [21]:
# Build the training loader; its constructor prints the dataset size
# (the number shown in this cell's output).
dataloader = DataLoader()

10041


In [22]:
# Instantiate the project's classifier. Per the warning printed below, this
# loads dkleczek/bert-base-polish-cased-v1 weights into a BertModel and drops
# the checkpoint's pretraining-head weights.
model = BertHateClassifier()

Some weights of the model checkpoint at dkleczek/bert-base-polish-cased-v1 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# Optimizer over every model parameter, requested by name from the project
# helper, with no weight decay.
optimizer = pytorch_utils.create_optimizer(
    params=model.parameters(),
    optimizer_name="sgd",
    init_lr=1e-04,
    weight_decay=0,
)

# NOTE(review): the milestones look like fractions of num_iterations at which
# the learning rate is scaled by gamma — confirm against
# utils.pytorch.create_lr_scheduler.
lr_scheduler = pytorch_utils.create_lr_scheduler(
    optimizer=optimizer,
    num_iterations=30000,
    milestones=[0.4, 0.7, 0.9],
    gamma=1e-1,
)

In [24]:
# Encode one sample sentence with the same settings the collate function uses.
# NOTE(review): `encoding` is not read by any later visible cell — looks like
# a leftover inspection cell.
encoding = tokenizer.encode_plus(
    "Jedziemy po zioło",
    add_special_tokens=True,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt',
)

In [None]:
# One pass over the training data with cross-entropy loss.
criterion = nn.CrossEntropyLoss()

# Pretrained HuggingFace weights are loaded in evaluation mode, which disables
# dropout; switch to training mode before optimizing.
# Assumes BertHateClassifier is an nn.Module (it exposes .parameters()).
model.train()

for batch in dataloader:
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    logits = model(batch["input_ids"], batch["attention_masks"])
    loss = criterion(logits, batch["targets"])
    loss.backward()
    optimizer.step()
    # The scheduler is stepped once per batch, not once per epoch.
    lr_scheduler.step()
    # Print the scalar value instead of the tensor repr (which includes grad_fn).
    print(loss.item())