In [1]:
import transformers
import torch

model_str = "ufal/robeczech-base"

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_str)


In [4]:
config = AutoConfig.from_pretrained(model_str,
    num_labels=7,
    output_hidden_states=True
)
model = AutoModelForSequenceClassification.\
    from_pretrained(model_str, config=config).to("cuda:0")

Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: 

In [5]:
from datasets import load_dataset
dataset = load_dataset("pandas", data_files="words_df.pkl")

def tokenize_datapoint(datapoint):
    return tokenizer(datapoint["word"], padding=True)

tokenized_dataset = dataset.map(tokenize_datapoint, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns("word")

tokenized_dataset = tokenized_dataset["train"].train_test_split(train_size=0.8)
tokenized_dataset = tokenized_dataset.rename_column("cases", "labels")

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Using custom data configuration default-300a50ec4e320bb6
Reusing dataset pandas (/home/jacob/.cache/huggingface/datasets/pandas/default-300a50ec4e320bb6/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [19]:
from transformers import TrainingArguments
training_args = TrainingArguments("czech-finetune", num_train_epochs=6)

from transformers import Trainer, PreTrainedModel
import torch
import os

def unwrap_model(model):
    """
    Recursively unwraps a model from potential containers (as used in distributed training).

    Args:
        model (`torch.nn.Module`): The model to unwrap.
    """
    # since there could be multiple levels of wrapping, unwrap recursively
    if hasattr(model, "module"):
        return unwrap_model(model.module)
    else:
        return model

# custom trainer to avoid batch size error
# shoutouts to https://discuss.huggingface.co/t/1653/2
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        labels = inputs['labels'].float()
        loss = torch.nn.BCEWithLogitsLoss()(outputs['logits'], labels)
        return (loss, outputs) if return_outputs else loss
    def _save(self, output_dir=None, state_dict=None):
        # If we are executing this function, we are the process zero, so we don't check for that.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        print(f"Saving model checkpoint to {output_dir}")
        # Save a trained model and configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        self.model.save_pretrained(output_dir, state_dict=state_dict)
        #if self.tokenizer is not None:
        #`   self.tokenizer.save_pretrained(output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))

trainer = CustomTrainer(model, training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train("./czech-finetune/checkpoint-2000")

Loading model from ./czech-finetune/checkpoint-2000).
***** Running training *****
  Num examples = 5509
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4134
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 2000
  Will skip the first 2 epochs then the first 622 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/622 [00:00<?, ?it/s]

Step,Training Loss
2500,0.0747
3000,0.0641


Configuration saved in czech-finetune/checkpoint-2500/config.json


Saving model checkpoint to czech-finetune/checkpoint-2500


Model weights saved in czech-finetune/checkpoint-2500/pytorch_model.bin
Configuration saved in czech-finetune/checkpoint-3000/config.json


Saving model checkpoint to czech-finetune/checkpoint-3000


Model weights saved in czech-finetune/checkpoint-3000/pytorch_model.bin


In [15]:
logits = model(**tokenizer("den", return_tensors="pt").to("cuda:0"))["logits"]
import torch
torch.nn.functional.softmax(logits)

  This is separate from the ipykernel package so we can avoid doing imports until


tensor([[5.9616e-01, 1.1494e-04, 5.1304e-05, 4.0341e-01, 1.3781e-04, 7.5256e-05,
         4.7899e-05]], device='cuda:0', grad_fn=<SoftmaxBackward>)