In [1]:
# Making imports convenient
import sys
import os
# Project root: the part of the working directory that precedes '/notebooks'
# (the whole working directory when that segment does not occur).
PATH = os.getcwd().partition('/notebooks')[0]
# Register the root right behind sys.path[0] so the `src` package imported below resolves.
sys.path.insert(1, PATH)

import torch
from datasets import load_dataset, Dataset
import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer

from src.utils.myutils import *


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained checkpoint and per-device batch size used by the training cells.
model_checkpoint = 'ufal/robeczech-base'
BATCH_SIZE = 32

# Directory with the WIKI-CS train/test CSV files, relative to the project root.
DATA_PATH = PATH + '/data/CS/processed/WIKI-CS'

# Restrict transformers logging to error-level messages.
transformers.utils.logging.set_verbosity_error()


## Data

In [2]:
def _load_csv(csv_path):
    """Read a single CSV file with `load_dataset` and return its 'train' split."""
    return load_dataset('csv', data_files=csv_path)['train']


# WIKI-CS train/test files plus the Czech BABE file.
train = _load_csv(DATA_PATH + "/train.csv")
babe_cs = _load_csv(PATH + '/data/CS/processed/BABE/babe_sg2_cs.csv')
test = _load_csv(DATA_PATH + "/test.csv")

Using custom data configuration default-56c437b671b1e68a
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-56c437b671b1e68a/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Using custom data configuration default-a2670560b441305b
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-a2670560b441305b/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Using custom data configuration default-8ec62ab79f9b4092
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-8ec62ab79f9b4092/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [33]:
# Display the first training sentence, capped at 1100 characters.
train[0]['sentence'][:1100]

'například tehdejší ministr vladimír mlynář ač původem převážně žid na pohřbu biháriové pronášel že se stydí za to že je čech a aniž by soud o případu rozhodl požadoval pro podezřelé co nejpřísnější tresty'

In [35]:
# Keep at most the first 500 characters of every training sentence.
trun_sents = [text[:500] for text in train['sentence']]

In [43]:
# Pair the shortened sentences with the original training labels in a new dataset.
truncated_columns = {
    'sentence': trun_sents,
    'label': train['label'],
}
data = Dataset.from_dict(truncated_columns)

In [5]:
# Display one BABE record (index 5) to inspect its fields.
babe_cs[5]

{'sentence': '[Demokraté využívají] celý svůj arzenál k delegitimizaci Trumpova prezidentství za zjevný zločin vítězství ve volbách v roce 2016.',
 'label': 1}

## Training

In [19]:
# The slow tokenizer is requested on purpose: the fast tokenizer is buggy in RoBERTa models.
# No `padding` argument is passed here: padding is an option of the tokenizer
# *call*, not of `from_pretrained`, where it has no effect. Batches are padded
# by DataCollatorWithPadding at collation time.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)

# Load the checkpoint with a sequence-classification head and move it to the chosen device.
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint)
model.to(device);

loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 51961
}

loading weights file https://huggingface.co/ufal/robeczech-base/resolve/m

In [8]:
def tokenize(data):
    """Tokenize the 'sentence' field of `data` with the notebook-level `tokenizer`.

    Truncation is enabled (`truncation=True`); no padding option is passed, so
    padding is left to whatever collates the batches.

    Args:
        data: a mapping with a 'sentence' entry (a single example or a batch).

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    return tokenizer(data['sentence'], truncation=True)

In [9]:
# Tokenize BABE in batches, then drop the raw text column.
tokenized_data = (
    babe_cs
    .map(tokenize, batched=True)
    .remove_columns(['sentence'])
)
# Switch the dataset's output format to "torch".
tokenized_data.set_format("torch")

# Collator built from the tokenizer; it is handed to the Trainer cells.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [10]:
# Settings shared by every Trainer run in this notebook.
trainer_settings = {
    'output_dir': '../',
    'num_train_epochs': 10,
    'per_device_train_batch_size': BATCH_SIZE,
    'logging_steps': 25,
    'disable_tqdm': False,
    'save_total_limit': 2,
    'learning_rate': 5e-5,
}
training_args = TrainingArguments(**trainer_settings)

In [15]:
# Run the same preprocessing over the three datasets, in the order train, test, BABE.
train_tokenized, test_tokenized, babe_tokenized = [
    preprocess_data(split, tokenizer, 'sentence')
    for split in (train, test, babe_cs)
]

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [12]:
# Inspect one tokenized BABE example (index 2).
tokenized_data[2]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'input_ids': tensor([    0, 37832,   280,   113, 27709,   214,    38, 41785,     6, 16426,
         43225,    96,    90,   264,   523,    68,  2773, 29509,    24,  9585,
           116,     6, 18250,    31,  4380,   482,     9, 13014, 11055,     8,
          4902,    28, 25808,   160,   540, 29833,  7618,     6,  1922,  2824,
           130, 10386,  7618,  1696,   229,    31,     2]),
 'label': tensor(1)}

In [10]:
# Collator built from the tokenizer (the same construction also appears in the BABE tokenization cell).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# Reload the checkpoint and fine-tune it on the tokenized BABE data.
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3673
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1150


Step,Training Loss
25,0.6409
50,0.5222
75,0.5323
100,0.5146
125,0.4807
150,0.4125
175,0.4282
200,0.4072
225,0.4167
250,0.3


KeyboardInterrupt: 

In [20]:
# Fine-tune the current `model` on the preprocessed WIKI-CS training split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 12368
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3870


Step,Training Loss
25,0.6997
50,0.7028
75,0.7012
100,0.6963
125,0.7128


RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 31.75 GiB total capacity; 22.71 GiB already allocated; 29.00 MiB free; 23.37 GiB reserved in total by PyTorch)

In [21]:
# Invoked right after the training run that ended with "CUDA out of memory".
# NOTE(review): clean_memory is not defined in this notebook; presumably it comes
# from the `src.utils.myutils` star import — confirm what it actually releases.
clean_memory()