In [1]:
# Make the project root importable so that `src.*` modules resolve from this notebook.
import sys
import os
# Project root = everything before the first '/notebooks' in the current working directory
# (if that substring is absent, the whole working directory is used).
PATH=os.getcwd().split('/notebooks')[0]
# Index 1 leaves sys.path[0] untouched while still putting PATH ahead of the remaining entries.
sys.path.insert(1, PATH)

import json
import logging

import numpy as np
import torch
import torch.nn.functional as F
import transformers
import yaml
from datasets import load_dataset, Dataset,concatenate_datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback

from src.utils.myutils import *
logging.disable(logging.INFO)

# ---- Reproducibility --------------------------------------------------------
# One seed shared by the framework RNGs, the Trainer and the CV splitter.
SEED = 42
# Seed the RNGs up front so that the randomly initialised classification head
# of the first model (created before any Trainer exists) is reproducible on a
# fresh kernel.
transformers.set_seed(SEED)

# ---- Runtime ----------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Model / paths ----------------------------------------------------------
model_name = 'ufal/robeczech-base'
CONFIG_PATH = PATH + '/src/utils/config.yaml'
MODELS_PATH = PATH + '/src/models/trained/'

# Batch size used both for training (via TrainingArguments) and for the
# evaluation DataLoaders built later in the notebook.
BATCH_SIZE = 16

# Arguments for fine-tuning runs without intermediate evaluation.
training_args = TrainingArguments(
    output_dir='./',
    num_train_epochs=3,
    save_total_limit=2,
    disable_tqdm=False,
    per_device_train_batch_size=BATCH_SIZE,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./',
    learning_rate=2e-5,
    seed=SEED)

# NOTE(review): FullLoader is kept as in the original; the config file is
# project-local. Switch to yaml.safe_load if the file only holds plain data.
with open(CONFIG_PATH) as f:
    config_data = yaml.load(f, Loader=yaml.FullLoader)

# Stratified 5-fold splitter shared by every cross-validation run below.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

In [2]:
def eval_on_target(target,pretrained_name):
    """Cross-validate a pretrained checkpoint on `target` and return per-fold F1.

    For every split produced by the notebook-level `skfold`, a fresh
    `model_name` classifier is created, the weights stored in
    `MODELS_PATH + pretrained_name + '-pretrain.pth'` are loaded into it, its
    classification head is passed through `model._init_weights` again, and the
    model is trained on the training part of the split with the notebook-level
    `training_args`, `data_collator` and `tokenizer`.

    Args:
        target: tokenized dataset with 'input_ids' and 'label' columns that
            can be indexed with an array of row indices.
        pretrained_name: checkpoint prefix, e.g. 'wiki-all'.

    Returns:
        list: the 'f1' value reported by `compute_metrics` for each fold, in
        split order.
    """
    # Read the checkpoint once instead of once per fold. Mapping to CPU lets a
    # checkpoint saved from a GPU load on a CPU-only machine as well; the model
    # is moved to `device` only after the weights are in place.
    pretrained_state = torch.load(MODELS_PATH + pretrained_name + '-pretrain.pth',
                                  map_location='cpu')

    scores = []
    for train_index, val_index in skfold.split(target['input_ids'],target['label']):

        token_train = Dataset.from_dict(target[train_index])
        token_valid = Dataset.from_dict(target[val_index])

        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
        # load_state_dict copies the values into the model's own parameters,
        # so the same state dict can be reused for every fold.
        model.load_state_dict(pretrained_state)
        # Keep the pretrained encoder weights but re-run weight initialisation
        # on the classification head only.
        model.classifier.apply(model._init_weights)
        model.to(device)
        trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
        trainer.train()

        # evaluation on the held-out fold
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model,device,eval_dataloader)['f1'])

    return scores

## Load Data

In [2]:
# CSV file of every corpus, keyed by the short name used throughout the notebook.
csv_files = {
    'babe': PATH + '/data/CS/processed/BABE/train.csv',
    'cwnc': PATH + '/data/CS/processed/CWNC/train.csv',
    'cw_hard': PATH + '/data/CS/raw/CW-HARD/cw-hard.csv',
    'wikibias': PATH + '/data/CS/raw/WikiBias/wikibias.csv',
}

# Each CSV is loaded on its own; the rows are taken from the 'train' split of the result.
loaded = {
    name: load_dataset('csv', data_files=csv_path)['train']
    for name, csv_path in csv_files.items()
}
babe = loaded['babe']
cwnc = loaded['cwnc']
cw_hard = loaded['cw_hard']
wikibias = loaded['wikibias']

# Parallel lists: dataset objects and their names, in the same order.
datasets_str = list(csv_files)
datasets = [babe, cwnc, cw_hard, wikibias]

Using custom data configuration default-f28f8af5b44ab214
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-f28f8af5b44ab214/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Using custom data configuration default-22b74deea6a13920
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-22b74deea6a13920/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Using custom data configuration default-6a15a08a8984c225
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-6a15a08a8984c225/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Using custom data configuration default-ee602d71f2907259
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-ee602d71f2907259/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [9]:
# Pool the CWNC, CW-HARD and WikiBias sets into one dataset, then shuffle it with a fixed seed.
wiki_sets = concatenate_datasets([cwnc, cw_hard, wikibias])
wiki_sets = wiki_sets.shuffle(seed=42)

In [41]:
# NOTE(review): `resample` is not defined in this notebook (presumably it comes from the
# star import of src.utils.myutils), so what it does to the data is not visible here.
# The result overwrites `wiki_sets`: re-running this cell feeds already-resampled data back in.
wiki_sets = resample(wiki_sets)

Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-22b74deea6a13920/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-56355b636f669461.arrow
Loading cached shuffled indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-22b74deea6a13920/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-6fd2d253d2337c37.arrow
Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-22b74deea6a13920/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-c57876c030be2602.arrow
Loading cached shuffled indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-22b74deea6a13920/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-ccdc86f930cee896.arrow
Loading cached shuffled indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-22b74deea6a13920/0.0.0/9144e0a4e8435090117ce

## Pretrain WIKI

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Both tokenized sets are read further down: `wiki_token` by the pretraining
# cells and their train/test split, `babe_tok` by the cross-validation on BABE.
# `wiki_token` must be defined here, otherwise those cells raise NameError on
# a fresh kernel.
wiki_token = preprocess_data(wiki_sets,tokenizer,'sentence')
babe_tok = preprocess_data(babe,tokenizer,'sentence')

Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-f28f8af5b44ab214/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-74b152a0ca0f6626.arrow


In [None]:
# Pretrain on the whole tokenized wiki set (no held-out split) and store the weights.
# NOTE(review): this cell has no execution count and writes the same file as the
# pretraining run with evaluation further down, which overwrites it when both run.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=wiki_token,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

wiki_weights = model.state_dict()
torch.save(wiki_weights, MODELS_PATH +'wiki-all-pretrain.pth')

In [46]:
# Arguments for the pretraining run that is evaluated while it trains.
training_args_pretrain = TrainingArguments(
    # output location and checkpoint retention
    output_dir='./',
    save_total_limit=2,
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=0,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and log every 100 steps
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    disable_tqdm=False,
    # model selection: reload the best checkpoint at the end, ranked by the 'f1' metric
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

In [44]:
# Hold out 20% of the tokenized wiki data for evaluation during pretraining.
# The seed fixes the split so that reruns evaluate on the same held-out rows.
wiki_tok = wiki_token.train_test_split(0.2, seed=42)

In [47]:
# Pretrain on the wiki training split, evaluating on the held-out split as
# configured in `training_args_pretrain`, then store the resulting weights.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
trainer = Trainer(
    model=model,
    args=training_args_pretrain,
    train_dataset=wiki_tok['train'],
    eval_dataset=wiki_tok['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_eval,
)
trainer.train()
torch.save(model.state_dict(), MODELS_PATH +'wiki-all-pretrain.pth')

Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: 

Step,Training Loss,Validation Loss,F1
100,0.6929,0.702044,0.671603
200,0.6833,0.704521,0.683694
300,0.6412,0.623977,0.688553
400,0.606,0.592849,0.676387
500,0.6008,0.593543,0.714286
600,0.5956,0.583316,0.669067
700,0.5699,0.575516,0.68277
800,0.563,0.59395,0.699694
900,0.5387,0.5714,0.668653
1000,0.5431,0.585301,0.669774


In [52]:
# F1 of the current `model` on the held-out wiki split.
eval_dataloader = DataLoader(wiki_tok['test'], collate_fn=data_collator, batch_size=BATCH_SIZE)
wiki_metrics = compute_metrics(model, device, eval_dataloader)
wiki_metrics['f1']

0.3939688715953308

In [51]:
# Re-run weight initialisation on the classification head only; the rest of the model is not touched.
# NOTE(review): the recorded execution counts show this cell (In [51]) ran before the
# evaluation cell above (In [52]), so the F1 stored there was presumably measured on the
# re-initialised head — confirm before relying on that number.
model.classifier.apply(model._init_weights)

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

## Eval on BABE

In [54]:
# Keep the per-fold F1 scores in a variable instead of relying on the cell's
# output, so they never have to be copied by hand; the bare name still displays them.
scores = eval_on_target(babe_tok,'wiki-all')
scores

  return np.array(array, copy=False, **self.np_array_kwargs)
  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaF

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


[0.7840000000000001,
 0.7424,
 0.7612179487179487,
 0.7836538461538463,
 0.7724358974358975]

In [55]:
# Per-fold F1 of the `eval_on_target(babe_tok,'wiki-all')` run, pasted from its output.
scores = [
    0.7840000000000001,  # fold 1
    0.7424,              # fold 2
    0.7612179487179487,  # fold 3
    0.7836538461538463,  # fold 4
    0.7724358974358975,  # fold 5
]

In [56]:
# Mean F1 over the five fold scores listed above.
np.mean(scores)

0.7687415384615385

In [58]:
# Rebuild the classifier and restore the weights saved by the wiki pretraining run.
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
# Head re-initialisation is left disabled in this cell:
#model.classifier.apply(model._init_weights)
# map_location='cpu': the freshly built model lives on the CPU, and a checkpoint
# saved from a GPU would otherwise fail to load on a CPU-only machine.
model.load_state_dict(torch.load(MODELS_PATH + 'wiki-all-pretrain.pth', map_location='cpu'))

Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: 

<All keys matched successfully>

## Combine BABE with WikiBias

In [9]:
# NOTE(review): `resample` and `preprocess_data` are not defined in this notebook
# (presumably they come from the star import of src.utils.myutils).
# `wikibias` is overwritten with the resampled data, so re-running this cell resamples again.
wikibias = resample(wikibias)
# Tokenized counterpart of the resampled set; 'sentence' is presumably the text column — TODO confirm.
wikibias_tok = preprocess_data(wikibias,tokenizer,'sentence')

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [4]:
# Stratified cross-validation on BABE starting from the plain `model_name`
# checkpoint (no pretrained wiki weights are loaded in this loop).
scores = []
babe_splits = skfold.split(babe_tok['input_ids'], babe_tok['label'])
for train_index, val_index in babe_splits:
    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])

    # Augmenting the training fold with WikiBias is currently switched off:
    #token_train = concatenate_datasets([wikibias_tok,token_train]).shuffle(seed=42)

    # Fresh model per fold, trained on the training part of the split only.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Score the held-out part of the split and record its F1.
    eval_dataloader = DataLoader(token_valid, collate_fn=data_collator, batch_size=BATCH_SIZE)
    fold_metrics = compute_metrics(model, device, eval_dataloader)
    scores.append(fold_metrics['f1'])

  return np.array(array, copy=False, **self.np_array_kwargs)
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificati

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss


In [5]:
# Mean F1 over the per-fold scores collected in `scores` by the cross-validation loop above.
np.mean(scores)

0.784434358974359