In [None]:
# Make the repository root importable so the `src.*` package used below resolves.
import os
import sys

# Keep everything in the working-directory path that precedes the first '/notebooks'.
PATH, _sep, _rest = os.getcwd().partition('/notebooks')
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback
from sklearn.model_selection import ParameterGrid
from src.utils.myutils import *
import yaml
import json
import logging
import warnings

import numpy as np

# Suppress log records of severity ERROR and below for the whole process.
logging.disable(logging.ERROR)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


model_name = 'ufal/robeczech-base'
CONFIG_PATH = PATH + '/src/utils/config.yaml'
MODELS_PATH = PATH + '/src/models/trained/'

# Single source of truth for the batch size: used for training (below) and
# for the evaluation DataLoaders built in later cells.
BATCH_SIZE = 16

training_args = TrainingArguments(
    output_dir='./',
    num_train_epochs=3,
    save_total_limit=2,
    disable_tqdm=False,
    per_device_train_batch_size=BATCH_SIZE,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./',
    learning_rate=2e-5,
)

# NOTE(review): yaml.load with FullLoader can construct more than plain
# scalars/lists/dicts. The config is a project-local file, so it is kept
# as-is, but yaml.safe_load is the safer choice if the file only holds
# plain data - confirm and switch.
with open(CONFIG_PATH) as f:
    config_data = yaml.load(f, Loader=yaml.FullLoader)

# 5-fold stratified splitter with a fixed seed, shared by the cross-validation cells.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# `np.warnings` was only an alias of the stdlib `warnings` module and no longer
# exists in recent NumPy releases; VisibleDeprecationWarning also moved to
# `np.exceptions`. Look the class up in whichever place this NumPy provides.
_np_exceptions = getattr(np, "exceptions", np)
warnings.filterwarnings('ignore', category=_np_exceptions.VisibleDeprecationWarning)


In [2]:
import logging

# Same call as in the setup cell: drop every log record at ERROR severity or below.
logging.disable(level=logging.ERROR)

In [22]:
# Tokenizer matching the checkpoint in `model_name`; use_fast=False requests
# the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    padding=True,
)

# Batch collator built around that tokenizer; reused by the Trainer and by
# every evaluation DataLoader in this notebook.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# Load the WikiBias CSV (the csv loader exposes it as a single 'train' split).
wikibias_raw = load_dataset('csv', data_files=PATH + '/data/CS/raw/WikiBias/wikibias.csv')['train']

# Shuffle, then hold out 10% for evaluation. The split is seeded as well:
# without a seed the held-out rows differ on every run, so the F1 reported
# below would not be reproducible.
wikibias = wikibias_raw.shuffle(seed=42).train_test_split(test_size=0.1, seed=42)

# Tokenize the 'sentence' column of both splits with the project helper.
wiki_train_tok = preprocess_data(wikibias['train'], tokenizer, 'sentence')
wiki_test_tok = preprocess_data(wikibias['test'], tokenizer, 'sentence')

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## WIKIBIAS

In [10]:
# Seed PyTorch before the model is built: the classification head is not part
# of the pretrained checkpoint and is randomly initialised, so an unseeded run
# gives a different starting point every time. Same seed as the ensemble cells.
torch.manual_seed(12345)
torch.cuda.manual_seed(12345)  # silently ignored when CUDA is unavailable

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Fine-tune on the WikiBias training split.
trainer = Trainer(
    model,
    training_args,
    train_dataset=wiki_train_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

# evaluation
# compute_metrics lives in src.utils.myutils and is not visible here, so put
# the model in eval mode explicitly to make sure dropout is off when scoring.
model.eval()
eval_dataloader = DataLoader(wiki_test_tok, batch_size=BATCH_SIZE, collate_fn=data_collator)


Step,Training Loss
500,0.6731
1000,0.6275
1500,0.5753
2000,0.5035
2500,0.4417
3000,0.3828
3500,0.3264
4000,0.2905
4500,0.2559


0.5317073170731708

In [11]:
# Score the fine-tuned model on the WikiBias held-out split and report its F1.
wiki_metrics = compute_metrics(model, device, eval_dataloader)
print(wiki_metrics['f1'])

0.5317073170731708


## CWNC

In [5]:
# Load the processed CWNC CSV (the csv loader exposes it as a single 'train' split).
cwnc_raw = load_dataset('csv', data_files=PATH + '/data/CS/processed/CWNC/train.csv')['train']

# Shuffle, then hold out 10% for evaluation. The split is seeded too, so the
# same rows end up in the held-out set on every run.
cwnc = cwnc_raw.shuffle(seed=42).train_test_split(test_size=0.1, seed=42)

# Tokenize the 'sentence' column of both splits with the project helper.
cwnc_train_tok = preprocess_data(cwnc['train'], tokenizer, 'sentence')
cwnc_test_tok = preprocess_data(cwnc['test'], tokenizer, 'sentence')

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# Display the tokenized CWNC held-out split (its columns and row count).
cwnc_test_tok

Dataset({
    features: ['attention_mask', 'input_ids', 'label'],
    num_rows: 490
})

In [6]:
# Seed PyTorch before the model is built: the classification head is not part
# of the pretrained checkpoint and is randomly initialised, so an unseeded run
# gives a different starting point every time. Same seed as the ensemble cells.
torch.manual_seed(12345)
torch.cuda.manual_seed(12345)  # silently ignored when CUDA is unavailable

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Fine-tune on the CWNC training split.
trainer = Trainer(
    model,
    training_args,
    train_dataset=cwnc_train_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

# evaluation
# compute_metrics lives in src.utils.myutils and is not visible here, so put
# the model in eval mode explicitly to make sure dropout is off when scoring.
model.eval()
eval_dataloader = DataLoader(cwnc_test_tok, batch_size=BATCH_SIZE, collate_fn=data_collator)
print(compute_metrics(model, device, eval_dataloader)['f1'])

Step,Training Loss
500,0.4907


0.7693877551020408


## CW-hard

In [23]:
# Load the CW-HARD CSV (the csv loader exposes it as a single 'train' split).
cwhard_raw = load_dataset('csv', data_files=PATH + '/data/CS/raw/CW-HARD/cw-hard.csv')['train']

# Shuffle, then hold out 10% for evaluation. The split is seeded too, so the
# same rows end up in the held-out set on every run.
cwhard = cwhard_raw.shuffle(seed=42).train_test_split(test_size=0.1, seed=42)

# Tokenize the 'sentence' column of both splits with the project helper.
cwhard_train_tok = preprocess_data(cwhard['train'], tokenizer, 'sentence')
cwhard_test_tok = preprocess_data(cwhard['test'], tokenizer, 'sentence')

Using custom data configuration default-6a15a08a8984c225
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-6a15a08a8984c225/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
# Seed PyTorch before the model is built: the classification head is not part
# of the pretrained checkpoint and is randomly initialised, so an unseeded run
# gives a different starting point every time. Same seed as the ensemble cells.
torch.manual_seed(12345)
torch.cuda.manual_seed(12345)  # silently ignored when CUDA is unavailable

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Fine-tune on the CW-HARD training split.
trainer = Trainer(
    model,
    training_args,
    train_dataset=cwhard_train_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

# evaluation
# compute_metrics lives in src.utils.myutils and is not visible here, so put
# the model in eval mode explicitly to make sure dropout is off when scoring.
model.eval()
eval_dataloader = DataLoader(cwhard_test_tok, batch_size=BATCH_SIZE, collate_fn=data_collator)
print(compute_metrics(model, device, eval_dataloader)['f1'])

Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: 

Step,Training Loss
500,0.5289


0.7252525252525251


## Ensemble CWNC and BABE

In [3]:
# Checkpoint of the model pretrained on the Czech WNC data. Built from
# MODELS_PATH (defined in the setup cell) instead of an absolute home-directory
# path, so the notebook also runs from other checkouts of the repository.
WNC_PATH = MODELS_PATH + 'wnc_cs_pretrained.pth'

In [4]:
# Re-create the tokenizer and padding collator for the ensemble section
# (identical to the pair built near the top of the notebook).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    padding=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
# Read the processed BABE training CSV; the csv loader exposes it as 'train'.
babe_csv = PATH + '/data/CS/processed/BABE/train.csv'
data_babe = load_dataset('csv', data_files=babe_csv)['train']

# Tokenize the 'sentence' column with the project helper.
babe_tok = preprocess_data(data_babe, tokenizer, 'sentence')

Using custom data configuration default-f28f8af5b44ab214
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-f28f8af5b44ab214/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-f28f8af5b44ab214/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-c908e7b4b76df975.arrow


In [None]:
def run_ensemble_cv(dataset, wnc_checkpoint, seed=12345):
    """Cross-validate an ensemble of a fold-trained model and a fixed WNC model.

    For every fold produced by the notebook-level ``skfold`` splitter, a fresh
    ``model_name`` classifier is fine-tuned on the training part of ``dataset``.
    On the validation part, its logits are averaged with the logits of a model
    restored from ``wnc_checkpoint``, and the macro F1 of the averaged
    prediction is recorded.

    This cell only defines the function. Example call:
    ``scores = run_ensemble_cv(babe_tok, WNC_PATH)``.

    Args:
        dataset: tokenized dataset providing 'input_ids' and 'label' columns
            and supporting indexing by an array of row indices.
        wnc_checkpoint: path to a state dict compatible with the
            ``model_name`` sequence-classification model.
        seed: value used to seed PyTorch before each fold's model is built.

    Returns:
        List with one macro-F1 score per fold.
    """
    # The WNC model is never trained here, so it is loaded once instead of
    # once per fold. map_location lets a GPU-saved checkpoint load on CPU.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    wnc_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    wnc_model.load_state_dict(torch.load(wnc_checkpoint, map_location=device))
    wnc_model.to(device)
    wnc_model.eval()

    fold_scores = []
    for train_index, val_index in skfold.split(dataset['input_ids'], dataset['label']):
        token_train = Dataset.from_dict(dataset[train_index])
        token_valid = Dataset.from_dict(dataset[val_index])

        # Same starting point for the randomly initialised head in every fold.
        torch.cuda.manual_seed(seed)
        torch.manual_seed(seed)
        fold_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        fold_model.to(device)

        # Fine-tune on this fold's training part.
        trainer = Trainer(
            fold_model,
            training_args,
            train_dataset=token_train,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        trainer.train()

        # evaluation
        testing_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        metric = load_metric("f1")

        fold_model.eval()
        for batch in testing_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                fold_logits = fold_model(**batch).logits
                wnc_logits = wnc_model(**batch).logits

            # Ensemble by averaging the two models' logits.
            predictions = torch.argmax((fold_logits + wnc_logits) / 2, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

        fold_scores.append(metric.compute(average='macro')['f1'])

    return fold_scores

In [52]:
# Fixed ensemble member: the model_name classifier with the WNC-pretrained
# weights. map_location makes a GPU-saved checkpoint loadable when `device`
# has fallen back to the CPU.
# NOTE: torch.load unpickles the file - only load trusted checkpoints.
model2 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model2.load_state_dict(torch.load(WNC_PATH, map_location=device))
model2.to(device)

# One macro-F1 value per cross-validation fold.
scores = []
for train_index, val_index in skfold.split(babe_tok['input_ids'], babe_tok['label']):
    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])

    # Same starting point for the randomly initialised head in every fold.
    torch.cuda.manual_seed(12345)
    torch.manual_seed(12345)
    model1 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    model1.to(device)

    # Fine-tune the second ensemble member on this fold's BABE training part.
    trainer = Trainer(
        model1,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    #evaluation
    testing_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)

    metric = load_metric("f1")

    model1.eval()
    model2.eval()
    for batch in testing_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs1 = model1(**batch)
            outputs2 = model2(**batch)

        # Ensemble by averaging the two models' logits before the argmax.
        logits1 = outputs1.logits
        logits2 = outputs2.logits
        logits = (logits1 + logits2) / 2
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    scores.append(metric.compute(average='macro')['f1'])
    print(scores[-1])

Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: 

Step,Training Loss


0.7882791327913278


Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: 

Step,Training Loss


0.7733766050079591


Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ufal/robeczech-base and are newly initialized: 

Step,Training Loss


KeyboardInterrupt: 