In [2]:
# Making imports convenient
import sys
import os
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback
from sklearn.model_selection import ParameterGrid
from src.utils.myutils import *
import yaml
import json
import logging
logging.disable(logging.ERROR)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


model_name = 'ufal/robeczech-base'
CONFIG_PATH = PATH + '/src/utils/config.yaml'
SUBJ_MODEL_PATH = '/home/horyctom/bias-detection-thesis/src/models/trained/subj2_pretrained.pth'

training_args = TrainingArguments(
            output_dir = './',
            num_train_epochs=3,
            save_total_limit=2,
            disable_tqdm=False,
            per_device_train_batch_size=16,  
            warmup_steps=0,
            weight_decay=0.1,
            logging_dir='./',
            learning_rate=2e-5)

BATCH_SIZE = 16
transformers.utils.logging.set_verbosity_error()
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

In [5]:
def compute_metrics_eval(eval_preds):
    metric = load_metric("f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [3]:
data_babe = load_dataset('csv',data_files = PATH + '/data/CS/processed/BABE/train.csv')['train']
data_cwnc = load_dataset('csv',data_files = PATH + '/data/CS/processed/CWNC/train.csv')['train']

with open(CONFIG_PATH) as f:
    config_data = yaml.load(f, Loader=yaml.FullLoader)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model_name = 'ufal/robeczech-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

babe_tok = preprocess_data(data_babe,tokenizer,'sentence')
cwnc_tok = preprocess_data(data_cwnc,tokenizer,'sentence')

# SUBJECTIVE data

In [8]:
subj = load_dataset('csv',data_files=PATH + '/data/CS/raw/SUBJ/subj.csv')['train']
mpqa = load_dataset('csv',data_files=PATH + '/data/CS/raw/MPQA/mpqa.csv')['train']

In [9]:
train = concatenate_datasets([subj,mpqa]).shuffle(seed=42)
subj_tok = preprocess_data(train,tokenizer,'sentence')

  0%|          | 0/26 [00:00<?, ?ba/s]

In [10]:
subj_tok = subj_tok.train_test_split(0.2)

## Pretrain

In [15]:
training_args_pretrain = TrainingArguments(
    num_train_epochs=10,
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=32,
    eval_steps=100,
    logging_steps=100,
    disable_tqdm = False,
    warmup_steps=0,
    save_total_limit=5,
    evaluation_strategy="steps",
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
    weight_decay=0.1,
    output_dir = './',
    learning_rate=4e-5)

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
model.to(device)
trainer = Trainer(model,training_args_pretrain,train_dataset=subj_tok['train'],data_collator=data_collator,tokenizer=tokenizer,eval_dataset=subj_tok['test'],
                          compute_metrics=compute_metrics_eval,callbacks = [EarlyStoppingCallback(early_stopping_patience=2)])
trainer.train()
torch.save(model.state_dict(),SUBJ_MODEL_PATH)

Step,Training Loss,Validation Loss,F1
100,0.4833,0.354484,0.853266
200,0.3712,0.357872,0.877238
300,0.3496,0.340647,0.874462
400,0.3302,0.300629,0.895199
500,0.3247,0.283653,0.893452
600,0.3025,0.300223,0.890129


## Eval on BABE

In [4]:
scores = []
for train_index, val_index in skfold.split(babe_tok['input_ids'],babe_tok['label']):

    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])

    torch.cuda.manual_seed(12345)
    torch.manual_seed(12345)
    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
    model.load_state_dict(torch.load(SUBJ_MODEL_PATH))
    #model.classifier.apply(model._init_weights)

    model.to(device)
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
    trainer.train()

    #evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model,device,eval_dataloader)['f1'])

  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


In [5]:
np.mean(scores)

0.7760311260935445

## Eval on BABE with frozen encoder

In [57]:
scores = []
for train_index, val_index in skfold.split(babe_tok['input_ids'],babe_tok['label']):

    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])


    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
    model.load_state_dict(torch.load(SUBJ_MODEL_PATH))
    #model.classifier.apply(model._init_weights)
    #for name, param in model.named_parameters():
    #    if 'classifier' not in name: # classifier layer
    #        param.requires_grad = False

    model.to(device)
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
    trainer.train()

    #evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model,device,eval_dataloader)['f1'])

  return np.array(array, copy=False, **self.np_array_kwargs)
  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


In [58]:
#not frozen, not initialized
np.mean(scores)

0.7812235897435897

In [56]:
#classifier layer renitialized
np.mean(scores)

0.7799430769230769

In [54]:
#frozen encoder
np.mean(scores)

0.680008717948718

## Eval on CWNC

In [125]:
scores = []
for train_index, val_index in skfold.split(cwnc_tok['input_ids'],cwnc_tok['label']):

    token_train = Dataset.from_dict(cwnc_tok[train_index])
    token_valid = Dataset.from_dict(cwnc_tok[val_index])


    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
    model.load_state_dict(torch.load(SUBJ_MODEL_PATH))
    model.to(device)
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
    trainer.train()

    #evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model,device,eval_dataloader)['f1'])
        


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.4169


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.4184


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.4179


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.417


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.4107


[0.7744897959183674, 0.7785714285714285, 0.7540816326530613, 0.753061224489796, 0.746938775510204]
0.7614285714285713


In [126]:
print(scores)
print(np.mean(scores))

[0.7744897959183674, 0.7785714285714285, 0.7540816326530613, 0.753061224489796, 0.746938775510204]
0.7614285714285713


## Train Together

In [81]:
scores = []
for train_index, val_index in skfold.split(babe_tok['input_ids'],babe_tok['label']):

    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])
    
    token_train = concatenate_datasets([token_train,subj_tok]).shuffle(seed=42)
    

    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
    model.to(device)
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
    trainer.train()

    #evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model,device,eval_dataloader)['f1'])
        
print(scores)
print(np.mean(scores))

  return np.array(array, copy=False, **self.np_array_kwargs)
Loading cached shuffled indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-9874b5367257af41/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-a4fadf39168fd11b.arrow
***** Running training *****
  Num examples = 28296
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5307


Step,Training Loss
500,0.4499
1000,0.3593
1500,0.3284
2000,0.2915
2500,0.2472
3000,0.2464
3500,0.2414
4000,0.1786
4500,0.1857
5000,0.1641


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Model weights saved in ./checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./checkpoint-1500/special_tokens_map.json
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./checkpo

Step,Training Loss
500,0.4325
1000,0.3684
1500,0.3308
2000,0.2866
2500,0.2561
3000,0.2513
3500,0.2409
4000,0.1835
4500,0.1831
5000,0.1647


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
Deleting older checkpoint [checkpoint-4500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Model weights saved in ./checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./ch

Step,Training Loss
500,0.454
1000,0.3564
1500,0.3343
2000,0.2943
2500,0.2513
3000,0.2501
3500,0.235
4000,0.1825
4500,0.1714
5000,0.1772


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
Deleting older checkpoint [checkpoint-4500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Model weights saved in ./checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./ch

Step,Training Loss
500,0.453
1000,0.35
1500,0.3319
2000,0.297
2500,0.2492
3000,0.2478
3500,0.2371
4000,0.1788
4500,0.1713
5000,0.1768


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
Deleting older checkpoint [checkpoint-4500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Model weights saved in ./checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./ch

Step,Training Loss
500,0.4505
1000,0.3524
1500,0.3324
2000,0.2924
2500,0.2556
3000,0.2438
3500,0.2364
4000,0.1791
4500,0.1717
5000,0.1725


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Model weights saved in ./checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoint-500/special_tokens_map.json
Deleting older checkpoint [checkpoint-4500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Model weights saved in ./checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./ch

[0.7456, 0.744, 0.7147435897435899, 0.7259615384615384, 0.7211538461538461]
0.7302917948717949


In [83]:
print(scores)
print(np.mean(scores))

[0.7456, 0.744, 0.7147435897435899, 0.7259615384615384, 0.7211538461538461]
0.7302917948717949
