In [1]:
# Making imports convenient
import sys
import os
# Project root: the part of the working directory that precedes '/notebooks'
# (the whole working directory when that component is absent).
PATH = os.getcwd().partition('/notebooks')[0]
# Position 1 keeps sys.path[0] untouched while still letting `src.*` resolve.
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback
from sklearn.model_selection import ParameterGrid
from src.utils.myutils import *
import yaml
import json

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


model_name = 'ufal/robeczech-base'
CONFIG_PATH = PATH + '/src/utils/config.yaml'
# Built from the project root like CONFIG_PATH, so the notebook does not depend
# on one particular user's home directory.
MB_MODEL_PATH = PATH + '/src/models/trained/mb_pretrained.pth'

# Single source of truth for the batch size used by `training_args` below and
# by the evaluation DataLoaders in the cross-validation cells.
BATCH_SIZE = 16

# Arguments for the per-fold fine-tuning runs.
training_args = TrainingArguments(
    output_dir='./',
    logging_dir='./',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=0,
    save_total_limit=2,
    disable_tqdm=False,
)

In [2]:
def compute_metrics_eval(eval_preds):
    """Score one evaluation pass with the "f1" metric.

    Args:
        eval_preds: a pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call for the argmax
        predictions against ``labels``.
    """
    f1_metric = load_metric("f1")
    raw_logits, gold_labels = eval_preds
    # Predicted class = index of the largest logit along the last axis.
    predicted_classes = np.argmax(raw_logits, axis=-1)
    return f1_metric.compute(predictions=predicted_classes, references=gold_labels)

In [3]:
import logging

# Silence every log record of severity ERROR and below for the whole process,
# which keeps the repeated model-loading messages out of the cell outputs.
logging.disable(level=logging.ERROR)

In [4]:
def _load_train_csv(relative_path):
    """Read the CSV at PATH + relative_path and return its 'train' split."""
    return load_dataset('csv', data_files=PATH + relative_path)['train']


# Processed corpora used for the cross-validated experiments.
data_babe = _load_train_csv('/data/CS/processed/BABE/train.csv')
data_cwnc = _load_train_csv('/data/CS/processed/CWNC/train.csv')

# Raw corpora that are later merged into `mb`.
nfnj = _load_train_csv('/data/CS/raw/NFNJ/nfnj.csv')
ua_crisis = _load_train_csv('/data/CS/raw/UA-crisis/ua-crisis.csv')
basil = _load_train_csv('/data/CS/raw/BASIL/basil.csv')


with open(CONFIG_PATH) as config_file:
    config_data = yaml.load(config_file, Loader=yaml.FullLoader)

# Fixed seed so every experiment below sees the same five folds.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model_name = 'ufal/robeczech-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize the 'sentence' column of both corpora.
babe_tok = preprocess_data(data_babe, tokenizer, 'sentence')
cwnc_tok = preprocess_data(data_cwnc, tokenizer, 'sentence')

### Preprocessing - downsampling

In [5]:
# Pool the three raw corpora (NFNJ, UA-crisis, BASIL) into a single dataset.
mb_sources = [nfnj, ua_crisis, basil]
mb = concatenate_datasets(mb_sources)

In [6]:
# Keep the merged `mb` untouched and give the resampled data its own name:
# with `mb = resample(mb)` every re-run of this cell resampled data that had
# already been resampled.
mb_resampled = resample(mb)
# Tokenize the 'sentence' column of the resampled pool.
mb_tok = preprocess_data(mb_resampled, tokenizer, 'sentence')

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
# Hold out a 0.2 share as the 'test' split (seeded for repeatability).
# NOTE: this rebinds `mb_tok` from a single dataset to the returned container
# of splits, which the pretraining cell indexes with 'train' / 'test'.
mb_tok = mb_tok.train_test_split(test_size=0.2, seed=42)

## Pretrain on the merged NFNJ + UA-crisis + BASIL data

In [28]:
# Arguments for the pretraining run on the merged corpus.
training_args_pretrain = TrainingArguments(
    output_dir='./',
    # optimisation
    num_train_epochs=10,
    learning_rate=4e-5,
    weight_decay=0.2,
    warmup_steps=0,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluate and log every 50 steps
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    disable_tqdm=False,
    # checkpointing: keep at most 5, reload the best one (by F1) when training ends
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

In [29]:
# Pretrain a fresh 2-label classifier on the 'train' split of `mb_tok`,
# evaluating on its 'test' split, then persist the weights for the later cells.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
trainer = Trainer(
    model,
    training_args_pretrain,
    train_dataset=mb_tok['train'],
    eval_dataset=mb_tok['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_eval,
    # Early stopping with a patience of 3 evaluation rounds.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

# Create the target directory first so a missing folder cannot throw away the
# finished training run at the very last step.
checkpoint_dir = os.path.dirname(MB_MODEL_PATH)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)
torch.save(model.state_dict(), MB_MODEL_PATH)

Step,Training Loss,Validation Loss,F1
50,0.6936,0.688426,0.639416
100,0.6936,0.672543,0.659476
150,0.6761,0.614739,0.638116
200,0.6335,0.622369,0.650307
250,0.5743,0.661661,0.691652
300,0.5101,0.634443,0.698361
350,0.4235,0.700325,0.700758
400,0.375,0.721681,0.721934
450,0.3222,0.731672,0.689379
500,0.2051,0.914361,0.710572


## Fine-tune and evaluate on BABE (5-fold cross-validation)

In [5]:
# 5-fold cross-validation on BABE: every fold starts from the weights stored at
# MB_MODEL_PATH, fine-tunes on the training indices and records the 'f1' entry
# computed on the held-out indices.
scores = []
for train_index, val_index in skfold.split(babe_tok['input_ids'], babe_tok['label']):

    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written during a GPU run still loads on a CPU-only machine.
    model.load_state_dict(torch.load(MB_MODEL_PATH, map_location=device))
    model.to(device)
    trainer = Trainer(model, training_args, train_dataset=token_train,
                      data_collator=data_collator, tokenizer=tokenizer)
    trainer.train()

    # evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])

print(scores)
print(np.mean(scores))

  return np.array(array, copy=False, **self.np_array_kwargs)
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


[0.7472, 0.7712, 0.7676282051282052, 0.7612179487179487, 0.7596153846153846]
0.7613723076923077


## Fine-tune and evaluate on CWNC (5-fold cross-validation)

In [110]:
# 5-fold cross-validation on CWNC: every fold starts from the weights stored at
# MB_MODEL_PATH, fine-tunes on the training indices and records the 'f1' entry
# computed on the held-out indices.
scores = []
for train_index, val_index in skfold.split(cwnc_tok['input_ids'], cwnc_tok['label']):

    token_train = Dataset.from_dict(cwnc_tok[train_index])
    token_valid = Dataset.from_dict(cwnc_tok[val_index])

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written during a GPU run still loads on a CPU-only machine.
    model.load_state_dict(torch.load(MB_MODEL_PATH, map_location=device))
    model.to(device)
    trainer = Trainer(model, training_args, train_dataset=token_train,
                      data_collator=data_collator, tokenizer=tokenizer)
    trainer.train()

    # evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])
        


  return np.array(array, copy=False, **self.np_array_kwargs)
  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaF

Step,Training Loss
500,0.4638


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss
500,0.4641


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss
500,0.4693


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss
500,0.4722


  return np.array(array, copy=False, **self.np_array_kwargs)
Some weights of the model checkpoint at ufal/robeczech-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model 

Step,Training Loss
500,0.4667


In [112]:
# Mean of the per-fold 'f1' values currently held in `scores`
# (filled by the CWNC cross-validation loop in the preceding cell).
np.mean(scores)

0.7504081632653061

In [None]:
# Display the current `mb_tok` object for inspection before it is reused below.
mb_tok

## Train on BABE and the merged data together (5-fold cross-validation on BABE)

In [8]:
# Joint training: each BABE training fold is extended with the whole merged
# pool, a fresh model is trained on the mix, and 'f1' is measured on the
# held-out BABE fold only.
scores = []

# `mb_tok` is a single dataset right after tokenization, but a dict of
# 'train' / 'test' splits once the train_test_split cell has run. Merging the
# splits back makes this cell work in either state and always use the full pool.
# The pool does not depend on the fold, so it is built once, outside the loop.
if isinstance(mb_tok, dict):
    mb_pool = concatenate_datasets(list(mb_tok.values()))
else:
    mb_pool = mb_tok

for train_index, val_index in skfold.split(babe_tok['input_ids'], babe_tok['label']):

    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])

    # Shuffle so the two sources are interleaved within the training batches.
    token_train = concatenate_datasets([token_train, mb_pool]).shuffle(seed=42)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.to(device)
    trainer = Trainer(model, training_args, train_dataset=token_train,
                      data_collator=data_collator, tokenizer=tokenizer)
    trainer.train()

    # evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])
    print(scores[-1])
print(scores)
print(np.mean(scores))

  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.6112


0.7744000000000001


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.6679


0.7568


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss
500,0.596


0.7580128205128205


  return np.array(array, copy=False, **self.np_array_kwargs)


Step,Training Loss


KeyboardInterrupt: 