In [9]:
# Making imports convenient
import sys
import os
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1, PATH)

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback
from sklearn.model_selection import ParameterGrid
from src.utils.myutils import *
import yaml
import json
import logging
from tqdm import tqdm
import random

logging.disable(logging.ERROR)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


model_name = 'ufal/robeczech-base'
CONFIG_PATH = PATH + '/src/utils/config.yaml'
WNC_MODEL_PATH = '/home/horyctom/bias-detection-thesis/src/models/trained/wncs_pretrained.pth'

BATCH_SIZE = 16
training_args = TrainingArguments(
            output_dir = './',
            num_train_epochs=3,
            save_total_limit=2,
            disable_tqdm=False,
            per_device_train_batch_size=BATCH_SIZE,  
            warmup_steps=0,
            weight_decay=0.1,
            logging_dir='./',
            learning_rate=2e-5)

In [8]:
def eval_babe():
    scores = []
    for train_index, val_index in skfold.split(babe_tok['input_ids'],babe_tok['label']):

        token_train = Dataset.from_dict(babe_tok[train_index])
        token_valid = Dataset.from_dict(babe_tok[val_index])

        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
        model.load_state_dict(torch.load(WNC_MODEL_PATH))
        model.to(device)
        trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
        trainer.train()

        #evaluation
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model,device,eval_dataloader)['f1'])
    return np.mean(scores)

## Data

In [3]:
data_babe = load_dataset('csv',data_files = PATH + '/data/CS/processed/BABE/train.csv')['train']

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

babe_tok = preprocess_data(data_babe,tokenizer,'sentence')

# Experiment with 5-folds, two random seeds

In [11]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [65]:
np.random.seed(321)
torch.manual_seed(321)   
random.seed(321) 
torch.cuda.manual_seed_all(321)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [21]:
eval_babe()

Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


0.77917906249773

In [42]:
print("Final score:",0.77917906249773)

Final score: 0.77917906249773


In [90]:
np.random.seed(54321)
torch.manual_seed(54321)   
random.seed(54321) 
torch.cuda.manual_seed_all(54321)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [17]:
eval_babe()

Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


0.7755318533602473

In [43]:
print("Final score:",0.7755318533602473)

Final score: 0.7755318533602473


In [44]:
print("Difference:",round((0.77917906249773-0.7755318533602473)*100,4),"%")

Difference: 0.3647 %


## Experiments with 10-folds, two random seeds

In [24]:
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [25]:
np.random.seed(321)
torch.manual_seed(321)   
random.seed(321) 
torch.cuda.manual_seed_all(321)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [26]:
eval_babe()

Step,Training Loss
500,0.4311


Step,Training Loss
500,0.4576


Step,Training Loss
500,0.4529


Step,Training Loss
500,0.4882


Step,Training Loss
500,0.4538


Step,Training Loss
500,0.4669


Step,Training Loss
500,0.4525


Step,Training Loss
500,0.4511


Step,Training Loss
500,0.4538


Step,Training Loss
500,0.4457


0.7739741423767841

In [34]:
print("Final score:",0.7739741423767841)

Final score: 0.7739741423767841


In [29]:
np.random.seed(54321)
torch.manual_seed(54321)   
random.seed(54321) 
torch.cuda.manual_seed_all(54321)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [30]:
eval_babe()

Step,Training Loss
500,0.4467


Step,Training Loss
500,0.4576


Step,Training Loss
500,0.4529


Step,Training Loss
500,0.4882


Step,Training Loss
500,0.4538


Step,Training Loss
500,0.4669


Step,Training Loss
500,0.4525


Step,Training Loss
500,0.4511


Step,Training Loss
500,0.4538


Step,Training Loss
500,0.4457


0.7742732632195288

In [35]:
print("Final score:",0.7742732632195288)

Final score: 0.7742732632195288


In [41]:
print("Difference:",round((0.7742732632195288-0.7739741423767841)*100,4),"%")

Difference: 0.0299 %


# Experiments with 20-fold, two random seeds

In [45]:
skfold = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

In [46]:
np.random.seed(321)
torch.manual_seed(321)   
random.seed(321) 
torch.cuda.manual_seed_all(321)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [47]:
eval_babe()

Step,Training Loss
500,0.4563


Step,Training Loss
500,0.4739


Step,Training Loss
500,0.4611


Step,Training Loss
500,0.454


Step,Training Loss
500,0.4564


Step,Training Loss
500,0.4634


Step,Training Loss
500,0.4399


Step,Training Loss
500,0.4514


Step,Training Loss
500,0.4519


Step,Training Loss
500,0.4523


Step,Training Loss
500,0.453


Step,Training Loss
500,0.453


Step,Training Loss
500,0.4579


Step,Training Loss
500,0.4529


Step,Training Loss
500,0.5065


Step,Training Loss
500,0.4544


Step,Training Loss
500,0.4483


Step,Training Loss
500,0.4642


KeyboardInterrupt: 

## Train wnc

In [6]:
def compute_metrics_eval(eval_preds):
    metric = load_metric("f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(average='macro',predictions=predictions, references=labels)

In [4]:
model_name = 'ufal/robeczech-base'
WNC_MODEL_PATH = '/home/horyctom/bias-detection-thesis/src/models/trained/wncs_pretrained.pth'
BATCH_SIZE = 64

In [13]:
training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=BATCH_SIZE,  
    per_device_eval_batch_size=BATCH_SIZE,
    eval_steps=1000,
    logging_steps=1000,
    save_steps=2000,
    disable_tqdm = False,
    warmup_steps=0,
    save_total_limit=5,
    evaluation_strategy="steps",
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
    weight_decay=0.2,
    output_dir = './',
    learning_rate=1e-5)

In [76]:
#Prep data
data_wnc = load_dataset('csv',data_files = '/home/horyctom/bias-detection-thesis/data/CS/processed/WNC/wnc.csv')['train']
data_wnc = data_wnc.train_test_split(0.1)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train = preprocess_data(data_wnc['train'],tokenizer,'sentence')
test = preprocess_data(data_wnc['test'],tokenizer,'sentence')


#Train

model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
model.to(device)
trainer = Trainer(model,training_args,train_dataset=train,data_collator=data_collator,tokenizer=tokenizer,eval_dataset=test,
                          compute_metrics=compute_metrics_eval)

trainer.train()
torch.save(model.state_dict(),WNC_MODEL_PATH)

Step,Training Loss,Validation Loss,F1
1000,0.6587,0.62404,0.621524
2000,0.617,0.602551,0.641175
3000,0.6049,0.598204,0.672542
4000,0.5959,0.601108,0.60532
5000,0.5972,0.588481,0.652214
6000,0.5944,0.586152,0.648411
7000,0.5912,0.587825,0.640611
8000,0.5895,0.58448,0.661348
9000,0.5831,0.582626,0.644682
10000,0.5855,0.582296,0.655734
