In [1]:
# Making imports convenient
import sys
import os
# Project root: the part of the current working directory that precedes '/notebooks'.
PATH=os.getcwd().split('/notebooks')[0]
# Put the project root on the module search path so the `src.*` import below resolves.
sys.path.insert(1, PATH)

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
import transformers
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,TrainingArguments,Trainer

from src.utils.myutils import *
import yaml
from tqdm import tqdm
import logging
import json
import warnings
import random

logging.disable(logging.ERROR)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning) 

# Inputs located relative to the project root.
CS_DATA_PATH = f'{PATH}/data/CS/processed/BABE/train.csv'
CONFIG_PATH = f'{PATH}/src/utils/config.yaml'

# Checkpoint name plus the tokenizer and padding collator built for it.
model_name = 'fav-kky/FERNET-C5'
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    padding=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32

KeyboardInterrupt: 

In [None]:
# Instantiate the two-label sequence classifier once at notebook level.
# NOTE(review): eval_on_babe builds its own `model` for every fold, so nothing in the
# visible cells reads this variable — confirm whether this cell is still needed.
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);

In [16]:
def eval_on_babe(MODEL_PATH,to_train=True):
    """Cross-validate a `model_name` classifier on BABE and return the mean F1.

    For every split yielded by the notebook-level ``skfold`` a fresh two-label
    classifier is instantiated, optionally initialised from a saved state dict,
    optionally fine-tuned on the training part of the split, and then scored on
    the held-out part.

    Relies on notebook globals: ``skfold``, ``token_full``, ``model_name``,
    ``training_args``, ``data_collator``, ``tokenizer``, ``device``,
    ``BATCH_SIZE`` and the ``compute_metrics`` helper.

    Args:
        MODEL_PATH: Path to a state dict to load into the model before
            training/evaluation, or ``None`` to keep the weights that
            ``from_pretrained(model_name)`` provides.
        to_train: When True (the default, so one-argument calls work), fine-tune
            on the training part of each split with ``Trainer``; when False the
            model is evaluated exactly as loaded.

    Returns:
        The mean of the per-fold ``'f1'`` values returned by ``compute_metrics``.
    """
    scores=[]
    # Report the fold count actually configured on the shared splitter instead of
    # a hard-coded number that can drift from it.
    print(f"Running {skfold.get_n_splits()}-fold CV on model: ",model_name,"...")
    for train_index, val_index in skfold.split(token_full['input_ids'],token_full['label']):

        token_train = Dataset.from_dict(token_full[train_index])
        token_valid = Dataset.from_dict(token_full[val_index])

        # Re-create the model for every fold so no fold starts from weights that
        # were already updated on another split.
        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
        if MODEL_PATH is not None:
            print("tuning pretrained")
            # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
            model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
        model.to(device)

        if to_train:
            trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
            trainer.train()

        # Evaluation on the held-out part of the split.
        # NOTE(review): assumes compute_metrics puts the model into eval mode — confirm.
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model,device,eval_dataloader)['f1'])
        print(scores[-1])

    return np.mean(scores)

In [3]:
# Read the BABE CSV; the single file is exposed under the 'train' split key.
data = load_dataset('csv',data_files = CS_DATA_PATH)['train']
# Parse the project YAML configuration.
# NOTE(review): config_data is not referenced in the visible cells, and yaml.safe_load
# would be enough if the file holds only plain scalars/lists/mappings — confirm.
with open(CONFIG_PATH) as f:
    config_data = yaml.load(f, Loader=yaml.FullLoader)

In [4]:
# Ten stratified, shuffled folds with a fixed seed; eval_on_babe draws its splits from here.
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [5]:
# State-dict checkpoints that eval_on_babe can load before fine-tuning/evaluation.
# They are resolved against the project root (PATH) rather than one user's home
# directory, so the notebook works from any checkout of the repository.
# NOTE(review): assumes PATH is the root of the bias-detection-thesis checkout — confirm.
TRAINED_MODELS_DIR = PATH + '/src/models/trained'

subj_path = TRAINED_MODELS_DIR + '/subj_balanced.pth'
mb_path = TRAINED_MODELS_DIR + '/mb_balanced.pth'
wikinpov_path = TRAINED_MODELS_DIR + '/wiki_balanced.pth'
all_path = TRAINED_MODELS_DIR + '/all_balanced.pth'
wncs_path = TRAINED_MODELS_DIR + '/wncs_balanced.pth'
all_wo_mb_path = TRAINED_MODELS_DIR + '/all_balanced_wo_mb.pth'

In [6]:
# Hyper-parameters handed to the Trainer that eval_on_babe creates for each fold.
training_args = TrainingArguments(
    # Trainer output goes to the current working directory.
    output_dir = './',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,  
    # Report the training loss every 50 update steps.
    logging_steps=50,
    disable_tqdm = False,
    # Cap the number of checkpoints kept on disk.
    save_total_limit=2,
    weight_decay=0.1,
    learning_rate=2e-5)

In [7]:
# Prepare the whole BABE set once; eval_on_babe later reads its 'input_ids' and 'label'
# columns and slices it per fold.
# NOTE(review): preprocess_data presumably tokenizes the 'sentence' column — confirm in src.utils.myutils.
token_full = preprocess_data(data,tokenizer,'sentence')

## Vanilla

In [19]:
# Baseline: no extra checkpoint is loaded; fine-tune on each fold and return the mean F1.
# `to_train` is passed explicitly so the call matches eval_on_babe's two-parameter signature.
eval_on_babe(None, to_train=True)

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.5916
100,0.4868
150,0.3948
200,0.3563
250,0.2928


0.7749219263642341


Step,Training Loss
50,0.5875
100,0.5105
150,0.434
200,0.3918
250,0.3127


0.817824430988533


Step,Training Loss
50,0.6059
100,0.5155
150,0.4214
200,0.3908
250,0.3419


0.8194444444444444


Step,Training Loss
50,0.6151
100,0.5259
150,0.4501
200,0.3766
250,0.3495


0.7596201899050474


Step,Training Loss
50,0.5869
100,0.4723
150,0.3813
200,0.3489
250,0.2763


0.7524702488279842


Step,Training Loss
50,0.5663
100,0.4867
150,0.3875
200,0.347
250,0.2915


0.7862126245847176


Step,Training Loss
50,0.6226
100,0.4887
150,0.425
200,0.3954
250,0.3512


0.7686222808174027


Step,Training Loss
50,0.5805
100,0.481
150,0.3998
200,0.3866
250,0.3042


0.7670772676371781


Step,Training Loss
50,0.5913
100,0.5098
150,0.4189
200,0.3868
250,0.3219


0.7900947057910263


Step,Training Loss
50,0.5663
100,0.4765
150,0.3716
200,0.3493
250,0.2864


0.7776377627433766


0.7813925882103944

In [20]:
# Hand-copied mean F1 from the baseline run above; it does not update if that run is repeated.
print(0.7813925882103944)

0.7813925882103944


## SUBJ

In [15]:
# Start from the checkpoint at subj_path, fine-tune on each fold and return the mean F1.
# `to_train` is passed explicitly so the call matches eval_on_babe's two-parameter signature.
eval_on_babe(subj_path, to_train=True)

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.5475
100,0.4742
150,0.3739
200,0.3262
250,0.2791


0.7684911242603552


Step,Training Loss
50,0.5878
100,0.4913
150,0.3974
200,0.3715
250,0.2867


0.8115015974440895


Step,Training Loss
50,0.5775
100,0.4722
150,0.3835
200,0.3403
250,0.2802


0.8265738872647919


Step,Training Loss
50,0.5525
100,0.478
150,0.3824
200,0.2988
250,0.2676


0.7657559958289886


Step,Training Loss
50,0.5478
100,0.4544
150,0.3678
200,0.3369
250,0.2707


0.7778396953654685


Step,Training Loss
50,0.543
100,0.4824
150,0.3742
200,0.3317
250,0.2737


0.7877551020408162


Step,Training Loss
50,0.5591
100,0.4579
150,0.3569
200,0.3375
250,0.273


0.8006184291898578


Step,Training Loss
50,0.558
100,0.4498
150,0.3628
200,0.3461
250,0.2618


0.7874045013421432


Step,Training Loss
50,0.5559
100,0.4805
150,0.3734
200,0.3521
250,0.2817


0.7769188522636601


Step,Training Loss
50,0.5534
100,0.4578
150,0.3646
200,0.3374
250,0.2782


0.7747152995543819


0.7877574484554553

In [17]:
# Hand-copied mean F1 from the subj_path run above; it does not update if that run is repeated.
print(0.7877574484554553)

0.7877574484554553


## WIKI

In [21]:
# Start from the checkpoint at wikinpov_path, fine-tune on each fold and return the mean F1.
# `to_train` is passed explicitly so the call matches eval_on_babe's two-parameter signature.
eval_on_babe(wikinpov_path, to_train=True)

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...
tuning pretrained


Step,Training Loss
50,0.5052
100,0.4554
150,0.3547
200,0.3149
250,0.2545


0.7722329838370794
tuning pretrained


Step,Training Loss
50,0.5327
100,0.4648
150,0.374
200,0.3489
250,0.2649


0.8178616275152368
tuning pretrained


Step,Training Loss
50,0.5565
100,0.4595
150,0.3659
200,0.326
250,0.2688


0.8199134199134199
tuning pretrained


Step,Training Loss
50,0.5343
100,0.4617
150,0.3526
200,0.2842
250,0.2562


0.7596201899050474
tuning pretrained


Step,Training Loss
50,0.5325
100,0.4427
150,0.3569
200,0.318
250,0.2513


0.774891774891775
tuning pretrained


Step,Training Loss
50,0.512
100,0.4644
150,0.3659
200,0.3245
250,0.2699


0.7687651830197226
tuning pretrained


Step,Training Loss
50,0.5319
100,0.4475
150,0.3349
200,0.3097
250,0.2571


0.7756041426927502
tuning pretrained


Step,Training Loss
50,0.531
100,0.4418
150,0.3469
200,0.3385
250,0.2586


0.78427912448531
tuning pretrained


Step,Training Loss
50,0.504
100,0.4721
150,0.369
200,0.3444
250,0.2738


0.7704527320394184
tuning pretrained


Step,Training Loss
50,0.5285
100,0.4562
150,0.3652
200,0.3262
250,0.2666


0.7976091187100363


0.7841230297009795

In [22]:
# Hand-copied mean F1 from the wikinpov_path run above; it does not update if that run is repeated.
print(0.7841230297009795)

0.7841230297009795


## MB

In [23]:
# Start from the checkpoint at mb_path, fine-tune on each fold and return the mean F1.
# `to_train` is passed explicitly so the call matches eval_on_babe's two-parameter signature.
eval_on_babe(mb_path, to_train=True)

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...
tuning pretrained


Step,Training Loss
50,0.5356
100,0.4668
150,0.3486
200,0.2958
250,0.2355


0.7575222335843326
tuning pretrained


Step,Training Loss
50,0.5223
100,0.4877
150,0.3649
200,0.334
250,0.2494


0.7950564740546733
tuning pretrained


Step,Training Loss
50,0.5387
100,0.4828
150,0.351
200,0.3099
250,0.2536


0.7910463139457009
tuning pretrained


Step,Training Loss
50,0.5371
100,0.4693
150,0.3593
200,0.2854
250,0.2563


0.7680297397769518
tuning pretrained


Step,Training Loss
50,0.578
100,0.4474
150,0.3492
200,0.3184
250,0.2444


0.7322552962706397
tuning pretrained


Step,Training Loss
50,0.5476
100,0.4704
150,0.3607
200,0.3108
250,0.2639


0.7747152995543819
tuning pretrained


Step,Training Loss
50,0.5581
100,0.4578
150,0.3524
200,0.3151
250,0.2456


0.7676169983862292
tuning pretrained


Step,Training Loss
50,0.5414
100,0.4491
150,0.3486
200,0.3281
250,0.2406


0.7859192414854244
tuning pretrained


Step,Training Loss
50,0.5316
100,0.4837
150,0.3775
200,0.3262
250,0.2637


0.7695356797303343
tuning pretrained


Step,Training Loss
50,0.5396
100,0.4734
150,0.3451
200,0.3137
250,0.2417


0.7616353499896757


0.7703332626778344

In [24]:
# Hand-copied mean F1 from the mb_path run above; it does not update if that run is repeated.
print(0.7703332626778344)

0.7703332626778344


## All

In [25]:
# Start from the checkpoint at all_path, fine-tune on each fold and return the mean F1.
# `to_train` is passed explicitly so the call matches eval_on_babe's two-parameter signature.
eval_on_babe(all_path, to_train=True)

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...
tuning pretrained


Step,Training Loss
50,0.5221
100,0.4692
150,0.3467
200,0.3061
250,0.2522


0.7848797250859105
tuning pretrained


Step,Training Loss
50,0.5397
100,0.4776
150,0.3725
200,0.3529
250,0.2689


0.8337826797385621
tuning pretrained


Step,Training Loss
50,0.5617
100,0.4619
150,0.3567
200,0.3149
250,0.2476


0.8199134199134199
tuning pretrained


Step,Training Loss
50,0.5206
100,0.4635
150,0.3598
200,0.2948
250,0.2531


0.7549609927221321
tuning pretrained


Step,Training Loss
50,0.5524
100,0.4423
150,0.3548
200,0.32
250,0.2503


0.7781876255731287
tuning pretrained


Step,Training Loss
50,0.5236
100,0.4592
150,0.3422
200,0.3101
250,0.2664


0.7714002662456271
tuning pretrained


Step,Training Loss
50,0.5457
100,0.4556
150,0.3501
200,0.321
250,0.2479


0.7880347511014124
tuning pretrained


Step,Training Loss
50,0.5334
100,0.4377
150,0.3513
200,0.3243
250,0.2567


0.781611561740849
tuning pretrained


Step,Training Loss
50,0.5125
100,0.4753
150,0.3575
200,0.3351
250,0.2508


0.7636927673967402
tuning pretrained


Step,Training Loss
50,0.525
100,0.4482
150,0.3603
200,0.3227
250,0.2615


0.7653289372005565


0.784179272671834

In [26]:
# Hand-copied mean F1 from the all_path run above; it does not update if that run is repeated.
print(0.784179272671834)

0.784179272671834


## WNCS

In [27]:
# Start from the checkpoint at wncs_path, fine-tune on each fold and return the mean F1.
# `to_train` is passed explicitly so the call matches eval_on_babe's two-parameter signature.
eval_on_babe(wncs_path, to_train=True)

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...
tuning pretrained


Step,Training Loss
50,0.5284
100,0.4545
150,0.3608
200,0.3204
250,0.2817


0.7663056262976486
tuning pretrained


Step,Training Loss
50,0.536
100,0.473
150,0.3853
200,0.3409
250,0.2715


0.8144673370942687
tuning pretrained


Step,Training Loss
50,0.5509
100,0.4612
150,0.3736
200,0.3341
250,0.2748


0.8168844407376518
tuning pretrained


Step,Training Loss
50,0.525
100,0.4772
150,0.3634
200,0.2917
250,0.2707


0.75348276292261
tuning pretrained


Step,Training Loss
50,0.5501
100,0.4527
150,0.3694
200,0.3368
250,0.2726


0.7714002662456271
tuning pretrained


Step,Training Loss
50,0.5285
100,0.4725
150,0.3671
200,0.314
250,0.2818


0.7756041426927502
tuning pretrained


Step,Training Loss
50,0.5584
100,0.4538
150,0.3791
200,0.3398
250,0.2898


0.7948633655229094
tuning pretrained


Step,Training Loss
50,0.5315
100,0.4499
150,0.3658
200,0.3439
250,0.2777


0.7805271651425497
tuning pretrained


Step,Training Loss
50,0.5372
100,0.4654
150,0.3767
200,0.3485
250,0.2899


0.7673570836785418
tuning pretrained


Step,Training Loss
50,0.5488
100,0.4608
150,0.3749
200,0.3226
250,0.2784


0.7743055555555556


0.7815197745890112

In [28]:
# Hand-copied mean F1 from the wncs_path run above; it does not update if that run is repeated.
print(0.7815197745890112)

0.7815197745890112


## All wo MB

In [8]:
# Start from the checkpoint at all_wo_mb_path, fine-tune on each fold and return the mean F1.
# `to_train` is passed explicitly so the call matches eval_on_babe's two-parameter signature.
eval_on_babe(all_wo_mb_path, to_train=True)

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...
tuning pretrained


Step,Training Loss
50,0.518
100,0.4595
150,0.3526
200,0.308
250,0.2507


0.7724102616621435
tuning pretrained


Step,Training Loss
50,0.5703
100,0.4642
150,0.3741
200,0.3406
250,0.2677


0.8080696590630365
tuning pretrained


Step,Training Loss
50,0.5615
100,0.4539
150,0.368
200,0.3183
250,0.2602


0.8201506979042286
tuning pretrained


Step,Training Loss
50,0.534
100,0.4513
150,0.3609
200,0.2922
250,0.2543


0.7561865069028393
tuning pretrained


Step,Training Loss
50,0.532
100,0.442
150,0.3526
200,0.3274
250,0.258


0.7809622135040264
tuning pretrained


Step,Training Loss
50,0.5252
100,0.4575
150,0.358
200,0.325
250,0.2676


0.7807539682539681
tuning pretrained


Step,Training Loss
50,0.5323
100,0.4499
150,0.3545
200,0.3229
250,0.268


0.7847589040249592
tuning pretrained


Step,Training Loss
50,0.5362
100,0.4453
150,0.3544
200,0.3356
250,0.2569


0.7943309162821357
tuning pretrained


Step,Training Loss
50,0.5176
100,0.474
150,0.3654
200,0.3472
250,0.2664


0.7875887110084172
tuning pretrained


Step,Training Loss
50,0.5233
100,0.4492
150,0.3541
200,0.3153
250,0.2554


0.7588995930142703


0.7844111431620024

In [9]:
# Hand-copied mean F1 from the all_wo_mb_path run above; it does not update if that run is repeated.
print(0.7844111431620024)

0.7844111431620024


## Evaluate all checkpoints on BABE without fine-tuning

In [18]:
# Score the subj_path checkpoint on the held-out folds as loaded; to_train=False skips the Trainer step.
eval_on_babe(subj_path,False)

0.5542269457401406


In [20]:
# Score the wikinpov_path checkpoint on the held-out folds as loaded; to_train=False skips the Trainer step.
eval_on_babe(wikinpov_path,False)

0.634449005739866


In [22]:
# Score the mb_path checkpoint on the held-out folds as loaded; to_train=False skips the Trainer step.
eval_on_babe(mb_path,False)

0.46307524184646615


In [24]:
# Score the all_path checkpoint on the held-out folds as loaded; to_train=False skips the Trainer step.
eval_on_babe(all_path,False)

0.6422974189680117


In [26]:
# Score the wncs_path checkpoint on the held-out folds as loaded; to_train=False skips the Trainer step.
eval_on_babe(wncs_path,False)

0.6697388712101715


In [28]:
# Score the all_wo_mb_path checkpoint on the held-out folds as loaded; to_train=False skips the Trainer step.
eval_on_babe(all_wo_mb_path,False)

0.5280019026747172
