In [1]:
# Making imports convenient
import sys
import os
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
import transformers
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,TrainingArguments,Trainer

from src.utils.myutils import *
import yaml
from tqdm import tqdm
import logging
import json
import warnings
import random

logging.disable(logging.ERROR)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning) 

# ---- Notebook-wide settings -------------------------------------------------

# Batch size used for both training and evaluation in the CV cells.
BATCH_SIZE = 64

# Hugging Face identifier of the base model / tokenizer.
model_name = 'fav-kky/FERNET-C5'

# Input files, resolved against the repository root held in PATH.
CONFIG_PATH = PATH + '/src/utils/config.yaml'
CS_DATA_PATH = PATH + '/data/CS/processed/BABE/train.csv'

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Slow (non-fast) tokenizer plus a collator that pads each batch dynamically.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    padding=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
# Read the YAML configuration.
# NOTE(review): yaml.FullLoader accepts more than plain scalars/lists/mappings;
# if the config only contains plain data, yaml.safe_load would be the stricter choice.
with open(CONFIG_PATH) as config_file:
    config_data = yaml.load(config_file, Loader=yaml.FullLoader)

# Load the CSV through `datasets`; the rows are taken from the 'train' split.
data = load_dataset('csv', data_files=CS_DATA_PATH)['train']

In [6]:
# Stratified 10-fold splitter. Shuffling is seeded, so every CV cell below
# that calls skfold.split(...) on the same data iterates over the same folds.
skfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [7]:
# Checkpoint files whose state dicts are loaded (via torch.load) at the start of
# every fold in the CV cells below.
# Built from the repository root in PATH, like the data and config paths, instead
# of a user-specific absolute path, so the notebook runs from any checkout.
# NOTE(review): assumes PATH points at the same repository root that the previous
# hardcoded '/home/horyctom/bias-detection-thesis' prefix referred to.
TRAINED_MODELS_DIR = os.path.join(PATH, 'src', 'models', 'trained')

subj_path = os.path.join(TRAINED_MODELS_DIR, 'subj_balanced.pth')
mb_path = os.path.join(TRAINED_MODELS_DIR, 'mb_balanced.pth')
wikinpov_path = os.path.join(TRAINED_MODELS_DIR, 'wiki_balanced.pth')
all_path = os.path.join(TRAINED_MODELS_DIR, 'all_balanced.pth')

In [8]:
# Fine-tuning hyper-parameters; the same object is passed to the Trainer of
# every fold in every CV cell.
training_args = TrainingArguments(
    output_dir='./',
    # optimisation
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    weight_decay=0.1,
    # logging / checkpoint housekeeping
    logging_steps=50,
    save_total_limit=2,
    disable_tqdm=False,
)

In [9]:
# Tokenize the whole dataset once up front; the CV cells index into the result
# per fold and read its 'input_ids' and 'label' columns.
# NOTE(review): 'sentence' is presumably the name of the text column - confirm
# against preprocess_data in src.utils.myutils.
token_full = preprocess_data(data, tokenizer, 'sentence')

In [13]:
# Checkpoint whose weights initialise the model in the CV cell that reads MODEL_PATH.
MODEL_PATH = subj_path

In [15]:
# Stratified k-fold cross-validation, restarting every fold from the checkpoint
# at MODEL_PATH and collecting the held-out F1 of each fold.
n_folds = skfold.get_n_splits()  # report the real fold count instead of a hardcoded one
scores = []
print(f"Running {n_folds}-fold CV on model: ", model_name, "...")
for train_index, val_index in skfold.split(token_full['input_ids'], token_full['label']):

    token_train = Dataset.from_dict(token_full[train_index])
    token_valid = Dataset.from_dict(token_full[val_index])

    # Fresh model per fold, so nothing learned on one fold carries over to the next.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    model.to(device)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Evaluation on the held-out fold. Training leaves the model in train mode;
    # switch to eval mode so dropout is off while scoring (harmless if
    # compute_metrics already does this itself).
    model.eval()
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])
    print(scores[-1])

# Mean F1 across all folds.
print(np.mean(scores))

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.537
100,0.333


0.781352728468113


Step,Training Loss
50,0.5487
100,0.3243


0.8049844236760124


Step,Training Loss
50,0.5331
100,0.3055


0.8203282464727901


Step,Training Loss
50,0.5515
100,0.3104


0.7531234382808596


Step,Training Loss
50,0.4959
100,0.2966


0.7747152995543819


Step,Training Loss
50,0.5251
100,0.3094


0.7809622135040264


Step,Training Loss
50,0.5248
100,0.3164


0.7944579404619754


Step,Training Loss
50,0.5199
100,0.312


0.7938467891802603


Step,Training Loss
50,0.5222
100,0.336


0.7611619150080688


Step,Training Loss
50,0.5254
100,0.3291


0.7554051823733288
0.7820338176979816


In [18]:
# Mean F1 over the folds of the subj_path run, copied from that run's printed mean.
subj_f1 = 0.7820338176979816
print("SUBJ: ", subj_f1)

SUBJ:  0.7820338176979816


In [16]:
# Stratified k-fold cross-validation, restarting every fold from the checkpoint
# at wikinpov_path and collecting the held-out F1 of each fold.
n_folds = skfold.get_n_splits()  # report the real fold count instead of a hardcoded one
scores = []
print(f"Running {n_folds}-fold CV on model: ", model_name, "...")
for train_index, val_index in skfold.split(token_full['input_ids'], token_full['label']):

    token_train = Dataset.from_dict(token_full[train_index])
    token_valid = Dataset.from_dict(token_full[val_index])

    # Fresh model per fold, so nothing learned on one fold carries over to the next.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
    model.load_state_dict(torch.load(wikinpov_path, map_location=device))
    model.to(device)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Evaluation on the held-out fold. Training leaves the model in train mode;
    # switch to eval mode so dropout is off while scoring (harmless if
    # compute_metrics already does this itself).
    model.eval()
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])
    print(scores[-1])

# Mean F1 across all folds.
print(np.mean(scores))

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.485
100,0.2722


0.7656152228547981


Step,Training Loss
50,0.5032
100,0.31


0.8274601878317681


Step,Training Loss
50,0.5327
100,0.2989


0.8104593333951133


Step,Training Loss
50,0.5114
100,0.2991


0.7588569518716579


Step,Training Loss
50,0.4897
100,0.2987


0.78427912448531


Step,Training Loss
50,0.4979
100,0.2778


0.7881481481481482


Step,Training Loss
50,0.5038
100,0.2982


0.7785709142898285


Step,Training Loss
50,0.4845
100,0.2803


0.7747152995543819


Step,Training Loss
50,0.5314
100,0.3454


0.7583019160254093


Step,Training Loss
50,0.5013
100,0.2933


0.7651547178371467
0.7811561816293563


In [19]:
# Mean F1 over the folds of the wikinpov_path run, copied from that run's printed mean.
wiki_f1 = 0.7811561816293563
print("WIKI: ", wiki_f1)

WIKI:  0.7811561816293563


In [17]:
# Stratified k-fold cross-validation, restarting every fold from the checkpoint
# at mb_path and collecting the held-out F1 of each fold.
n_folds = skfold.get_n_splits()  # report the real fold count instead of a hardcoded one
scores = []
print(f"Running {n_folds}-fold CV on model: ", model_name, "...")
for train_index, val_index in skfold.split(token_full['input_ids'], token_full['label']):

    token_train = Dataset.from_dict(token_full[train_index])
    token_valid = Dataset.from_dict(token_full[val_index])

    # Fresh model per fold, so nothing learned on one fold carries over to the next.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
    model.load_state_dict(torch.load(mb_path, map_location=device))
    model.to(device)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Evaluation on the held-out fold. Training leaves the model in train mode;
    # switch to eval mode so dropout is off while scoring (harmless if
    # compute_metrics already does this itself).
    model.eval()
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])
    print(scores[-1])

# Mean F1 across all folds.
print(np.mean(scores))

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.5348
100,0.2909


0.7780210292621257


Step,Training Loss
50,0.5219
100,0.2965


0.8178839381411729


Step,Training Loss
50,0.5268
100,0.2886


0.7943309162821357


Step,Training Loss
50,0.538
100,0.3045


0.7519146264908976


Step,Training Loss
50,0.5435
100,0.2885


0.74734219269103


Step,Training Loss
50,0.5216
100,0.2855


0.7687651830197226


Step,Training Loss
50,0.5609
100,0.3024


0.7618418881003466


Step,Training Loss
50,0.5137
100,0.2998


0.7917831074035453


Step,Training Loss
50,0.5211
100,0.2949


0.7695356797303343


Step,Training Loss
50,0.5336
100,0.2971


0.7657344455530071
0.7747153006674318


In [20]:
# Mean F1 over the folds of the mb_path run, copied from that run's printed mean.
# Labelled MB: the "WIKI" label belongs to the wikinpov_path result, and reusing
# it here made two different results indistinguishable.
mb_f1 = 0.7747153006674318
print("MB: ", mb_f1)

WIKI:  0.7747153006674318


In [10]:
# Stratified k-fold cross-validation, restarting every fold from the checkpoint
# at all_path and collecting the held-out F1 of each fold.
n_folds = skfold.get_n_splits()  # report the real fold count instead of a hardcoded one
scores = []
print(f"Running {n_folds}-fold CV on model: ", model_name, "...")
for train_index, val_index in skfold.split(token_full['input_ids'], token_full['label']):

    token_train = Dataset.from_dict(token_full[train_index])
    token_valid = Dataset.from_dict(token_full[val_index])

    # Fresh model per fold, so nothing learned on one fold carries over to the next.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
    model.load_state_dict(torch.load(all_path, map_location=device))
    model.to(device)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Evaluation on the held-out fold. Training leaves the model in train mode;
    # switch to eval mode so dropout is off while scoring (harmless if
    # compute_metrics already does this itself).
    model.eval()
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])
    print(scores[-1])

# Mean F1 across all folds.
print(np.mean(scores))

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.4933
100,0.273


0.7782489501298862


Step,Training Loss
50,0.5275
100,0.3202


0.8270788999836307


Step,Training Loss
50,0.5353
100,0.2988


0.8070500927643784


Step,Training Loss
50,0.514
100,0.2838


0.7623117623117623


Step,Training Loss
50,0.5307
100,0.3129


0.771908689339882


Step,Training Loss
50,0.5008
100,0.2876


0.7639866118151768


Step,Training Loss
50,0.4962
100,0.2973


0.7750494396835861


Step,Training Loss
50,0.4779
100,0.2818


0.7847589040249592


Step,Training Loss
50,0.4965
100,0.3331


0.7599700586351728


Step,Training Loss
50,0.4919
100,0.3064


0.7738193869096934
0.7804182795598127
