In [25]:
# Making imports convenient
import sys
import os
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from datasets import load_dataset, Dataset, concatenate_datasets
import transformers
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,TrainingArguments,Trainer

from src.utils.myutils import *
import yaml
from tqdm import tqdm
import logging
import json
import warnings
import random
import pandas as pd

logging.disable(logging.ERROR)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning) 

# ---- Notebook-wide configuration ----
# Batch size shared by the Trainer and by every DataLoader in this notebook.
BATCH_SIZE = 32

# Locations built from PATH (the part of the working directory that precedes
# '/notebooks').
CS_DATA_PATH = PATH + '/data/CS/processed/'
CONFIG_PATH = PATH + '/src/utils/config.yaml'

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face checkpoint used for the tokenizer and for every model below.
model_name = 'fav-kky/FERNET-C5'
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    padding=True,
)
# Collator built on the same tokenizer; passed to every DataLoader and Trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [4]:
def _load_cs_csv(relative_path):
    """Read one CSV located under CS_DATA_PATH and return its 'train' split."""
    return load_dataset('csv', data_files=CS_DATA_PATH + relative_path)['train']


# Corpus that the cross-validation further down is run on.
babe = _load_cs_csv('BABE/train.csv')

# Remaining corpora; these are pooled later for inference / augmentation.
cw_hard = _load_cs_csv('CW-HARD/cw-hard.csv')
cwnc = _load_cs_csv('CWNC/cwnc.csv')
wikibias = _load_cs_csv('WikiBias/wikibias.csv')
basil = _load_cs_csv('BASIL/basil.csv')
nfnj = _load_cs_csv('NFNJ/nfnj.csv')
ua_crisis = _load_cs_csv('UA-crisis/ua-crisis.csv')
mpqa = _load_cs_csv('MPQA/mpqa.csv')
subj = _load_cs_csv('SUBJ/subj.csv')

In [51]:
# Fine-tuning hyper-parameters handed to every Trainer created in this notebook.
training_args = TrainingArguments(
    output_dir='./',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    logging_steps=50,          # emit the training loss every 50 optimizer steps
    disable_tqdm=False,
    save_total_limit=2,
    weight_decay=0.1,
    learning_rate=3e-5,
)

# Sequence classifier with a 2-label head. The statement must sit at top level:
# an indented line here is rejected with "IndentationError: unexpected indent".
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [11]:
# Instantiate the pretrained checkpoint with a 2-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [12]:
# Restore previously trained weights into `model`.
# The checkpoint is addressed relative to PATH instead of an absolute home
# directory so the notebook also runs on other machines.
# NOTE(review): assumes PATH resolves to the repository root that contains
# `src/models/trained/` - confirm on your checkout.
# `map_location=device` lets a checkpoint saved on a GPU load on a CPU-only host.
trained_state = torch.load(PATH + '/src/models/trained/all_balanced.pth', map_location=device)
model.load_state_dict(trained_state)

<All keys matched successfully>

In [14]:
# Pool every corpus except BABE into one dataset and shuffle it with a fixed seed.
# basil, nfnj and ua_crisis go through `resample` first (helper from
# src.utils.myutils; presumably class re-balancing - TODO confirm).
pooled_parts = [
    cw_hard,
    cwnc,
    wikibias,
    resample(basil),
    resample(nfnj),
    resample(ua_crisis),
    mpqa,
    subj,
]
all_ = concatenate_datasets(pooled_parts).shuffle(seed=42)

In [17]:
# Run the pooled corpus through `preprocess_data` (helper from src.utils.myutils)
# with the shared tokenizer; 'sentence' is presumably the text column - TODO confirm.
all_tok = preprocess_data(
    all_,
    tokenizer,
    'sentence',
)

  0%|          | 0/47 [00:00<?, ?ba/s]

In [40]:
# Iterate over the pooled corpus in stored order (no shuffling), so that row i
# of the model output corresponds to row i of `all_tok`.
unlabelled_dataloader = DataLoader(
    all_tok,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)
# Start from an empty tensor on the target device for the model's predictions.
logits = torch.Tensor().to(device)

In [41]:
# Score every row of the pooled corpus with the restored classifier.
model.eval()       # inference mode for the forward passes below
model.to(device)

# Per-batch results are collected locally and `logits` is assigned once at the
# end. Re-running this cell therefore replaces the result instead of appending
# duplicate rows to an accumulator created in another cell, and it avoids
# re-copying the growing tensor on every iteration.
batch_probs = []
with torch.no_grad():
    for batch in tqdm(unlabelled_dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # Normalise over the class axis explicitly; omitting `dim` is deprecated.
        batch_probs.append(F.softmax(outputs.logits, dim=-1))

# Despite its name, `logits` holds softmax probabilities (one row per input row).
# An empty loader yields the same empty tensor the accumulator started as.
logits = torch.cat(batch_probs) if batch_probs else torch.Tensor().to(device)

100%|██████████| 1444/1444 [01:04<00:00, 22.52it/s]


In [44]:
# Pick the k rows the model is most confident about for each class.
# Column 0 is treated as the "unbiased" class and column 1 as the "biased" class.
# NOTE(review): `k` is not assigned in any cell visible here - it appears to come
# from hidden kernel state; define it explicitly before running this cell.
unbiased_topk_indices = torch.topk(logits[:, 0], k).indices
biased_topk_indices = torch.topk(logits[:, 1], k).indices

# Unbiased picks first, biased picks second, moved to the CPU for dataset indexing.
indices = torch.cat([unbiased_topk_indices, biased_topk_indices]).cpu()

In [46]:
# Display the selected row indices as a quick sanity check.
indices

tensor([25772, 20059, 41556,  ..., 15036,  7623, 12238])

In [48]:
# Build a pseudo-labelled augmentation set from the selected rows.
# The rows are fetched once and the individual columns are read from that result.
picked_rows = all_tok[indices]
masks = picked_rows['attention_mask']
input_ids = picked_rows['input_ids']
token_type_ids = picked_rows['token_type_ids']

# `indices` lists the unbiased picks first and the biased picks second, so the
# labels are laid out as a run of 0s followed by a run of 1s.
labels = [0] * len(unbiased_topk_indices) + [1] * len(biased_topk_indices)

to_add = Dataset.from_dict({
    'attention_mask': masks,
    'input_ids': input_ids,
    'label': labels,
    'token_type_ids': token_type_ids,
})

In [53]:
# Alternative augmentation set: the whole SUBJ corpus run through
# `preprocess_data`. This overwrites any `to_add` built earlier.
to_add = preprocess_data(
    subj,
    tokenizer,
    'sentence',
)

  0%|          | 0/10 [00:00<?, ?ba/s]

In [64]:
# Alternative augmentation set: only the first 2500 SUBJ rows.
# The slice is wrapped back into a Dataset before preprocessing.
subj_head = subj[:2500]
subj_small = Dataset.from_dict(subj_head)

# Overwrites any `to_add` built earlier.
to_add = preprocess_data(
    subj_small,
    tokenizer,
    'sentence',
)

  0%|          | 0/3 [00:00<?, ?ba/s]

In [65]:
# 10-fold stratified cross-validation on BABE. In every fold the training split
# is extended with `to_add`; the validation split contains BABE rows only.
scores = []
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
babe_tok = preprocess_data(babe, tokenizer, 'sentence')
print("Running 10-fold CV on model: ",model_name,"...")
for train_index, val_index in skfold.split(babe_tok['input_ids'], babe_tok['label']):

    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])

    # Augmentation rows are added to the training split only.
    token_train = concatenate_datasets([token_train, to_add])

    # Every fold starts again from the pretrained checkpoint.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.to(device)
    trainer = Trainer(
        model,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # evaluation
    # Training leaves the module in train mode. `compute_metrics` comes from
    # src.utils.myutils and is not visible here, so switch to eval mode
    # explicitly to guarantee dropout is off while the fold is scored.
    model.eval()
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(model, device, eval_dataloader)['f1'])
    print(scores[-1])

# Mean F1 over the completed folds.
print(np.mean(scores))

Running 10-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.4636
100,0.3634
150,0.3538
200,0.2444
250,0.2196
300,0.2096
350,0.1682
400,0.096
450,0.1204


0.7853816789987003


Step,Training Loss
50,0.4852
100,0.3865
150,0.3968
200,0.2877
250,0.2727
300,0.2591
350,0.2371
400,0.1751
450,0.1644


0.788651988868882


Step,Training Loss
50,0.5015
100,0.4124
150,0.3703
200,0.3098
250,0.2769
300,0.2288
350,0.1979
400,0.1464
450,0.1227


0.7903204127420673


Step,Training Loss
50,0.493
100,0.3879
150,0.3626
200,0.2541
250,0.2567
300,0.2236
350,0.2068
400,0.1092
450,0.1133


0.7740720817643895


Step,Training Loss
50,0.4576
100,0.4107
150,0.3967
200,0.266
250,0.2456
300,0.2366
350,0.1802
400,0.1221
450,0.129


0.7611619150080688


Step,Training Loss
50,0.4417
100,0.3919
150,0.3758
200,0.2779
250,0.2318
300,0.2354
350,0.1743
400,0.1149
450,0.135


0.7647471982647318


KeyboardInterrupt: 