In [1]:
# Put the repository root on the import path so `src.*` modules resolve
# when the kernel is started from inside the notebooks directory.
import os
import sys

# The project root is whatever precedes the first '/notebooks' in the current
# working directory; if that marker is absent, the working directory itself
# is used unchanged.
working_dir = os.getcwd()
PATH = working_dir.partition('/notebooks')[0]

# Index 1 keeps the interpreter's own first entry in front of the project root.
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,\
                            Trainer,EarlyStoppingCallback
from sklearn.model_selection import ParameterGrid
from src.utils.myutils import *
import logging
import warnings

# Globally drop every log record of severity ERROR or lower for this notebook.
logging.disable(logging.ERROR)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# `np.warnings` was only an accidental re-export of the standard-library
# `warnings` module and no longer exists in recent NumPy releases, so the
# filter is registered through the standard library directly. The warning
# class is looked up under `np.exceptions` when that namespace exists and on
# the top-level `np` module otherwise, which covers old and new NumPy versions.
# NOTE(review): `np` is not imported explicitly in this notebook; it presumably
# arrives through `from src.utils.myutils import *` - confirm.
_visible_deprecation = getattr(np, 'exceptions', np).VisibleDeprecationWarning
warnings.filterwarnings('ignore', category=_visible_deprecation)

# Checkpoint identifier of the pretrained model and project-relative locations.
model_name = 'ufal/robeczech-base'
CONFIG_PATH = PATH + '/src/utils/config.yaml'
MODELS_PATH = PATH + '/src/models/trained/'
BATCH_SIZE = 16

# Fine-tuning hyper-parameters shared by every training run in this notebook.
# Outputs and logs are both written to the current working directory.
training_args = TrainingArguments(
            output_dir = './',
            num_train_epochs=3,
            save_total_limit=2,
            disable_tqdm=False,
            per_device_train_batch_size=BATCH_SIZE,  
            warmup_steps=0,
            weight_decay=0.1,
            logging_dir='./',
            learning_rate=2e-5)

# Fixed-seed, shuffled 5-fold stratified splitter used for cross-validation.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): `padding=True` is passed at load time here; padding of batches
# is performed by the collator below - confirm the keyword is still needed.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)

# Pads each batch dynamically using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Data Processing

In [2]:
def _read_train_csv(relative_path):
    """Load one CSV file located under the project root and return its 'train' split.

    `relative_path` is appended verbatim to `PATH`, so it must start with '/'.
    """
    return load_dataset('csv', data_files=PATH + relative_path)['train']


# Corpora that already have a processed training file.
babe = _read_train_csv('/data/CS/processed/BABE/train.csv')
cwnc = _read_train_csv('/data/CS/processed/CWNC/train.csv')

# Corpora read straight from their raw CSV files.
basil = _read_train_csv('/data/CS/raw/BASIL/basil.csv')
cw_hard = _read_train_csv('/data/CS/raw/CW-HARD/cw-hard.csv')
mpqa = _read_train_csv('/data/CS/raw/MPQA/mpqa.csv')
nfnj = _read_train_csv('/data/CS/raw/NFNJ/nfnj.csv')
subj = _read_train_csv('/data/CS/raw/SUBJ/subj.csv')
ua_crisis = _read_train_csv('/data/CS/raw/UA-crisis/ua-crisis.csv')
wikibias = _read_train_csv('/data/CS/raw/WikiBias/wikibias.csv')

# Parallel lists: position k in `datasets_str` names the dataset at position k
# in `datasets`.
datasets_str = [
    'babe', 'cwnc', 'basil', 'cw_hard', 'mpqa',
    'nfnj', 'subj', 'ua_crisis', 'wikibias',
]
datasets = [
    babe, cwnc, basil, cw_hard, mpqa,
    nfnj, subj, ua_crisis, wikibias,
]

In [3]:
# Tokenise the 'sentence' column of every corpus, keyed by its short name.
tokenized = {}
for corpus_name, corpus in zip(datasets_str, datasets):
    # Echo the name first so the progress output below can be attributed.
    print(corpus_name)
    tokenized[corpus_name] = preprocess_data(corpus, tokenizer, 'sentence')

babe
cwnc
basil
cw_hard


  0%|          | 0/5 [00:00<?, ?ba/s]

mpqa


  0%|          | 0/16 [00:00<?, ?ba/s]

nfnj


  0%|          | 0/1 [00:00<?, ?ba/s]

subj


  0%|          | 0/10 [00:00<?, ?ba/s]

ua_crisis


  0%|          | 0/3 [00:00<?, ?ba/s]

wikibias


  0%|          | 0/9 [00:00<?, ?ba/s]

## Evaluate the BABE baseline with a fixed seed

In [4]:
# Stratified K-fold evaluation of the un-augmented baseline on BABE:
# a fresh model is fine-tuned on each training fold and scored (F1) on the
# held-out fold.
babe_tokens = tokenized['babe']
fold_splits = skfold.split(babe_tokens['input_ids'], babe_tokens['label'])

scores = []
for train_index, val_index in fold_splits:
    # Materialise this fold's rows as two standalone datasets.
    token_train = Dataset.from_dict(babe_tokens[train_index])
    token_valid = Dataset.from_dict(babe_tokens[val_index])

    # Reset the torch RNGs to the same fixed seed at the start of every fold,
    # before the pretrained checkpoint is loaded.
    torch.cuda.manual_seed(12345)
    torch.manual_seed(12345)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.to(device)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    fold_f1 = compute_metrics(model, device, eval_dataloader)['f1']
    scores.append(fold_f1)
    print(fold_f1)

# Per-fold F1 values followed by their mean.
print(scores)
print(np.mean(scores))

  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


Step,Training Loss


0.7869009980782344


Step,Training Loss


0.7694805194805194


Step,Training Loss


0.7797558166795913


Step,Training Loss


0.7861863037838367


Step,Training Loss


0.7698396935735565
[0.7869009980782344, 0.7694805194805194, 0.7797558166795913, 0.7861863037838367, 0.7698396935735565]
0.7784326663191476
