In [1]:
# Making imports convenient
import sys
import os
# Everything in the working directory path before the first '/notebooks'
# (or the whole path when that marker is absent) is put on sys.path so that
# the `src.*` packages can be imported from this notebook.
PATH = os.getcwd().partition('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
import transformers
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,TrainingArguments,Trainer

from src.utils.myutils import *
import yaml
from tqdm import tqdm
import logging
import json
import warnings
import random

logging.disable(logging.ERROR)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning) 

# Input files, both addressed relative to PATH.
CS_DATA_PATH = f"{PATH}/data/CS/processed/BABE/train.csv"
CONFIG_PATH = f"{PATH}/src/utils/config.yaml"

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Shared by the training arguments and the evaluation DataLoader.
BATCH_SIZE = 64

In [2]:
# Read the CSV with the `datasets` csv builder and keep its 'train' split.
data = load_dataset('csv', data_files=CS_DATA_PATH)['train']

# Experiment configuration; the cells below read its 'models' entry.
# NOTE(review): FullLoader can construct more than plain data types. If the
# config only holds scalars/lists/dicts, yaml.safe_load would be the safer call.
with open(CONFIG_PATH) as config_file:
    config_data = yaml.load(config_file, Loader=yaml.FullLoader)

## Training

In [3]:
# Stratified 10-way splitter; shuffling uses a fixed seed so the fold
# assignment is the same on every run.
skfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [4]:
# One set of hyper-parameters shared by every model and every fold.
training_args = TrainingArguments(
    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    # Reporting: log the training loss every 50 steps, keep progress bars on
    logging_steps=50,
    disable_tqdm=False,
    # Checkpointing: written to the current directory, at most two retained
    output_dir='./',
    save_total_limit=2,
)

In [6]:
# Cross-validated fine-tuning of every checkpoint listed under 'models' in
# the config. `model_scores` maps model name -> list of per-fold F1 values
# computed on the held-out fold.
model_scores = {}

# Report the fold count the splitter is actually configured with, so the
# progress message cannot drift from `skfold`'s n_splits.
n_folds = skfold.get_n_splits()

for model_name in tqdm(config_data['models']):
    scores = []
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
    # Pads each batch dynamically; used for both training and evaluation.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # preprocess_data comes from src.utils.myutils; the result is indexed
    # below by column name ('input_ids', 'label') and by arrays of row indices.
    token_full = preprocess_data(data,tokenizer,'sentence')

    print(f"Running {n_folds}-fold CV on model: {model_name} ...")
    for train_index, val_index in skfold.split(token_full['input_ids'],token_full['label']):

        token_train = Dataset.from_dict(token_full[train_index])
        token_valid = Dataset.from_dict(token_full[val_index])

        # Seed python/numpy/torch *before* the model is built: the new
        # classification head is randomly initialised inside from_pretrained,
        # so seeding only later would leave fold results irreproducible.
        transformers.set_seed(training_args.seed)

        # Fresh model per fold so no weights leak between folds.
        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
        model.to(device)
        trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
        trainer.train()

        # Evaluation on the held-out fold. compute_metrics is defined
        # elsewhere, so switch to eval mode here to make sure dropout is
        # disabled regardless of what the helper does.
        model.eval()
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model,device,eval_dataloader)['f1'])

    print("Done.")
    print(np.mean(scores))
    model_scores[model_name] = scores
    

  0%|          | 0/6 [00:00<?, ?it/s]

Running 5-fold CV on model:  UWB-AIR/Czert-B-base-cased ...


Step,Training Loss
50,0.5385
100,0.3262


Step,Training Loss
50,0.5203
100,0.3409


Step,Training Loss
50,0.5235
100,0.3274


Step,Training Loss
50,0.5329
100,0.3045


Step,Training Loss
50,0.5255
100,0.3403


Step,Training Loss
50,0.5271
100,0.3358


Step,Training Loss
50,0.5389
100,0.3474


Step,Training Loss
50,0.5136
100,0.3148


Step,Training Loss
50,0.524
100,0.3287


Step,Training Loss
50,0.5225
100,0.3225


 17%|█▋        | 1/6 [09:00<45:02, 540.42s/it]

Done.
0.7759894788824253
Running 5-fold CV on model:  ufal/robeczech-base ...


Step,Training Loss
50,0.5758
100,0.4197


Step,Training Loss
50,0.569
100,0.4271


Step,Training Loss
50,0.575
100,0.4097


Step,Training Loss
50,0.5758
100,0.3993


Step,Training Loss
50,0.5919
100,0.4468


Step,Training Loss
50,0.6477
100,0.5471


Step,Training Loss
50,0.5849
100,0.4236


Step,Training Loss
50,0.5543
100,0.4221


Step,Training Loss
50,0.5647
100,0.4143


Step,Training Loss
50,0.57
100,0.4371


 33%|███▎      | 2/6 [17:07<33:56, 509.18s/it]

Done.
0.7736152637058488
Running 5-fold CV on model:  bert-base-multilingual-cased ...


Step,Training Loss
50,0.6061
100,0.456


Step,Training Loss
50,0.6196
100,0.4452


Step,Training Loss
50,0.6864
100,0.5511


Step,Training Loss
50,0.636
100,0.4818


Step,Training Loss
50,0.5887
100,0.4141


Step,Training Loss
50,0.5891
100,0.393


Step,Training Loss
50,0.6134
100,0.4478


Step,Training Loss
50,0.6131
100,0.4597


Step,Training Loss
50,0.6166
100,0.4721


Step,Training Loss
50,0.648
100,0.5113


 50%|█████     | 3/6 [27:35<28:09, 563.14s/it]

Done.
0.7335294970781417
Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.5231
100,0.3366


Step,Training Loss
50,0.5482
100,0.3458


Step,Training Loss
50,0.5563
100,0.3384


Step,Training Loss
50,0.5433
100,0.3283


Step,Training Loss
50,0.5603
100,0.3763


Step,Training Loss
50,0.5658
100,0.3794


Step,Training Loss
50,0.5115
100,0.3286


Step,Training Loss
50,0.5257
100,0.3489


Step,Training Loss
50,0.5438
100,0.3509


Step,Training Loss
50,0.5474
100,0.3819


 67%|██████▋   | 4/6 [35:23<17:31, 525.58s/it]

Done.
0.7803352776299648
Running 5-fold CV on model:  fav-kky/FERNET-News ...


Step,Training Loss
50,0.6163
100,0.4184


Step,Training Loss
50,0.7044
100,0.7043


Step,Training Loss
50,0.7127
100,0.7018


Step,Training Loss
50,0.5733
100,0.3811


Step,Training Loss
50,0.7124
100,0.7038


Step,Training Loss
50,0.6916
100,0.7073


Step,Training Loss
50,0.7185
100,0.6986


Step,Training Loss
50,0.6903
100,0.4914


Step,Training Loss
50,0.7117
100,0.7052


Step,Training Loss
50,0.6935
100,0.5884


 83%|████████▎ | 5/6 [43:42<08:36, 516.21s/it]

Done.
0.5659761591194752
Running 5-fold CV on model:  DeepPavlov/bert-base-bg-cs-pl-ru-cased ...


Step,Training Loss
50,0.5683
100,0.3894


Step,Training Loss
50,0.5997
100,0.4242


Step,Training Loss
50,0.5741
100,0.3911


Step,Training Loss
50,0.5879
100,0.3813


Step,Training Loss
50,0.5615
100,0.3984


Step,Training Loss
50,0.5644
100,0.3859


Step,Training Loss
50,0.6053
100,0.4281


Step,Training Loss
50,0.5666
100,0.3898


Step,Training Loss
50,0.5451
100,0.3925


Step,Training Loss
50,0.5558
100,0.3691


100%|██████████| 6/6 [51:40<00:00, 516.68s/it]

Done.
0.753730624380135





In [8]:
# Summary: mean of the per-fold F1 scores for each model, in config order.
for model_name in config_data['models']:
    fold_scores = model_scores[model_name]
    mean_score = np.mean(fold_scores)
    print(model_name, "score:", mean_score)

UWB-AIR/Czert-B-base-cased score: 0.7759894788824253
ufal/robeczech-base score: 0.7736152637058488
bert-base-multilingual-cased score: 0.7335294970781417
fav-kky/FERNET-C5 score: 0.7803352776299648
fav-kky/FERNET-News score: 0.5659761591194752
DeepPavlov/bert-base-bg-cs-pl-ru-cased score: 0.753730624380135
