In [12]:
# Making imports inside the project convenient
import sys
import os
# Project root = everything in the working directory that precedes the first
# '/notebooks' component. Separators are normalised to '/' before splitting so
# the marker is also found where the native separator differs (no-op on POSIX).
PATH = os.getcwd().replace(os.sep, '/').split('/notebooks')[0]
# Put the project root on the import path so that `src...` modules resolve.
# Guarded so that re-running this cell does not stack duplicate entries.
if PATH not in sys.path:
    sys.path.insert(1, PATH)

In [14]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,TrainingArguments,Trainer

from src.utils.myutils import *
import yaml
from tqdm import tqdm
import logging
import warnings

# numpy is imported explicitly instead of relying on the wildcard import above
# to leak `np` into the notebook namespace.
import numpy as np

# Keep the training output readable: drop log records at ERROR level and below.
logging.disable(logging.ERROR)
# `np.warnings` was only an incidental alias of the stdlib `warnings` module and
# no longer exists in NumPy >= 1.24, so the filter is registered through
# `warnings` directly. The warning class lives in `np.exceptions` on recent
# NumPy releases and at the package top level on older ones.
_np_exceptions = getattr(np, "exceptions", np)
warnings.filterwarnings('ignore', category=_np_exceptions.VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Locations derived from the project root (PATH) resolved in the first cell.
CS_DATA_PATH = PATH + '/data/CS/processed/'
CONFIG_PATH = PATH + '/src/utils/config.yaml'

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Used both as the per-device training batch size and for the evaluation DataLoader.
BATCH_SIZE = 64

In [15]:
# Both corpora are read with the `datasets` CSV builder; the 'train' split of
# each result is kept.
data = load_dataset('csv', data_files=CS_DATA_PATH + 'BABE/train.csv')['train']
data_cwnc = load_dataset('csv', data_files=CS_DATA_PATH + 'CWNC/cwnc.csv')['train']

# Explicit encoding: the default used by open() depends on the machine's locale.
with open(CONFIG_PATH, encoding='utf-8') as f:
    # NOTE(review): FullLoader can construct more than plain YAML data;
    # yaml.safe_load would be enough if the config holds only scalars, lists
    # and mappings — confirm before switching.
    config_data = yaml.load(f, Loader=yaml.FullLoader)

# Fail early with a readable message: the experiment cells below all iterate
# over config_data['models'].
assert isinstance(config_data, dict) and 'models' in config_data, (
    f"{CONFIG_PATH} must define a top-level 'models' entry"
)

## BASELINE on BABE

Evaluate all Czech models on the BABE dataset using the same hyperparameters.
  - UWB-AIR/Czert-B-base-cased
  - ufal/robeczech-base
  - bert-base-multilingual-cased
  - fav-kky/FERNET-C5
  - fav-kky/FERNET-News
  - DeepPavlov/bert-base-bg-cs-pl-ru-cased

In [8]:
# Splitter shared by the experiments below: ten stratified folds, shuffled
# with a fixed seed.
skfold = StratifiedKFold(
    shuffle=True,
    random_state=42,
    n_splits=10,
)

In [9]:
# Fine-tuning hyperparameters shared by every run (same args as in the original paper).
_training_settings = dict(
    output_dir='./',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    weight_decay=0.1,
    logging_steps=50,       # training loss is reported every 50 optimisation steps
    save_total_limit=2,
    disable_tqdm=False,
)
training_args = TrainingArguments(**_training_settings)

In [None]:
# Cross-validated F1 of every configured model on the BABE data.
# model_scores maps a model name to the list of its per-fold F1 scores.
model_scores = {}

for model_name in tqdm(config_data['models']):
    scores = []
    # NOTE(review): `padding=True` does not look like a tokenizer-loading option;
    # batches are padded by the collator created below — confirm and drop it.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # presumably tokenizes the 'sentence' column — see src.utils.myutils
    token_full = preprocess_data(data, tokenizer, 'sentence')

    # The stratified splitter builds its folds from the labels; X only has to
    # have the right length, so a zero-filled placeholder replaces the ragged
    # token-id lists (which would otherwise all be materialised just for this).
    labels = token_full['label']
    placeholder_features = np.zeros(len(labels))

    print("Running 10-fold CV on model: ", model_name, "...")
    for train_index, val_index in skfold.split(placeholder_features, labels):

        token_train = Dataset.from_dict(token_full[train_index])
        token_valid = Dataset.from_dict(token_full[val_index])

        # Every fold starts again from the pretrained checkpoint.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        model.to(device)
        trainer = Trainer(
            model,
            training_args,
            train_dataset=token_train,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        trainer.train()

        # evaluation
        # Training leaves the model in train mode; switch to eval so dropout is
        # off while scoring. Harmless if compute_metrics already does this.
        model.eval()
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model, device, eval_dataloader)['f1'])

        # Drop the references before the next fold loads a new model, so two
        # models (plus optimizer state) are never resident on the GPU at once.
        del trainer, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("Done.")
    print(np.mean(scores))
    model_scores[model_name] = scores

In [8]:
# Mean F1 across folds for each model, listed in config order.
for model_name in config_data['models']:
    fold_scores = model_scores[model_name]
    mean_f1 = np.mean(fold_scores)
    print(model_name, "score:", mean_f1)

UWB-AIR/Czert-B-base-cased score: 0.7759894788824253
ufal/robeczech-base score: 0.7736152637058488
bert-base-multilingual-cased score: 0.7335294970781417
fav-kky/FERNET-C5 score: 0.7803352776299648
fav-kky/FERNET-News score: 0.5659761591194752
DeepPavlov/bert-base-bg-cs-pl-ru-cased score: 0.753730624380135


## BASELINE on CWNC

In [10]:
# Cross-validated F1 of every configured model on the CWNC data.
# NOTE: this rebinds `model_scores`, so the per-fold results of the BABE run
# in the previous section are no longer reachable after this cell executes.
model_scores = {}

for model_name in tqdm(config_data['models']):
    scores = []
    # NOTE(review): `padding=True` does not look like a tokenizer-loading option;
    # batches are padded by the collator created below — confirm and drop it.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # presumably tokenizes the 'sentence' column — see src.utils.myutils
    token_full = preprocess_data(data_cwnc, tokenizer, 'sentence')

    # The stratified splitter builds its folds from the labels; X only has to
    # have the right length, so a zero-filled placeholder replaces the ragged
    # token-id lists (which would otherwise all be materialised just for this).
    labels = token_full['label']
    placeholder_features = np.zeros(len(labels))

    print("Running 10-fold CV on model: ", model_name, "...")
    for train_index, val_index in skfold.split(placeholder_features, labels):

        token_train = Dataset.from_dict(token_full[train_index])
        token_valid = Dataset.from_dict(token_full[val_index])

        # Every fold starts again from the pretrained checkpoint.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        model.to(device)
        trainer = Trainer(
            model,
            training_args,
            train_dataset=token_train,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        trainer.train()

        # evaluation
        # Training leaves the model in train mode; switch to eval so dropout is
        # off while scoring. Harmless if compute_metrics already does this.
        model.eval()
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model, device, eval_dataloader)['f1'])

        # Drop the references before the next fold loads a new model, so two
        # models (plus optimizer state) are never resident on the GPU at once.
        del trainer, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("Done.")
    print(np.mean(scores))
    model_scores[model_name] = scores

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

Running 10-fold CV on model:  UWB-AIR/Czert-B-base-cased ...


Step,Training Loss
50,0.6008
100,0.4115
150,0.386
200,0.3546


Step,Training Loss
50,0.5483
100,0.423
150,0.3783
200,0.3294


Step,Training Loss
50,0.5252
100,0.4133
150,0.379
200,0.3299


Step,Training Loss
50,0.5337
100,0.4195
150,0.385
200,0.3432


Step,Training Loss
50,0.567
100,0.4155
150,0.3696
200,0.3588


Step,Training Loss
50,0.6028
100,0.4245
150,0.4039
200,0.3556


Step,Training Loss
50,0.5724
100,0.4245
150,0.3824
200,0.3499


Step,Training Loss
50,0.5773
100,0.41
150,0.3843
200,0.333


Step,Training Loss
50,0.5842
100,0.4391
150,0.3956
200,0.3436


Step,Training Loss
50,0.5605
100,0.4297
150,0.3798
200,0.3512


 17%|█▋        | 1/6 [15:57<1:19:49, 957.81s/it]

Done.
0.731938680930234


  0%|          | 0/6 [00:00<?, ?ba/s]

Running 10-fold CV on model:  ufal/robeczech-base ...


Step,Training Loss
50,0.6955
100,0.6967
150,0.6948
200,0.6934


Step,Training Loss
50,0.6743
100,0.4579
150,0.4028
200,0.3584


Step,Training Loss
50,0.6974
100,0.6945
150,0.6964
200,0.694


Step,Training Loss
50,0.6288
100,0.4537
150,0.4159
200,0.3733


Step,Training Loss
50,0.6808
100,0.4558
150,0.4097
200,0.3963


Step,Training Loss
50,0.6962
100,0.4925
150,0.4289
200,0.3893


Step,Training Loss
50,0.6365
100,0.4595
150,0.4278
200,0.3867


Step,Training Loss
50,0.6668
100,0.4521
150,0.4145
200,0.3646


Step,Training Loss
50,0.695
100,0.52
150,0.4313
200,0.3924


Step,Training Loss
50,0.6974
100,0.571
150,0.4446
200,0.413


 33%|███▎      | 2/6 [31:50<1:03:38, 954.66s/it]

Done.
0.7094267047656141


  0%|          | 0/6 [00:00<?, ?ba/s]

Running 10-fold CV on model:  bert-base-multilingual-cased ...


Step,Training Loss
50,0.5629
100,0.431
150,0.3935
200,0.3835


Step,Training Loss
50,0.7031
100,0.5957
150,0.4711
200,0.4136


Step,Training Loss
50,0.5677
100,0.4287
150,0.4068
200,0.364


Step,Training Loss
50,0.5257
100,0.4641
150,0.4275
200,0.3844


Step,Training Loss
50,0.5643
100,0.4315
150,0.393
200,0.387


Step,Training Loss
50,0.5338
100,0.4483
150,0.4303
200,0.3898


Step,Training Loss
50,0.5808
100,0.4438
150,0.4206
200,0.3774


Step,Training Loss
50,0.5672
100,0.4468
150,0.4391
200,0.3877


Step,Training Loss
50,0.6022
100,0.4532
150,0.4261
200,0.3778


Step,Training Loss
50,0.5759
100,0.4477
150,0.3995
200,0.3716


 50%|█████     | 3/6 [50:18<51:14, 1024.68s/it] 

Done.
0.7344790587363879


  0%|          | 0/6 [00:00<?, ?ba/s]

Running 10-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.6904
100,0.4488
150,0.4055
200,0.3712


Step,Training Loss
50,0.579
100,0.423
150,0.3792
200,0.3346


Step,Training Loss
50,0.5416
100,0.4061
150,0.3919
200,0.3389


Step,Training Loss
50,0.6329
100,0.4376
150,0.4183
200,0.3637


Step,Training Loss
50,0.5664
100,0.4144
150,0.3823
200,0.351


Step,Training Loss
50,0.7117
100,0.6978
150,0.6884
200,0.4741


Step,Training Loss
50,0.6713
100,0.4599
150,0.414
200,0.3701


Step,Training Loss
50,0.6521
100,0.4343
150,0.4227
200,0.363


Step,Training Loss
50,0.5836
100,0.4376
150,0.4037
200,0.3501


Step,Training Loss
50,0.7014
100,0.4806
150,0.4045
200,0.3726


 67%|██████▋   | 4/6 [1:04:47<32:06, 963.31s/it]

Done.
0.7469339088937158


  0%|          | 0/6 [00:00<?, ?ba/s]

Running 10-fold CV on model:  fav-kky/FERNET-News ...


Step,Training Loss
50,0.7146
100,0.7034
150,0.7027
200,0.6982


Step,Training Loss
50,0.7123
100,0.7017
150,0.7011
200,0.6997


Step,Training Loss
50,0.7174
100,0.7089
150,0.7036
200,0.7007


Step,Training Loss
50,0.7236
100,0.701
150,0.7074
200,0.7029


Step,Training Loss
50,0.7293
100,0.7072
150,0.6494
200,0.4855


Step,Training Loss
50,0.7165
100,0.7039
150,0.7036
200,0.6991


Step,Training Loss
50,0.7168
100,0.7202
150,0.6746
200,0.5044


Step,Training Loss
50,0.723
100,0.7098
150,0.7017
200,0.701


Step,Training Loss
50,0.7134
100,0.7059
150,0.7018
200,0.7003


Step,Training Loss
50,0.7137
100,0.7153
150,0.7015
200,0.6991


 83%|████████▎ | 5/6 [1:21:20<16:14, 974.11s/it]

Done.
0.44298580909104607


  0%|          | 0/6 [00:00<?, ?ba/s]

Running 10-fold CV on model:  DeepPavlov/bert-base-bg-cs-pl-ru-cased ...


Step,Training Loss
50,0.5356
100,0.4111
150,0.3865
200,0.354


Step,Training Loss
50,0.5352
100,0.4341
150,0.3854
200,0.3332


Step,Training Loss
50,0.5519
100,0.4509
150,0.4205
200,0.3722


Step,Training Loss
50,0.5562
100,0.4471
150,0.4006
200,0.3458


Step,Training Loss
50,0.5556
100,0.4345
150,0.3947
200,0.377


Step,Training Loss
50,0.536
100,0.4437
150,0.4073
200,0.3621


Step,Training Loss
50,0.5405
100,0.4421
150,0.411
200,0.3601


Step,Training Loss
50,0.5551
100,0.4167
150,0.3866
200,0.343


Step,Training Loss
50,0.5476
100,0.4322
150,0.3971
200,0.3498


Step,Training Loss
50,0.5149
100,0.4234
150,0.3665
200,0.3504


100%|██████████| 6/6 [1:36:12<00:00, 962.10s/it]

Done.
0.7413957174636348





In [11]:
# Mean F1 across folds for each model, listed in config order.
for model_name in config_data['models']:
    per_fold_f1 = model_scores[model_name]
    average_f1 = np.mean(per_fold_f1)
    print(model_name, "score:", average_f1)

UWB-AIR/Czert-B-base-cased score: 0.731938680930234
ufal/robeczech-base score: 0.7094267047656141
bert-base-multilingual-cased score: 0.7344790587363879
fav-kky/FERNET-C5 score: 0.7469339088937158
fav-kky/FERNET-News score: 0.44298580909104607
DeepPavlov/bert-base-bg-cs-pl-ru-cased score: 0.7413957174636348
