In [17]:
# Making imports convenient
import sys
import os
import warnings
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback

from src.utils.myutils import *
import numpy as np
import yaml
from tqdm import tqdm
import logging
import json

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Processed BABE training split that the cross-validation cells load, and the
# YAML file whose 'models' entry lists the checkpoints to compare.
CS_DATA_PATH = PATH + '/data/CS/processed/BABE/train.csv'
CONFIG_PATH = PATH + '/src/utils/config.yaml'

# Shared by the Trainer (per-device train batch) and the validation DataLoader.
BATCH_SIZE = 64

# Suppress all logging calls of severity ERROR and below, process-wide.
logging.disable(logging.ERROR)

# `np.warnings` was only an accidental re-export of the stdlib `warnings`
# module and no longer exists in recent NumPy releases, so call the stdlib
# module directly. The warning class lives in `np.exceptions` on newer NumPy
# and directly on `np` on older versions; look it up in whichever is present.
_visible_deprecation = getattr(np, "exceptions", np).VisibleDeprecationWarning
warnings.filterwarnings('ignore', category=_visible_deprecation)

### BABE train/test split (skip if the processed CSVs already exist)

In [2]:
# Raw BABE annotations (SG2 file); loading a single CSV puts every row in the
# 'train' split, which is the one selected here.
raw_babe_csv = PATH + '/data/CS/raw/BABE/SG2.csv'
babe = load_dataset("csv", data_files=raw_babe_csv)['train']

Using custom data configuration default-41acc90be2294f89
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-41acc90be2294f89/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [3]:
# Hold out 15% of the rows as the test split; the fixed seed keeps the split
# reproducible. `babe` is rebound from the loaded dataset to the split result,
# so re-run the loading cell before running this cell a second time.
babe = babe.train_test_split(test_size=0.15, seed=42)

Loading cached split indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-41acc90be2294f89/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-2e89d302da86ff73.arrow and /home/horyctom/.cache/huggingface/datasets/csv/default-41acc90be2294f89/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-7a7ea8491428011d.arrow


In [4]:
# Write both splits next to each other in the processed-data directory.
# The train file is written to CS_DATA_PATH so that the path this cell produces
# and the path the loading cell reads can never drift apart.
processed_dir = os.path.dirname(CS_DATA_PATH)
os.makedirs(processed_dir, exist_ok=True)  # a fresh checkout may lack the folder

babe['train'].to_csv(CS_DATA_PATH, index=False)
# The test split is reserved for the final, selected and tuned model only.
babe['test'].to_csv(os.path.join(processed_dir, 'test.csv'), index=False)

125614

## Load data

In [8]:
# Load the processed training CSV; its rows are exposed as the 'train' split.
cs_splits = load_dataset('csv', data_files=CS_DATA_PATH)
data = cs_splits['train']
data

Dataset({
    features: ['sentence', 'label'],
    num_rows: 3122
})

In [9]:
# Load the experiment configuration; later cells iterate over
# config_data['models'] to decide which checkpoints to evaluate.
# NOTE(review): yaml.FullLoader is acceptable for this project-local file;
# switch to yaml.safe_load if the path could ever point at untrusted input.
with open(CONFIG_PATH, encoding='utf-8') as f:  # explicit encoding: do not depend on the locale
    config_data = yaml.load(f, Loader=yaml.FullLoader)

# Fail here, next to the cause, instead of with an opaque error in a later
# cell (an empty YAML file parses to None).
if not isinstance(config_data, dict) or not config_data.get('models'):
    raise ValueError(f"{CONFIG_PATH} must define a non-empty 'models' list")

## Training

In [11]:
# Stratified 5-fold splitter; shuffling with a fixed random_state keeps the
# fold assignment the same on every run.
skfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [14]:
# One set of fine-tuning hyper-parameters, reused for every model and fold.
# No evaluation options are set here: validation is done by hand in the
# cross-validation loop.
training_args = TrainingArguments(
    output_dir="./",
    # optimisation
    learning_rate=5e-5,
    weight_decay=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    # reporting / checkpoint housekeeping
    logging_steps=50,
    save_total_limit=2,
    disable_tqdm=False,
)

### Cross-validate all models

In [15]:
# Maps each model name to a list holding one validation F1 score per fold.
model_scores = {}

for model_name in tqdm(config_data['models']):
    scores = []
    # NOTE(review): `padding` is normally an argument of the tokenizer call,
    # not of from_pretrained — confirm it has any effect here.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Tokenise the whole training set once per model; folds are sliced from it.
    # preprocess_data comes from src.utils.myutils (not shown here).
    token_full = preprocess_data(data,tokenizer,'sentence')

    print("Running 5-fold CV on model: ",model_name,"...")
    for train_index, val_index in skfold.split(token_full['input_ids'],token_full['label']):

        # Drop the references to the previous fold's model and trainer before
        # a new model is moved to the device, so two fine-tuned networks do not
        # have to be resident at the same time (best effort).
        model = trainer = None
        torch.cuda.empty_cache()

        token_train = Dataset.from_dict(token_full[train_index])
        token_valid = Dataset.from_dict(token_full[val_index])

        # Re-seed before every fold so each fresh model is created from the
        # same random state.
        torch.cuda.manual_seed(12345)
        torch.manual_seed(12345)
        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
        model.to(device)
        trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
        trainer.train()

        # Evaluation on the held-out fold. Training leaves the network in
        # training mode; switch to inference mode so dropout is disabled while
        # scoring. compute_metrics (src.utils.myutils, not shown) may already
        # do this — the call is harmless either way.
        model.eval()
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model,device,eval_dataloader)['f1'])

    print("Done.")
    model_scores[model_name] = scores
    

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Running 5-fold CV on model:  UWB-AIR/Czert-B-base-cased ...


  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


Step,Training Loss
50,0.5517
100,0.3018


Step,Training Loss
50,0.5207
100,0.2729


Step,Training Loss
50,0.5245
100,0.3217


Step,Training Loss
50,0.5038
100,0.3075


Step,Training Loss
50,0.5171
100,0.3313


 17%|█▋        | 1/6 [03:52<19:24, 232.88s/it]

Done.
Running 5-fold CV on model:  ufal/robeczech-base ...


Step,Training Loss
50,0.5858
100,0.4239


Step,Training Loss
50,0.5827
100,0.4485


Step,Training Loss
50,0.5503
100,0.403


Step,Training Loss
50,0.5539
100,0.4188


Step,Training Loss
50,0.5864
100,0.4325


 33%|███▎      | 2/6 [07:32<15:01, 225.35s/it]

Done.


  0%|          | 0/4 [00:00<?, ?ba/s]

Running 5-fold CV on model:  bert-base-multilingual-cased ...


Step,Training Loss
50,0.6121
100,0.4036


Step,Training Loss
50,0.6083
100,0.4002


Step,Training Loss
50,0.6152
100,0.4352


Step,Training Loss
50,0.5444
100,0.3862


Step,Training Loss
50,0.5767
100,0.3982


 50%|█████     | 3/6 [12:32<12:58, 259.41s/it]

Done.
Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.5392
100,0.297


Step,Training Loss
50,0.5331
100,0.3379


Step,Training Loss
50,0.5418
100,0.3622


Step,Training Loss
50,0.5004
100,0.3045


Step,Training Loss
50,0.5252
100,0.3334


 67%|██████▋   | 4/6 [16:01<07:58, 239.50s/it]

Done.


  0%|          | 0/4 [00:00<?, ?ba/s]

Running 5-fold CV on model:  fav-kky/FERNET-News ...


Step,Training Loss
50,0.6859
100,0.5531


Step,Training Loss
50,0.6953
100,0.702


Step,Training Loss
50,0.6962
100,0.4962


Step,Training Loss
50,0.7066
100,0.7059


Step,Training Loss
50,0.7013
100,0.6128


 83%|████████▎ | 5/6 [19:49<03:55, 235.22s/it]

Done.


  0%|          | 0/4 [00:00<?, ?ba/s]

Running 5-fold CV on model:  DeepPavlov/bert-base-bg-cs-pl-ru-cased ...


Step,Training Loss
50,0.5588
100,0.3552


Step,Training Loss
50,0.5575
100,0.3663


Step,Training Loss
50,0.5448
100,0.3487


Step,Training Loss
50,0.5385
100,0.3733


Step,Training Loss
50,0.5728
100,0.3777


100%|██████████| 6/6 [23:40<00:00, 236.75s/it]

Done.





NameError: name 'json' is not defined

In [18]:
# Persist the per-fold F1 scores of every model as a single JSON object.
with open("./results.txt", mode='w') as results_file:
    serialized_scores = json.dumps(model_scores)
    results_file.write(serialized_scores)

In [19]:
# Display the raw per-fold F1 scores collected for each model.
model_scores

{'UWB-AIR/Czert-B-base-cased': [0.7888235764452205,
  0.7767676741342496,
  0.7722861842105263,
  0.7781076066790352,
  0.7684601113172541],
 'ufal/robeczech-base': [0.7978364002300173,
  0.7719786519899097,
  0.7855365333677886,
  0.7808604038630378,
  0.7719086893398819],
 'bert-base-multilingual-cased': [0.7480762779521559,
  0.7258954083327797,
  0.7074542897327708,
  0.743197248098232,
  0.7467054489044018],
 'fav-kky/FERNET-C5': [0.7691692143545972,
  0.7937502760823076,
  0.763499245852187,
  0.772749227077721,
  0.7599696399389702],
 'fav-kky/FERNET-News': [0.7581269349845201,
  0.32867883995703545,
  0.7278717381479104,
  0.3375796178343949,
  0.7371119719684748],
 'DeepPavlov/bert-base-bg-cs-pl-ru-cased': [0.779151373839658,
  0.7447483611535803,
  0.743183120604324,
  0.7608931019141205,
  0.7497975771341061]}

In [20]:
# Mean F1 over the cross-validation folds, one line per model in config order.
# The loop variable is deliberately not called `model`, so it does not
# overwrite the fine-tuned network that the cross-validation cell leaves bound
# to `model`.
for name in config_data['models']:
    print(name,"F1 score:",np.mean(model_scores[name]))

UWB-AIR/Czert-B-base-cased F1 score: 0.7768890305572571
ufal/robeczech-base F1 score: 0.7816241357581271
bert-base-multilingual-cased F1 score: 0.7342657346040682
fav-kky/FERNET-C5 F1 score: 0.7718275206611566
fav-kky/FERNET-News F1 score: 0.5778738205784671
DeepPavlov/bert-base-bg-cs-pl-ru-cased F1 score: 0.7555547069291577
