In [2]:
# Put the project root (the path in front of '/notebooks') on sys.path so that `src.*` imports resolve
import gc
import os
import sys
import warnings
# Everything in front of the first '/notebooks' in the working directory is taken
# as the project root; without that marker the working directory is used unchanged.
PATH = os.getcwd().partition('/notebooks')[0]
# Inserting at index 1 leaves the existing first sys.path entry in front.
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
import transformers
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback

from src.utils.myutils import *
import yaml
from tqdm import tqdm
import json
import logging

# --- Notebook-wide configuration ---------------------------------------------

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# (Assigned once; the original cell repeated this assignment further down.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Processed CWNC training split and the YAML config that lists the models to evaluate.
CS_DATA_PATH = PATH + '/data/CS/processed/CWNC/train.csv'
CONFIG_PATH = PATH + '/src/utils/config.yaml'

BATCH_SIZE = 64

# Silence every log record of severity ERROR and below, for all loggers.
logging.disable(logging.ERROR)

# `np` is not imported explicitly in this notebook -- presumably it is re-exported
# by the star import from src.utils.myutils (TODO confirm).
# `np.warnings` was only an alias of the stdlib `warnings` module and no longer
# exists in NumPy >= 1.24, so the stdlib module is used directly.
# NumPy 2.x exposes VisibleDeprecationWarning only under `np.exceptions`.
_visible_deprecation = getattr(np, "VisibleDeprecationWarning", None)
if _visible_deprecation is None:
    _visible_deprecation = np.exceptions.VisibleDeprecationWarning
warnings.filterwarnings('ignore', category=_visible_deprecation)

### CWNC train/test split (skip if the processed train/test CSVs already exist)

In [2]:
# Read the raw CWNC CSV; the loaded rows are taken from the 'train' key of the result.
cwnc_raw_csv = PATH + '/data/CS/raw/CWNC/cwnc.csv'
cwnc = load_dataset("csv", data_files=cwnc_raw_csv)['train']

Using custom data configuration default-e006c2795c52a104


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/horyctom/.cache/huggingface/datasets/csv/default-e006c2795c52a104/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/horyctom/.cache/huggingface/datasets/csv/default-e006c2795c52a104/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


In [3]:
# Hold out 15% of the rows as the test partition; the fixed seed keeps the split
# reproducible. The result is indexed by 'train' / 'test' when it is written out.
# NOTE(review): `cwnc` is rebound here, so re-running this cell presumably requires
# re-running the loading cell first -- confirm.
cwnc = cwnc.train_test_split(test_size=0.15, seed=42)

In [4]:
# Persist both partitions as CSV (index=False is passed through to the writer).
train_part = cwnc['train']
train_part.to_csv(PATH + '/data/CS/processed/CWNC/train.csv', index=False)

# The test file is reserved for the final model, once it has been selected and tuned.
held_out_part = cwnc['test']
held_out_part.to_csv(PATH + '/data/CS/processed/CWNC/test.csv', index=False)

147467

## Load data

In [3]:
# Load the processed CWNC training CSV and keep its 'train' entry as `data`.
cs_train_csv = load_dataset('csv', data_files=CS_DATA_PATH)
data = cs_train_csv['train']
data

Dataset({
    features: ['sentence', 'label'],
    num_rows: 4900
})

In [4]:
# Parse the YAML configuration; its 'models' entry is iterated during cross-validation.
# NOTE(review): the full loader is kept to preserve behaviour. yaml.safe_load would
# be enough if the config holds only plain scalars/lists/dicts -- confirm.
with open(CONFIG_PATH) as config_file:
    config_data = yaml.full_load(config_file)

## Training

In [5]:
# Stratified 5-fold splitter; shuffling with a fixed random_state keeps the folds
# the same on every run.
skfold = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [6]:
# Shared fine-tuning settings, reused unchanged for every model and every fold.
training_args = TrainingArguments(
    # Output location (current directory) and checkpoint retention.
    output_dir='./',
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    weight_decay=0.1,
    warmup_steps=10,
    # Progress reporting: log the training loss every 50 steps, keep progress bars on.
    logging_steps=50,
    disable_tqdm=False,
)

### Cross-validate all models

In [7]:
# Cross-validate every model listed in the config on the training data.
# model_scores maps each model name to a list holding one F1 score per fold.
model_scores = {}

# Pre-bind the names so the per-fold clean-up below is valid on the very first fold.
model = trainer = None

for model_name in tqdm(config_data['models']):
    scores = []
    # `padding` is an option of the tokenizer *call*, not of loading, so it is not
    # passed here; batches are padded by the collator created on the next line.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Tokenise the 'sentence' column once per model; every fold indexes into this.
    token_full = preprocess_data(data, tokenizer, 'sentence')

    # The fold count is read from the splitter so the message cannot drift from it.
    print(f"Running {skfold.get_n_splits()}-fold CV on model: ", model_name, "...")
    for train_index, val_index in skfold.split(token_full['input_ids'], token_full['label']):

        # Drop the previous fold's model and trainer *before* loading the next model,
        # so two models are never resident at the same time. Rebinding to None
        # (instead of `del`) keeps the last fold's objects available after the loop.
        model = trainer = None
        gc.collect()
        torch.cuda.empty_cache()

        token_train = Dataset.from_dict(token_full[train_index])
        token_valid = Dataset.from_dict(token_full[val_index])

        # Re-seed before each model is instantiated so every fold starts from the
        # same random state.
        torch.cuda.manual_seed(12345)
        torch.manual_seed(12345)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        model.to(device)
        trainer = Trainer(
            model,
            training_args,
            train_dataset=token_train,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        trainer.train()

        # Evaluation on the held-out fold. Training leaves the model in train mode;
        # switch to eval mode so dropout is off while scoring (harmless if
        # compute_metrics already does this).
        model.eval()
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model, device, eval_dataloader)['f1'])

    print("Done.")
    model_scores[model_name] = scores


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Running 5-fold CV on model:  UWB-AIR/Czert-B-base-cased ...


  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


Step,Training Loss
50,0.5814
100,0.3987
150,0.3563


Step,Training Loss
50,0.6073
100,0.4102
150,0.3577


Step,Training Loss
50,0.5991
100,0.4324
150,0.3724


Step,Training Loss
50,0.5626
100,0.4017
150,0.343


Step,Training Loss
50,0.5578
100,0.3871
150,0.3529


 17%|█▋        | 1/6 [06:04<30:22, 364.48s/it]

Done.
Running 5-fold CV on model:  ufal/robeczech-base ...


Step,Training Loss
50,0.6975
100,0.5441
150,0.425


Step,Training Loss
50,0.6905
100,0.5109
150,0.4221


Step,Training Loss
50,0.6496
100,0.4335
150,0.3807


Step,Training Loss
50,0.6829
100,0.4831
150,0.4082


Step,Training Loss
50,0.6723
100,0.443
150,0.4077


 33%|███▎      | 2/6 [12:07<24:14, 363.56s/it]

Done.


  0%|          | 0/5 [00:00<?, ?ba/s]

Running 5-fold CV on model:  bert-base-multilingual-cased ...


Step,Training Loss
50,0.6184
100,0.4339
150,0.4085


Step,Training Loss
50,0.6154
100,0.4454
150,0.4058


Step,Training Loss
50,0.5553
100,0.4256
150,0.389


Step,Training Loss
50,0.654
100,0.4632
150,0.3968


Step,Training Loss
50,0.5738
100,0.4143
150,0.3874


 50%|█████     | 3/6 [19:21<19:47, 395.79s/it]

Done.


  0%|          | 0/5 [00:00<?, ?ba/s]

Running 5-fold CV on model:  fav-kky/FERNET-C5 ...


Step,Training Loss
50,0.7073
100,0.7025
150,0.7014


Step,Training Loss
50,0.5769
100,0.4046
150,0.3518


Step,Training Loss
50,0.7037
100,0.5681
150,0.4534


Step,Training Loss
50,0.7089
100,0.6062
150,0.4032


Step,Training Loss
50,0.7131
100,0.6988
150,0.5559


 67%|██████▋   | 4/6 [24:57<12:24, 372.09s/it]

Done.


  0%|          | 0/5 [00:00<?, ?ba/s]

Running 5-fold CV on model:  fav-kky/FERNET-News ...


Step,Training Loss
50,0.71
100,0.7021
150,0.5316


Step,Training Loss
50,0.6543
100,0.4386
150,0.3841


Step,Training Loss
50,0.714
100,0.7077
150,0.6939


Step,Training Loss
50,0.714
100,0.6154
150,0.5092


Step,Training Loss
50,0.7119
100,0.7029
150,0.7024


 83%|████████▎ | 5/6 [31:23<06:17, 377.18s/it]

Done.


  0%|          | 0/5 [00:00<?, ?ba/s]

Running 5-fold CV on model:  DeepPavlov/bert-base-bg-cs-pl-ru-cased ...


Step,Training Loss
50,0.586
100,0.4205
150,0.381


Step,Training Loss
50,0.5775
100,0.4043
150,0.3543


Step,Training Loss
50,0.5652
100,0.3996
150,0.3528


Step,Training Loss
50,0.6103
100,0.4201
150,0.3757


Step,Training Loss
50,0.6067
100,0.3988
150,0.373


100%|██████████| 6/6 [37:07<00:00, 371.21s/it]

Done.





In [8]:
# Show the per-fold F1 scores collected for every model by the cross-validation loop.
model_scores

{'UWB-AIR/Czert-B-base-cased': [0.7407774593441749,
  0.7479274252587008,
  0.7485176855448783,
  0.7154626082524618,
  0.7233654049470908],
 'ufal/robeczech-base': [0.7844189016602809,
  0.7733551685772505,
  0.7530241935483871,
  0.7561669154759367,
  0.7591746462799522],
 'bert-base-multilingual-cased': [0.7457529108384777,
  0.7579513088825178,
  0.7305841924398625,
  0.7027241426398738,
  0.7149829408815285],
 'fav-kky/FERNET-C5': [0.6633387010198605,
  0.7418042048196409,
  0.7025482666244376,
  0.7435504469987229,
  0.7427028288519562],
 'fav-kky/FERNET-News': [0.7292058338051637,
  0.7619412515964241,
  0.4012138317508526,
  0.7357441471571906,
  0.3346911065852003],
 'DeepPavlov/bert-base-bg-cs-pl-ru-cased': [0.7325416666666666,
  0.747956821752807,
  0.7295284713724061,
  0.7252846103317421,
  0.7253589284393424]}

In [9]:
# Report the mean F1 over the folds for every model listed in the config.
# The loop variable is deliberately not named `model`: that name holds the trained
# model object from the cross-validation cell and must not be replaced by a string.
for model_name in config_data['models']:
    fold_scores = model_scores[model_name]
    print(model_name, "F1 score:", np.mean(fold_scores))

UWB-AIR/Czert-B-base-cased F1 score: 0.7352101166694613
ufal/robeczech-base F1 score: 0.7652279651083616
bert-base-multilingual-cased F1 score: 0.730399099136452
fav-kky/FERNET-C5 F1 score: 0.7187888896629235
fav-kky/FERNET-News F1 score: 0.5925592341789663
DeepPavlov/bert-base-bg-cs-pl-ru-cased F1 score: 0.7321340997125928
