In [1]:
# Making imports convenient
import sys
import os
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback

from src.utils.myutils import *
import yaml

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

CS_DATA_PATH = PATH + '/data/CS/processed/BABE/train.csv'
CONFIG_PATH = PATH + '/src/utils/config.yaml'

BATCH_SIZE = 64
transformers.utils.logging.set_verbosity_error()

### BABE train_test split (SKIP IF DONE)

In [3]:
babe = load_dataset("csv", data_files=PATH + '/data/CS/raw/BABE/SG2.csv')['train']

Using custom data configuration default-41acc90be2294f89
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-41acc90be2294f89/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [4]:
babe = babe.train_test_split(0.15,seed=42)

Loading cached split indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-41acc90be2294f89/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-2e89d302da86ff73.arrow and /home/horyctom/.cache/huggingface/datasets/csv/default-41acc90be2294f89/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-7a7ea8491428011d.arrow


In [5]:
babe['train'].to_csv(PATH + '/data/CS/processed/BABE/train.csv',index=False)
babe['test'].to_csv(PATH + '/data/CS/processed/BABE/test.csv',index=False) #THIS IS FOR THE FINAL MODEL SELECTED,TUNED

125614

## Load data

In [2]:
data = load_dataset('csv',data_files = CS_DATA_PATH)['train']
data

Using custom data configuration default-f28f8af5b44ab214
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-f28f8af5b44ab214/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


Dataset({
    features: ['sentence', 'label'],
    num_rows: 3122
})

In [3]:
with open(CONFIG_PATH) as f:
    config_data = yaml.load(f, Loader=yaml.FullLoader)

## Training

In [4]:
def compute_metrics_eval(eval_preds):
    metric = load_metric("f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [5]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
training_args = TrainingArguments(
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,  
    per_device_eval_batch_size=BATCH_SIZE,
    eval_steps=20,
    logging_steps=20,
    disable_tqdm = False,
    warmup_steps=0,
    save_total_limit=5,
    evaluation_strategy="steps",
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
    weight_decay=0.1,
    output_dir = './',
    learning_rate=4e-5)

### Cross-Val all models

In [None]:
model_scores = {}

for model_name in config_data['models']:
    model_name='ufal/robeczech-base'
    scores = []
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    token_full = preprocess_data(data,tokenizer,'sentence')

    print("Running 5-fold CV on model: ",model_name,"...")
    for train_index, val_index in skfold.split(token_full['input_ids'],token_full['label']):

        token_train = Dataset.from_dict(token_full[train_index])
        token_valid = Dataset.from_dict(token_full[val_index])


        token_train = token_train.train_test_split(0.05)
        train = token_train['train']
        val = token_train['test']
        
        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
        model.to(device)
        trainer = Trainer(model,training_args,train_dataset=train,data_collator=data_collator,tokenizer=tokenizer,eval_dataset=val,
                          compute_metrics=compute_metrics_eval,callbacks = [EarlyStoppingCallback(early_stopping_patience=3)])
        trainer.train()

        #evaluation
        eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
        scores.append(compute_metrics(model,device,eval_dataloader)['f1'])
        
    print("Done.")
    model_scores[model_name] = scores
    break


Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-f28f8af5b44ab214/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-74b152a0ca0f6626.arrow
  return np.array(array, copy=False, **self.np_array_kwargs)
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
  return np.array(array, copy=False, **self.np_array_kwargs)


Running 5-fold CV on model:  ufal/robeczech-base ...


***** Running training *****
  Num examples = 2372
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 380


Step,Training Loss,Validation Loss


In [31]:
for model in config_data['models']:
    print(model,"F1 score:",np.mean(model_scores[model]))

UWB-AIR/Czert-B-base-cased F1 score: 0.7408641025641025
ufal/robeczech-base F1 score: 0.7751410256410256
bert-base-multilingual-cased F1 score: 0.7389446153846153
fav-kky/FERNET-C5 F1 score: 0.7620046153846154
fav-kky/FERNET-News F1 score: 0.7181794871794871
DeepPavlov/bert-base-bg-cs-pl-ru-cased F1 score: 0.7604071794871794


In [36]:
token = token_full.train_test_split(0.1)
token_train = token['train']
token_val = token['test']

In [38]:
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
model.to(device)
trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer,eval_dataset=token_val,
                          compute_metrics=compute_metrics_eval,callbacks = [EarlyStoppingCallback(early_stopping_patience=4)])
trainer.train()

loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 51961
}

loading weights file https://huggingface.co/ufal/robeczech-base/resolve/m

Step,Training Loss,Validation Loss,F1
20,0.6791,0.62199,0.688963
40,0.5882,0.540674,0.712329
60,0.5234,0.558336,0.679688
80,0.4693,0.531114,0.669291
100,0.3894,0.534705,0.729323
120,0.3629,0.578966,0.713805
140,0.2952,0.695441,0.6639
160,0.2375,0.638551,0.696629
180,0.2288,0.662417,0.738562
200,0.1424,0.714533,0.725275


***** Running Evaluation *****
  Num examples = 313
  Batch size = 64
Saving model checkpoint to ./checkpoint-20
Configuration saved in ./checkpoint-20/config.json
Model weights saved in ./checkpoint-20/pytorch_model.bin
tokenizer config file saved in ./checkpoint-20/tokenizer_config.json
Special tokens file saved in ./checkpoint-20/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 313
  Batch size = 64
Saving model checkpoint to ./checkpoint-40
Configuration saved in ./checkpoint-40/config.json
Model weights saved in ./checkpoint-40/pytorch_model.bin
tokenizer config file saved in ./checkpoint-40/tokenizer_config.json
Special tokens file saved in ./checkpoint-40/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 313
  Batch size = 64
Saving model checkpoint to ./checkpoint-60
Configuration saved in ./checkpoint-60/config.json
Model weights saved in ./checkpoint-60/pytorch_model.bin
tokenizer config file saved in ./checkpoint-60/tokenizer_conf

TrainOutput(global_step=400, training_loss=0.23554600685834884, metrics={'train_runtime': 256.676, 'train_samples_per_second': 109.438, 'train_steps_per_second': 1.714, 'total_flos': 1184856915605040.0, 'train_loss': 0.23554600685834884, 'epoch': 9.09})

In [39]:
token_full = preprocess_data(babe['test'],tokenizer,'sentence')


  0%|          | 0/1 [00:00<?, ?ba/s]

In [41]:
eval_dataloader = DataLoader(token_full, batch_size=BATCH_SIZE, collate_fn=data_collator)
compute_metrics(model,device,eval_dataloader)['f1']

0.8003629764065335