In [1]:
# Making imports convenient
import sys
import os
# Project root: everything in the working directory that precedes the first
# '/notebooks' (the whole working directory when that marker is absent).
PATH = os.getcwd().partition('/notebooks')[0]
# Slot the root in right after sys.path[0] so `src.*` imports resolve.
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback
from sklearn.model_selection import ParameterGrid
from src.utils.myutils import *
import yaml
import json

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face Hub id of the checkpoint used for the tokenizer and the model.
model_name = 'ufal/robeczech-base'

# Both paths hang off PATH so the notebook is not tied to one user's home
# directory.
# NOTE(review): this assumes PATH is the repository root that the previous
# hard-coded '/home/horyctom/bias-detection-thesis' prefix pointed to - confirm.
CONFIG_PATH = PATH + '/src/utils/config.yaml'
WNC_MODEL_PATH = PATH + '/src/models/trained/wnc_cs_pretrained.pth'

# Single source of truth for the batch size; also fed to training_args below
# so the two values cannot drift apart.
BATCH_SIZE = 16

# Default fine-tuning arguments. Checkpoints and logs are written to the
# current working directory.
training_args = TrainingArguments(
    output_dir='./',
    logging_dir='./',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=0,
    save_total_limit=2,
    disable_tqdm=False,
)

# Only let transformers report errors (hides its info/warning chatter).
transformers.utils.logging.set_verbosity_error()

In [2]:
import logging

# Silence WARNING-and-below records from every library, but keep ERROR and
# CRITICAL visible: disabling at ERROR level would also hide genuine failures
# raised during data loading or training.
logging.disable(logging.WARNING)

In [3]:
# Read the three processed CSV corpora; each load is indexed by 'train' to
# get the dataset object itself.
data_babe = load_dataset(
    'csv', data_files=f'{PATH}/data/CS/processed/BABE/train.csv'
)['train']
data_cwnc = load_dataset(
    'csv', data_files=f'{PATH}/data/CS/processed/CWNC/train.csv'
)['train']
data_wnc = load_dataset(
    'csv', data_files=f'{PATH}/data/CS/processed/WNC/wnc.csv'
)['train']

In [4]:
# Project configuration.
# NOTE(review): yaml.FullLoader can build more than plain data types; if
# config.yaml only holds scalars, lists and mappings, yaml.safe_load(f) is the
# safer drop-in - confirm before switching.
with open(CONFIG_PATH) as config_file:
    config_data = yaml.load(config_file, Loader=yaml.FullLoader)

# Seeded 5-fold stratified splitter.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Padding is applied per batch by the collator, so no `padding` argument is
# given when loading the tokenizer (from_pretrained does not act on it).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run every corpus through the same preprocessing on its 'sentence' column.
babe_tok, cwnc_tok, wnc_tok = (
    preprocess_data(dataset, tokenizer, 'sentence')
    for dataset in (data_babe, data_cwnc, data_wnc)
)

In [5]:
# Hold out 5% of WNC for evaluation. The split is shuffled, so pin the seed to
# get the same train/test partition on every run (42 matches the seed used for
# the StratifiedKFold splitter).
# NOTE: this rebinds wnc_tok from a dataset to the {'train', 'test'} split
# result, so the cell must not be re-run without re-running the tokenisation.
wnc_tok = wnc_tok.train_test_split(test_size=0.05, seed=42)

In [6]:
# Arguments for the WNC pre-training run.
# NOTE(review): the log recorded for this run shows F1 falling to 0.0 and the
# loss settling near 0.693 from step 15000 on, which looks like a collapse to
# a single class - consider revisiting learning_rate / warmup_steps.
training_args_pretrain = TrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir='./',
    save_steps=10000,
    save_total_limit=10,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=4e-5,
    weight_decay=0.1,
    warmup_steps=0,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and log every 5000 steps; save_steps is a multiple of that
    # interval, which load_best_model_at_end relies on.
    evaluation_strategy="steps",
    eval_steps=5000,
    logging_steps=5000,
    disable_tqdm=False,
    # After training, restore the checkpoint with the best 'f1'.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

In [None]:
# Pre-train the classifier on WNC and store its weights for later cells.

# Seed before the model is built so the freshly initialised classification
# head is the same on every run.
transformers.set_seed(42)

# Create the target directory up front: failing at torch.save after a
# multi-hour run would throw the whole training away.
os.makedirs(os.path.dirname(WNC_MODEL_PATH) or '.', exist_ok=True)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Early stopping is intentionally not enabled; to turn it on, pass
# callbacks=[EarlyStoppingCallback(early_stopping_patience=3)].
trainer = Trainer(
    model,
    training_args_pretrain,
    train_dataset=wnc_tok['train'],
    eval_dataset=wnc_tok['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_eval,
)
trainer.train()

# Persist only the weights (state dict), not the whole model object.
torch.save(model.state_dict(), WNC_MODEL_PATH)

Step,Training Loss,Validation Loss,F1
5000,0.6564,0.667833,0.659358
10000,0.6886,0.693318,0.664263
15000,0.6939,0.695301,0.0
20000,0.6935,0.693137,0.0
25000,0.6934,0.693138,0.0
