In [31]:
# Making imports convenient
import sys
import os
# Treat everything before the '/notebooks' segment of the working directory as
# the project root and put it on sys.path so `src.*` modules can be imported.
PATH = os.getcwd().partition('/notebooks')[0]
sys.path.insert(1, PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_metric,load_dataset,Dataset

import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer


import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold
from tqdm.auto import tqdm, trange

import csv
import gc
import random
import logging

from src.utils.myutils import clean_memory,compute_metrics,preprocess_data

import warnings

# Checkpoint identifier passed to every tokenizer/model `from_pretrained` call.
model_name = 'roberta-base'

# Suppress log records at ERROR severity and below for the whole session.
logging.disable(logging.ERROR)

# `np.warnings` was never a public NumPy API and is absent from recent releases
# (1.24+), so go through the stdlib `warnings` module instead. The warning class
# itself lives in `numpy.exceptions` on NumPy 2, hence the fallback lookup.
_visible_deprecation = getattr(np, 'VisibleDeprecationWarning', None)
if _visible_deprecation is None:
    from numpy.exceptions import VisibleDeprecationWarning as _visible_deprecation
warnings.filterwarnings('ignore', category=_visible_deprecation)
warnings.filterwarnings("ignore", category=UserWarning)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

BATCH_SIZE = 64

In [15]:
# Read the BABE CSV (exposed by the loader under the 'train' key), hold out 15%
# of the rows as a test split with a fixed seed, and display the result.
data = (
    load_dataset('csv', data_files=PATH+"/data/EN/processed/BABE/babe_sg2.csv")['train']
    .train_test_split(test_size=0.15, seed=42)
)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3122
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 551
    })
})

In [16]:
def compute_metrics_eval(eval_preds):
    """Compute the macro-averaged F1 score for a Trainer evaluation pass.

    Args:
        eval_preds: A pair ``(logits, labels)``. The predicted class of each
            example is the arg-max over the last axis of ``logits``.

    Returns:
        The result of the ``"f1"`` metric's ``compute`` call with
        ``average='macro'``.
    """
    # Load the metric once and memoise it on the function object; previously it
    # was reloaded from scratch on every evaluation pass.
    metric = getattr(compute_metrics_eval, "_f1_metric", None)
    if metric is None:
        metric = load_metric("f1")
        compute_metrics_eval._f1_metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(average='macro', predictions=predictions, references=labels)

In [17]:
# Give the two partitions of the split dataset their own names.
data_train, data_test = data['train'], data['test']

In [18]:
# Batch size consumed by the TrainingArguments and DataLoader cells that follow.
BATCH_SIZE = 32

In [13]:
# Trainer configuration for the single-epoch run that evaluates during training.
training_args = TrainingArguments(
    output_dir='./',
    # optimisation
    num_train_epochs=1,
    learning_rate=1e-5,
    weight_decay=0.2,
    warmup_steps=0,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # evaluate, log and checkpoint on the same 2000-step cadence
    evaluation_strategy="steps",
    eval_steps=2000,
    logging_steps=2000,
    save_steps=2000,
    save_total_limit=5,
    # reload the checkpoint with the best 'f1' once training finishes
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    disable_tqdm=False,
)

In [14]:
# --- Prepare data -------------------------------------------------------------
# File locations are built from PATH (the project root) rather than a
# user-specific absolute path, the same way the BABE CSV is located.
data_wnc = load_dataset('csv', data_files=PATH + '/data/EN/processed/WNC/wnc.csv')['train']
# Fixed seed so the 90/10 split is identical on every run.
data_wnc = data_wnc.train_test_split(0.1, seed=42)

# NOTE(review): `padding=True` is passed to `from_pretrained` here; padding is
# also handled by the collator below — confirm whether the kwarg has any effect.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# The WNC text lives in the 'sentence' column.
train = preprocess_data(data_wnc['train'], tokenizer, 'sentence')
test = preprocess_data(data_wnc['test'], tokenizer, 'sentence')

# --- Train --------------------------------------------------------------------
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
trainer = Trainer(
    model,
    training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_eval,
)

trainer.train()

# Persist only the weights; the cross-validation cell loads them back into a
# freshly constructed model.
torch.save(model.state_dict(), PATH + '/src/models/trained/wncen_pretrained.pth')

  0%|          | 0/327 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

Step,Training Loss,Validation Loss,F1
2000,0.6127,0.56678,0.693577
4000,0.5667,0.560621,0.696465
6000,0.5629,0.549666,0.707594
8000,0.5568,0.543451,0.71134
10000,0.5517,0.54231,0.713062


In [21]:
# Load the tokenizer and a sequence-classification model for `model_name`, and
# place the model on the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name).to(device)

In [22]:
# Collator that pads each batch using the current tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [33]:
# Trainer configuration for the three-epoch runs; no evaluation is scheduled
# here, only training-loss logging every 50 steps.
training_args = TrainingArguments(
    output_dir='../',
    # optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.1,
    warmup_steps=0,
    per_device_train_batch_size=BATCH_SIZE,
    # logging / checkpoints
    logging_steps=50,
    save_total_limit=2,
    disable_tqdm=False,
)

In [26]:
# Run both BABE partitions through the same preprocessing, reading the 'text' column.
tokenized_train, tokenized_test = (
    preprocess_data(split, tokenizer, 'text') for split in (data_train, data_test)
)

In [27]:
# Per-fold F1 scores collected by the cross-validation loop.
f1_scores = []

# One constant seeds every RNG in play (Python, NumPy, PyTorch CPU and CUDA).
# The original called `random.seed` twice and repeated the literal five times.
SEED = 2018
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Request deterministic cuDNN kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [29]:
# 5-fold stratified splitter; shuffling is seeded so fold membership is repeatable.
skfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [34]:
# Start from an empty list so re-running this cell never mixes scores from an
# earlier run into the results.
f1_scores = []

# The pretrained weights are the same for every fold, so read them from disk
# once. The path is built from PATH (project root) instead of a user-specific
# absolute path, and map_location='cpu' lets a checkpoint saved on a GPU load
# on a CPU-only machine.
pretrained_state = torch.load(
    PATH + '/src/models/trained/wncen_pretrained.pth',
    map_location='cpu',
)

for train_index, val_index in skfold.split(data_train['text'], data_train['label']):
    # Build this fold's training and validation subsets from the tokenized data.
    token_train = Dataset.from_dict(tokenized_train[train_index])
    token_valid = Dataset.from_dict(tokenized_train[val_index])

    # Fresh model per fold, initialised from the pretrained weights.
    # `model_name` replaces `model_checkpoint`, which is not defined anywhere in
    # the visible notebook and raised NameError on a fresh kernel.
    model = RobertaForSequenceClassification.from_pretrained(model_name)
    model.load_state_dict(pretrained_state)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    # Score the fine-tuned model on the held-out fold and keep its F1.
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    f1_scores.append(compute_metrics(model, device, eval_dataloader)['f1'])


Step,Training Loss
50,0.4339
100,0.272


Step,Training Loss
50,0.4672
100,0.2878


Step,Training Loss
50,0.4434
100,0.2845


Step,Training Loss
50,0.4259
100,0.2917


Step,Training Loss
50,0.4311
100,0.2674
