In [74]:
# Making imports convenient
import sys
import os
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1,PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_metric,load_dataset,Dataset

import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer


import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold
from tqdm.auto import tqdm, trange

import csv
import gc

from src.utils.myutils import clean_memory,compute_metrics,preprocess_data

model_checkpoint = 'roberta-base'

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
transformers.utils.logging.set_verbosity_error()
BATCH_SIZE = 16

## Data preprocessing

In [33]:
data = load_dataset('csv',data_files=PATH+"/data/EN/processed/babe_sg2.csv")['train']

Using custom data configuration default-57fcf4cd5490a3c0
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-57fcf4cd5490a3c0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


## Training

In [34]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [35]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint);
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint);
model.to(device);

In [36]:
training_args = TrainingArguments(
    output_dir='../',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    warmup_steps=50,  
    logging_steps=50,
    disable_tqdm = False,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.05)

In [37]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [39]:
tokenized_data = preprocess_data(data,tokenizer,'text')

Loading cached processed dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-57fcf4cd5490a3c0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-addea159e382c769.arrow


### 5-fold CV

In [48]:
f1_scores = []

In [75]:
for train_index, val_index in skfold.split(data['text'],data['label']):
    
    token_train = Dataset.from_dict(tokenized_data[train_index])
    token_valid = Dataset.from_dict(tokenized_data[val_index])
    
    model = RobertaForSequenceClassification.from_pretrained(model_checkpoint);
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,
                      tokenizer=tokenizer)
    trainer.train()
    
    #evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    f1_scores.append(compute_metrics(model,device,eval_dataloader)['f1'])


  return np.array(array, copy=False, **self.np_array_kwargs)
***** Running training *****
  Num examples = 2938
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 552


Step,Training Loss
50,0.648
100,0.5323
150,0.5846
200,0.5403


KeyboardInterrupt: 

In [52]:
f1_scores

[0.8204081632653062,
 0.8394557823129252,
 0.8326530612244899,
 0.8024523160762943,
 0.8147138964577657]

### Inferrence experiments

In [59]:
sentence = 'Orange Is the New Black" star Yael Stone is renouncing her U.S. green card to return to her native Australia in order to fight climate change.'
#sentence = 'This might be biased but mustache suits you.'
toksentence = tokenizer(sentence,truncation=True,return_tensors="pt")
model.eval()
with torch.no_grad():
    toksentence.to(device)
    output = model(**toksentence)

In [60]:
classification = F.softmax(output.logits,dim=1).argmax(dim=1)
print(sentence,': ',{0:'unbiased',1:'biased'}[classification[0].item()])

Orange Is the New Black" star Yael Stone is renouncing her U.S. green card to return to her native Australia in order to fight climate change. :  unbiased
