In [1]:
# Making imports convenient
import sys
import os
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1,PATH)

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_metric,load_dataset,Dataset

import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer


import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold
from tqdm.auto import tqdm, trange

import csv
import gc

from src.utils.myutils import clean_memory,compute_metrics,preprocess_data

model_checkpoint = 'roberta-base'

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
transformers.utils.logging.set_verbosity_error()
BATCH_SIZE = 16

## Data preprocessing

In [2]:
data = load_dataset('csv',data_files=PATH+"/data/EN/processed/BABE/babe_sg2.csv")['train']

Using custom data configuration default-9aed32b2774fea6c
Reusing dataset csv (/home/horyctom/.cache/huggingface/datasets/csv/default-9aed32b2774fea6c/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [3]:
data = data.train_test_split(0.2)
data

Loading cached split indices for dataset at /home/horyctom/.cache/huggingface/datasets/csv/default-9aed32b2774fea6c/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-e14ec89e5ba972e8.arrow and /home/horyctom/.cache/huggingface/datasets/csv/default-9aed32b2774fea6c/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-46b43f2365d207ce.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2938
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 735
    })
})

In [8]:
data_train = data['train']
data_test = data['test']

## Training

In [9]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint);
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint);
model.to(device);

In [11]:
training_args = TrainingArguments(
    output_dir='../',
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    warmup_steps=50,  
    logging_steps=50,
    disable_tqdm = False,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.05)

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
tokenized_train = preprocess_data(data_train,tokenizer,'text')
tokenized_test = preprocess_data(data_test,tokenizer,'text')

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### 5-fold CV

In [14]:
f1_scores = []

In [16]:
for train_index, val_index in skfold.split(data_train['text'],data_train['label']):
    
    token_train = Dataset.from_dict(tokenized_train[train_index])
    token_valid = Dataset.from_dict(tokenized_train[val_index])
    
    model = RobertaForSequenceClassification.from_pretrained(model_checkpoint);
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,
                      tokenizer=tokenizer)
    trainer.train()
    
    #evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    f1_scores.append(compute_metrics(model,device,eval_dataloader)['f1'])


  return np.array(array, copy=False, **self.np_array_kwargs)
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
***** Running training *****
  Num examples = 2350
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 441


Step,Training Loss
50,0.6536
100,0.5733
150,0.4935
200,0.3675
250,0.3766
300,0.3399
350,0.2307
400,0.2037




Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "

Step,Training Loss
50,0.6531
100,0.5524
150,0.5206
200,0.4254
250,0.4095
300,0.3584
350,0.2417
400,0.3003




Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "

Step,Training Loss
50,0.6374
100,0.5083
150,0.5039
200,0.3482
250,0.3761
300,0.2952
350,0.1827
400,0.2549




Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "

Step,Training Loss
50,0.6608
100,0.5703
150,0.5238
200,0.4376
250,0.4364
300,0.4035
350,0.2699
400,0.238




Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "

Step,Training Loss
50,0.6642
100,0.5808
150,0.5392
200,0.4527
250,0.4129
300,0.3832
350,0.2557
400,0.2579




Training completed. Do not forget to share your model on huggingface.co/models =)




In [19]:
np.mean(f1_scores)

0.8110993289990611

In [20]:
test_dataloader = DataLoader(tokenized_test, batch_size=BATCH_SIZE, collate_fn=data_collator)

In [25]:
print(data['train']['text'][0])

Like many Americans also crippled by student loans, Brewster’s been making sacrifices since she graduated: She put off her wedding for two years and once she and her husband, Ryan, finally got married in November 2018, they decided they couldn’t afford a honeymoon.


In [28]:
compute_metrics(model,device,test_dataloader)['f1']

0.8149659863945579

### Inferrence experiments

In [59]:
sentence = 'Orange Is the New Black" star Yael Stone is renouncing her U.S. green card to return to her native Australia in order to fight climate change.'
#sentence = 'This might be biased but mustache suits you.'
toksentence = tokenizer(sentence,truncation=True,return_tensors="pt")
model.eval()
with torch.no_grad():
    toksentence.to(device)
    output = model(**toksentence)

In [60]:
classification = F.softmax(output.logits,dim=1).argmax(dim=1)
print(sentence,': ',{0:'unbiased',1:'biased'}[classification[0].item()])

Orange Is the New Black" star Yael Stone is renouncing her U.S. green card to return to her native Australia in order to fight climate change. :  unbiased
