In [10]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_metric,load_dataset,Dataset

import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer
from corpy.morphodita import Tokenizer
from newspaper import Article

import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold
from tqdm.auto import tqdm, trange

import csv
import gc
import re

# Hugging Face Hub id of the pretrained checkpoint used throughout the notebook.
model_checkpoint = 'ufal/robeczech-base'

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Restrict the transformers logger to error-level messages.
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Batch size used for both the training arguments and the evaluation dataloader.
BATCH_SIZE = 32

In [11]:
def clean_memory():
    """Run Python's garbage collector, then empty PyTorch's CUDA cache."""
    # The garbage collector runs first, the cache is emptied second.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()
    
def compute_metrics(testing_dataloader, model=None, average='micro'):
    """Evaluate a sequence classifier on a dataloader and return F1 and accuracy.

    Parameters
    ----------
    testing_dataloader : iterable of dict
        Yields batches that map argument names to tensors. Every batch must
        contain a ``"labels"`` entry; the whole batch is moved to ``device``
        and passed to the model as keyword arguments.
    model : optional
        Model to evaluate. When omitted, the notebook-level global ``model``
        is used, which is what this function always did before.
    average : str, default ``'micro'``
        Averaging mode forwarded to the F1 metric. Note that micro-averaged
        F1 on single-label classification is numerically equal to accuracy,
        so the default returns the same number twice; pass ``'binary'`` or
        ``'macro'`` to get an independent F1 figure.

    Returns
    -------
    tuple
        ``(f1_result, accuracy_result)`` exactly as returned by the two
        metric objects' ``compute`` calls.
    """
    if model is None:
        # Backward-compatible fallback to the global the notebook assigns.
        model = globals()['model']

    f1_metric = load_metric("f1")
    accuracy_metric = load_metric("accuracy")

    model.eval()
    for batch in testing_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)
        f1_metric.add_batch(predictions=predictions, references=batch["labels"])
        accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

    return (f1_metric.compute(average=average), accuracy_metric.compute())

## Data preprocessing

In [12]:
# Load the BABE annotation table; only the text and bias-label columns are kept.
raw_labels = pd.read_csv('../data/BABE/final_labels_SG2.csv',sep=';')

# Rows labelled 'No agreement' are excluded. The index labels of the rows that
# remain are stored in `final_indices`.
has_label = raw_labels['label_bias'] != 'No agreement'
final_indices = raw_labels.index[has_label].tolist()

# Encode the remaining labels as integers.
mapping = {'Non-biased':0, 'Biased':1}
data = raw_labels.loc[has_label, ['text','label_bias']].replace({'label_bias':mapping})
# Second name bound to the same frame.
data_en = data

In [14]:
# Read the Czech sentences, one per line. The encoding is pinned to UTF-8 so
# that decoding does not depend on the platform's default locale encoding.
with open('../data/BABE/texts_CS.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip('\n') for line in f]

# Discard empty lines.
# NOTE(review): the next cell selects sentences by position, which assumes any
# blank lines sit after the last real sentence — confirm against the file.
sentences = [sentence for sentence in sentences if sentence]

In [15]:
# Keep only the sentences at the row indices that passed the label filter.
# NOTE: this rebinds `sentences`, so re-running the cell applies the selection
# to the already filtered array.
sentence_array = np.array(sentences)
sentences = sentence_array[final_indices]

In [16]:
# Labels are read from `data_en`, which is bound to the same pandas frame as
# `data` when this cell first runs. Because this cell rebinds `data` to a
# Dataset, reading from `data_en` keeps the cell safe to re-run.
data = Dataset.from_dict({'sentence': sentences, 'label': data_en['label_bias']})

## Training

In [17]:
# Stratified 5-fold splitter with shuffling; random_state is fixed to 42.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [18]:
# The slow tokenizer is requested on purpose: the fast tokenizer is buggy in RoBERTa models.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)
# Load the sequence-classification model from the checkpoint and place it on `device`.
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint).to(device)

In [19]:
def tokenize(data):
    """Run the notebook tokenizer on ``data['sentence']`` with truncation enabled."""
    return tokenizer(data['sentence'], truncation=True)

In [20]:
# Tokenize in batches, drop the raw text column, then switch the output format to torch.
tokenized_data = data.map(tokenize, batched=True).remove_columns(['sentence'])
tokenized_data.set_format("torch")

# Padding collator built on the same tokenizer; used by the Trainer and the evaluation dataloader.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [21]:
# Training configuration reused unchanged for every cross-validation fold.
# NOTE(review): output_dir is the parent directory, so checkpoints are written
# to '../checkpoint-*' and the save_total_limit rotation deletes older
# checkpoint folders there (see the "Deleting older checkpoint" log line).
# Consider a dedicated output directory.
training_args = TrainingArguments(
    output_dir='../',
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,  
    logging_steps=25,
    disable_tqdm = False,
    save_total_limit=2,
    learning_rate=5e-5)

### 5-fold CV

In [22]:
# Collects one compute_metrics result tuple per cross-validation fold.
scores = []

In [23]:
# Stratified splitting depends only on the labels, so a zero placeholder of the
# right length stands in for the features. This avoids materialising every
# variable-length input_ids sequence just to obtain the fold indices.
fold_labels = tokenized_data['label']
fold_placeholder = np.zeros(len(tokenized_data))

for train_index, val_index in skfold.split(fold_placeholder, fold_labels):

    token_train = Dataset.from_dict(tokenized_data[train_index])
    token_valid = Dataset.from_dict(tokenized_data[val_index])

    # Seed before the model is created so that the newly initialised
    # classification head is reproducible for every fold, including the first.
    transformers.set_seed(training_args.seed)
    # `model` is deliberately the notebook-level name: compute_metrics reads it.
    model = RobertaForSequenceClassification.from_pretrained(model_checkpoint)
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,
                      tokenizer=tokenizer)
    trainer.train()

    #evaluation
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(eval_dataloader))

    # Drop this fold's trainer and clear cached memory before the next fold.
    # The last fold's `model` stays bound for the cells below.
    del trainer
    clean_memory()


  return np.array(array, copy=False, **self.np_array_kwargs)
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
  return np.array(array, copy=False, **self.np_array_kwargs)
***** Running training *****
  Num examples = 2938
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 920


Step,Training Loss
25,0.6511
50,0.5185
75,0.5815
100,0.4881
125,0.4173
150,0.4592
175,0.3883
200,0.383
225,0.2925
250,0.2838


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json
Deleting older checkpoint [../checkpoint-1000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": fal

Step,Training Loss
25,0.647
50,0.5591
75,0.5525
100,0.5469
125,0.4516
150,0.4509
175,0.4249
200,0.3812
225,0.2996
250,0.2955


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 7

Step,Training Loss
25,0.6425
50,0.5753
75,0.4929
100,0.4825
125,0.4187
150,0.4673
175,0.4245
200,0.3291
225,0.2986
250,0.2972


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 7

Step,Training Loss
25,0.6455
50,0.5526
75,0.5008
100,0.4956
125,0.4792
150,0.4012
175,0.4022
200,0.3624
225,0.2959
250,0.3052


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 7

Step,Training Loss
25,0.643
50,0.5385
75,0.52
100,0.4967
125,0.4479
150,0.3861
175,0.4233
200,0.3111
225,0.3295
250,0.2828


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [25]:
#torch.save(model.state_dict(),'../cs_babe.pth')
# Overwrite the current model's weights with the saved fine-tuned weights.
# map_location remaps the stored tensors onto the device selected for this
# session, so a file saved from a GPU run also loads on a CPU-only machine.
model.load_state_dict(torch.load('../cs_babe.pth', map_location=device))
model.eval();


In [26]:
# Display the per-fold (f1, accuracy) results gathered by the cross-validation loop.
scores

[({'f1': 0.7673469387755102}, {'accuracy': 0.7673469387755102}),
 ({'f1': 0.7945578231292517}, {'accuracy': 0.7945578231292517}),
 ({'f1': 0.7904761904761904}, {'accuracy': 0.7904761904761904}),
 ({'f1': 0.7656675749318801}, {'accuracy': 0.7656675749318801}),
 ({'f1': 0.7697547683923706}, {'accuracy': 0.7697547683923706})]

### Inference experiments

In [27]:
def classify_sentence(sent:str):
    """Label one sentence as 'biased' or 'unbiased' using the global model.

    The sentence is tokenized with truncation, moved to ``device`` and passed
    through ``model`` in eval mode with gradients disabled. Class index 0 maps
    to 'unbiased' and class index 1 to 'biased'.
    """
    label_names = {0:'unbiased',1:'biased'}

    encoded = tokenizer(sent,truncation=True,return_tensors="pt")
    # The return value is not used, as in the original call; this relies on
    # .to() updating the encoding in place.
    encoded.to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = F.softmax(logits,dim=1)
    predicted_class = probabilities.argmax(dim=1)[0].item()
    return label_names[predicted_class]

### Try on any article

In [88]:
article = Article('https://www.seznamzpravy.cz/clanek/prectete-si-babisuv-projev-udelal-jsem-chybu-kterou-nechci-opakovat-120741?fbclid=IwAR2BpbwVrEVZphQWtEejuBSL7kXa2VHm7jcHlPWZ4NkcuedYR3ZoXHmx_3o')
article.download()
article.parse()

tokenizer_morphodita = Tokenizer("czech")

# With sents=True the tokenizer yields one sequence of tokens per sentence;
# each is re-joined with single spaces. No temporary list named `all` is used,
# so the builtin `all` is no longer shadowed.
sentences = np.array([' '.join(tokens)
                      for tokens in tokenizer_morphodita.tokenize(article.text, sents=True)])

# One (sentence, predicted label) row per sentence. The explicit dtype and
# reshape keep the array two-dimensional even when the article is empty.
annotations = np.array(list(zip(sentences, map(classify_sentence, sentences))), dtype=str).reshape(-1, 2)
stats = np.unique(annotations[:,1],return_counts=True)

# Count 'biased' rows by name. Taking the first entry of `stats` is wrong when
# no sentence is biased: 'unbiased' is then the only label and would be
# reported as a 100 % bias level.
biased_share = np.mean(annotations[:,1] == 'biased') * 100 if len(annotations) else 0.0
print("bias level: ",biased_share,"%")

bias level:  21.951219512195124 %


In [89]:
# Display (unique predicted labels, number of sentences per label).
stats

(array(['biased', 'unbiased'], dtype='<U218'), array([18, 64]))

In [91]:
# Show only the (sentence, label) rows whose label is 'biased'.
[row for row in annotations if row[1] == 'biased']

[array(['Budu k vám jako vždycky upřímný a otevřený .', 'biased'],
       dtype='<U218'),
 array(['Čísla nákazy závratně rostou .', 'biased'], dtype='<U218'),
 array(['Tehdy jsme těch opatření měli všichni až po krk .', 'biased'],
       dtype='<U218'),
 array(['Měli jsme jen minimum mrtvých , což nás tehdy možná ukolébalo v představě , že to s tím virem není tak zlé .',
        'biased'], dtype='<U218'),
 array(['Od března jedeme sedm dní v týdnu a bojujeme s touto ojedinělou situací , na kterou nemohl být nikdo připraven .',
        'biased'], dtype='<U218'),
 array(['Jsou tu lidé , kteří nosí roušky a podporují všechna opatření , a pak jsou tu lidé , kteří považují virus za obyčejnou chřipku .',
        'biased'], dtype='<U218'),
 array(['Teď už vážně nejde o politiku .', 'biased'], dtype='<U218'),
 array(['Nikdo nechce nikoho z plezíru omezovat .', 'biased'],
       dtype='<U218'),
 array(['Ano , je to otravné a ne moc příjemné , ale zachraňuje to naše životy .',
        'biased'],