In [35]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets import load_metric,load_dataset,Dataset

import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding,RobertaForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer
from corpy.morphodita import Tokenizer
from newspaper import Article

import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold
from tqdm.auto import tqdm, trange

import csv
import gc
import re

# Name of the pretrained checkpoint loaded by the tokenizer and the classifier.
model_checkpoint = 'ufal/robeczech-base'

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only let error-level messages through the transformers logger.
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Batch size used for both the training arguments and the evaluation dataloaders.
BATCH_SIZE = 32

In [36]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
        classes: Class names used as tick labels on both axes.
        normalize: When true, each row is divided by its row sum before
            printing and plotting, and cells are formatted with two decimals.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the heat map.

    The drawing goes to the current pyplot figure; nothing is returned.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so the number stays readable.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format), horizontalalignment="center", color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def clean_memory():
    """Run the Python garbage collector, then empty PyTorch's CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()
    
def compute_metrics(testing_dataloader, average='micro'):
    """Evaluate the notebook-level ``model`` on a dataloader and return F1 and accuracy.

    Args:
        testing_dataloader: Iterable of batches. Each batch is a dict of tensors
            that is moved to ``device`` and passed to the model as keyword
            arguments; it must contain a ``"labels"`` entry.
        average: Averaging mode forwarded to the F1 metric's ``compute``.
            Defaults to ``'micro'``, the value that used to be hard-coded.
            With one label per sample, micro-averaged F1 equals accuracy, so
            pass e.g. ``'binary'`` or ``'macro'`` to get a number that differs
            from the accuracy.

    Returns:
        A tuple ``(f1_result, accuracy_result)`` holding whatever the two
        metrics' ``compute`` calls return.

    Note:
        Reads the global ``model`` and ``device`` and puts the model into eval mode.
    """
    f1_metric = load_metric("f1")
    accuracy_metric = load_metric("accuracy")

    model.eval()
    for batch in testing_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Inference only: no gradients are needed for scoring.
        with torch.no_grad():
            outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)
        for metric in (f1_metric, accuracy_metric):
            metric.add_batch(predictions=predictions, references=batch["labels"])

    return (f1_metric.compute(average=average), accuracy_metric.compute())

## Data preprocessing

In [37]:
# Load the label file and keep only the text and the bias label.
data = pd.read_csv('../data/BABE/final_labels_SG2.csv', sep=';')[['text', 'label_bias']]

# Rows without annotator agreement are dropped. Their original row positions
# are remembered so the separately stored sentences can be filtered the same way.
has_agreement = data['label_bias'] != 'No agreement'
final_indices = data.index[has_agreement].tolist()

mapping = {'Non-biased':0, 'Biased':1}
# Non-inplace replace on the filtered frame: avoids mutating a slice in place
# and keeps the cell re-runnable.
data = data[has_agreement].replace({'label_bias':mapping})
data_en = data

In [38]:
# Read the sentences one per line. The encoding is given explicitly so the
# Czech diacritics are decoded the same way regardless of the platform locale.
with open('../data/BABE/texts_CS.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip('\n') for line in f]

# Drop lines that are empty after removing the newline.
sentences = [sentence for sentence in sentences if len(sentence) != 0]

In [39]:
# Keep only the sentences at the row positions that survived the 'No agreement' filter.
sentence_array = np.array(sentences)
sentences = sentence_array[final_indices]

In [40]:
# Labels are read from `data_en` (the label DataFrame) rather than from `data`,
# because this cell rebinds `data` to a Dataset; reading from `data` would make
# a second execution of the cell fail.
data = Dataset.from_dict({'sentence':sentences,'label':data_en['label_bias']})

## Training

In [41]:
# Shuffled, stratified 5-fold splitter; the fixed random_state keeps the folds reproducible.
skfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [42]:
# The slow tokenizer is requested on purpose: fast tokenizer is buggy in RoBERTa models.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)
# Load the classifier and move it to the selected device in one step.
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint).to(device)

In [43]:
def tokenize(data):
    """Run the notebook tokenizer on the ``'sentence'`` field of ``data`` with truncation enabled."""
    return tokenizer(data['sentence'], truncation=True)

In [44]:
# Tokenize in batches, then drop the raw text column, which is not needed any more.
tokenized_data = data.map(tokenize, batched=True).remove_columns(['sentence'])
# Have the dataset return torch tensors when indexed.
tokenized_data.set_format("torch")

# Collator built on the same tokenizer; used by the Trainer and the evaluation dataloaders.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [45]:
# Training configuration shared by the cross-validation runs and the full-data run.
training_args = TrainingArguments(
    output_dir='../',  # checkpoints end up in the parent directory
    learning_rate=5e-5,
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    logging_steps=25,  # training loss is logged every 25 steps
    save_total_limit=2,
    disable_tqdm=False,
)

### 5-fold CV

In [109]:
# Collects one metrics entry per cross-validation fold.
scores = list()

In [110]:
# StratifiedKFold only needs X for its length; the split itself is driven by
# the labels. A zero placeholder avoids materialising every ragged `input_ids`
# row just to count the samples.
fold_placeholder = np.zeros(len(tokenized_data))

for train_index, val_index in skfold.split(fold_placeholder, tokenized_data['label']):

    token_train = Dataset.from_dict(tokenized_data[train_index])
    token_valid = Dataset.from_dict(tokenized_data[val_index])

    # Every fold starts from the pretrained checkpoint so folds do not share weights.
    model = RobertaForSequenceClassification.from_pretrained(model_checkpoint)
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,
                      tokenizer=tokenizer)
    trainer.train()

    #evaluation (compute_metrics reads the global `model` that was just trained)
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    scores.append(compute_metrics(eval_dataloader))

    # Drop this fold's trainer before the next model is created; otherwise it
    # stays referenced while the next fold is being set up.
    del trainer
    clean_memory()


  return np.array(array, copy=False, **self.np_array_kwargs)
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
  return np.array(array, copy=False, **self.np_array_kwargs)
***** Running training *****
  Num examples = 2938
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 920


Step,Training Loss
25,0.6191
50,0.5518
75,0.5511
100,0.5159
125,0.4458
150,0.4321
175,0.3984
200,0.3528
225,0.3013
250,0.2801


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 7

Step,Training Loss
25,0.647
50,0.5591
75,0.5525
100,0.5469
125,0.4516
150,0.4509
175,0.4249
200,0.3812
225,0.2996
250,0.2955


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 7

Step,Training Loss
25,0.6425
50,0.5753
75,0.4929
100,0.4825
125,0.4187
150,0.4673
175,0.4245
200,0.3291
225,0.2986
250,0.2972


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 7

Step,Training Loss
25,0.6455
50,0.5526
75,0.5008
100,0.4956
125,0.4792
150,0.4012
175,0.4022
200,0.3624
225,0.2959
250,0.3052


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


  return np.array(array, copy=False, **self.np_array_kwargs)
loading configuration file https://huggingface.co/ufal/robeczech-base/resolve/main/config.json from cache at /home/horyctom/.cache/huggingface/transformers/967e55aeea0667ffcda38959128e06f755d387fa034ffb448cab0851f27c5104.ae62083e57028e6866dba352dfd4261396c2f0e8978f299e3a17c055c564de09
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 7

Step,Training Loss
25,0.643
50,0.5385
75,0.52
100,0.4967
125,0.4479
150,0.3861
175,0.4233
200,0.3111
225,0.3295
250,0.2828


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [111]:
#torch.save(model.state_dict(),'../cs_babe.pth')
# map_location remaps the stored tensors onto the current device, so weights
# saved from a GPU session can also be loaded on a CPU-only kernel.
model.load_state_dict(torch.load('../cs_babe.pth', map_location=device))
model.eval();


In [117]:
# Per-fold (F1, accuracy) results appended by the cross-validation loop.
scores

[({'f1': 0.782312925170068}, {'accuracy': 0.782312925170068}),
 ({'f1': 0.7945578231292517}, {'accuracy': 0.7945578231292517}),
 ({'f1': 0.7904761904761904}, {'accuracy': 0.7904761904761904}),
 ({'f1': 0.7656675749318801}, {'accuracy': 0.7656675749318801}),
 ({'f1': 0.7697547683923706}, {'accuracy': 0.7697547683923706})]

## Training on full data 

In [None]:
# Start again from the pretrained checkpoint and fine-tune on every labelled sentence.
model = RobertaForSequenceClassification.from_pretrained(model_checkpoint)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3673
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1150


Step,Training Loss
25,0.6406
50,0.5466
75,0.5167
100,0.5565
125,0.4797
150,0.4459
175,0.4054
200,0.4123
225,0.4411
250,0.3311


Saving model checkpoint to ../checkpoint-500
Configuration saved in ../checkpoint-500/config.json
Model weights saved in ../checkpoint-500/pytorch_model.bin
tokenizer config file saved in ../checkpoint-500/tokenizer_config.json
Special tokens file saved in ../checkpoint-500/special_tokens_map.json


In [8]:
# Persist the fine-tuned weights, then reload them. map_location keeps the
# reload working when this cell is later run on a different device than the
# one the weights were saved from.
torch.save(model.state_dict(),'../cs_babe.pth')
model.load_state_dict(torch.load('../cs_babe.pth', map_location=device))
model.eval();

# Inference experiments

In [5]:
def classify_sentence(sent:str) -> str:
    """Classify a single sentence as ``'biased'`` or ``'unbiased'``.

    The sentence is tokenized with the notebook-level ``tokenizer`` (truncation
    enabled), run through the global ``model`` in eval mode without gradients,
    and the index of the largest logit is mapped to a label.

    Args:
        sent: The sentence to classify.

    Returns:
        ``'unbiased'`` for class index 0, ``'biased'`` for class index 1.
    """
    # Use the returned encoding rather than relying on `.to` mutating in place.
    toksentence = tokenizer(sent,truncation=True,return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        output = model(**toksentence)

    # Softmax is monotonic, so the argmax of the raw logits selects the same
    # class; the extra normalisation is not needed.
    classification = output.logits.argmax(dim=1)

    return {0:'unbiased',1:'biased'}[classification[0].item()]

### Nice pairs

In [185]:
# Two versions of the same sentence; the second one only adds the word "guru".
guru_pair = (
    "Od posledního Štědrýho večera a následující noci na mě máma ani jednou nepromluvila.",
    "Od posledního Štědrýho večera a následující noci na mě guru máma ani jednou nepromluvila.",
)
for variant in guru_pair:
    print(classify_sentence(variant))

unbiased
biased


In [187]:
# The same statement with and without the leading attribution "Podle íránské vlády".
vincennes_pair = (
    'Podle íránské vlády bylo sestřelení IR 655 "Vincennes" úmyslně provedeným a nezákonným činem.',
    'Sestřelení IR 655 "Vincennes" bylo úmyslně provedeným a nezákonným činem.',
)
for variant in vincennes_pair:
    print(classify_sentence(variant))

unbiased
biased


### CW-hard biased data

In [190]:
# One sentence per line. The explicit encoding keeps the Czech text from
# depending on the platform's default locale.
with open("biased_cwhard_cs.txt","r",encoding="utf-8") as f:
    cw_hard_cs = f.read().splitlines()

In [191]:
# Pair every sentence with its predicted label; the result has one (sentence, label) row per sentence.
annotations = np.array([(sent, classify_sentence(sent)) for sent in cw_hard_cs])

In [195]:
stats = np.unique(annotations[:,1],return_counts=True)
# Look the count up by label instead of by position: np.unique returns only
# the labels that occur, so index 0 is 'biased' only if at least one sentence
# was classified as biased.
label_counts = dict(zip(stats[0].tolist(), stats[1]))
print("bias level: ",label_counts.get('biased', 0)/stats[1].sum()*100,"%")

bias level:  32.44709712425393 %


### Try on any article

In [163]:
# Download and parse the article text.
article = Article('https://nazory.aktualne.cz/komentare/jak-jsem-na-stedrej-den-zradila-nasi-antivax-familiji/r~93a757c460d211eca1070cc47ab5f122/')
article.download()
article.parse()

tokenizer_morphodita = Tokenizer("czech")

# Split the article into sentences, each a sequence of tokens. The list gets a
# descriptive name rather than `all`, which would shadow the builtin.
article_sentences = list(tokenizer_morphodita.tokenize(article.text, sents=True))

# Re-join the tokens with spaces and classify every sentence.
sentences = np.array([' '.join(x) for x in article_sentences])
annotations = np.array(list(zip(sentences,list(map(classify_sentence,sentences)))))
stats = np.unique(annotations[:,1],return_counts=True)

# Look the 'biased' count up by label: np.unique returns only the labels that
# occur, so position 0 is not guaranteed to be 'biased'.
label_counts = dict(zip(stats[0].tolist(), stats[1]))
print("bias level: ",label_counts.get('biased', 0)/stats[1].sum()*100,"%")

bias level:  52.32974910394266 %


In [165]:
# Show only the (sentence, label) rows that were classified as biased.
[row for row in annotations if row[1] == 'biased']

[array(['Od posledního Štědrýho večera a následující noci na mě guru máma ani jednou nepromluvila .',
        'biased'], dtype='<U588'),
 array(['" Zrádce národa , " vykřikla máma a bouchla dlaní do volantu , " zrádce zasranej " .',
        'biased'], dtype='<U588'),
 array(['Jo , abych nezapomněla , ono to sem patří , zrádcem národa je nějakej epidemiolog , jméno mi uteklo , kterej nabádá , ať se lidi očkujou .',
        'biased'], dtype='<U588'),
 array(['On je typickej podpantoflák , ale poslední dobou se taky maličko změnil a občas aspoň pípne , asi už je toho i na něj trochu moc .',
        'biased'], dtype='<U588'),
 array(['Jemu neodpověděla , ale otočila se na mě dozadu , přitom jela na úzký silnici přes osmdesát , a povídá : " Chápej , Mončo , " ježíšmarjá , jak já nenávidím to svý jméno , Monika , ale k úplnýmu šílenství mě dohání Monča , Mončičák a Mončičáček , vraždila bych kvůli tomu , " neříkej mi Mončo , prosím , " zaúpěla jsem a ona sadisticky , " chápej , Mončičáčku , 