In [1]:
# Making imports convenient
import sys
import os
# Project root = the part of the working directory that precedes the first
# '/notebooks' component (the whole working directory when that component is
# absent). It is put on sys.path so the `src.` imports below resolve.
PATH=os.getcwd().split('/notebooks')[0]
sys.path.insert(1, PATH)

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset,concatenate_datasets
import transformers
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModelForSequenceClassification,AdamW,get_scheduler,TrainingArguments,Trainer,EarlyStoppingCallback
from sklearn.model_selection import ParameterGrid
from src.utils.myutils import *
import yaml
import json
import logging
from tqdm import tqdm

import warnings

# Suppress every log record of severity ERROR and below for this notebook.
logging.disable(logging.ERROR)

# `np.warnings` was only an accidental alias of the stdlib `warnings` module and
# is gone since NumPy 1.24, so the stdlib module is used directly. The warning
# class lives in `np.exceptions` on NumPy >= 1.25 and in the top-level namespace
# on older releases; look it up wherever it exists.
warnings.filterwarnings('ignore', category=getattr(np, "exceptions", np).VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# --- Model identifier and file locations -------------------------------------
# NOTE(review): WNC_MODEL_PATH is an absolute, machine-specific path; it is not
# read by any cell visible in this notebook.
CONFIG_PATH = PATH + '/src/utils/config.yaml'
WNC_MODEL_PATH = '/home/horyctom/bias-detection-thesis/src/models/trained/wnc_larger_cs_pretrained.pth'
model_name = 'ufal/robeczech-base'

# --- Fine-tuning hyper-parameters shared by the cross-validation cells --------
BATCH_SIZE = 16
training_args = TrainingArguments(
    # where the Trainer writes checkpoints and logs
    output_dir='./',
    logging_dir='./',
    save_total_limit=2,
    disable_tqdm=False,
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=0,
)

# --- Splitter: five shuffled, stratified folds with a fixed seed --------------
skfold = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

In [2]:
def _load_train_split(relative_csv_path):
    """Read the CSV at PATH + relative_csv_path and return its 'train' split."""
    return load_dataset('csv', data_files=PATH + relative_csv_path)['train']


# One dataset per corpus; each CSV is exposed by `load_dataset` under 'train'.
babe = _load_train_split('/data/CS/processed/BABE/train.csv')
subj = _load_train_split('/data/CS/raw/SUBJ/subj.csv')
basil = _load_train_split('/data/CS/raw/BASIL/basil.csv')

In [3]:
# Checkpoint name used for the tokenizer here and for the models built in later cells.
model_name = 'ufal/robeczech-base'
# Slow (non-fast) tokenizer for that checkpoint; the collator built from it is
# what the DataLoaders and Trainers below use as their collate function.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False,padding=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): `resample` is not defined in this notebook (presumably provided
# by the star import of src.utils.myutils). Because `basil` is rebound to the
# result, re-running this cell resamples already-resampled data.
basil = resample(basil)

# `preprocess_data` is another helper from outside this notebook; presumably it
# tokenizes the 'sentence' column -- TODO confirm. Later cells read
# 'input_ids', 'attention_mask' and 'label' from its output.
babe_tok = preprocess_data(babe,tokenizer,'sentence')
unlabelled_tok = preprocess_data(basil,tokenizer,'sentence')

In [63]:
# Self-training with 5-fold cross-validation.
# In every fold the classifier is retrained from the pretrained checkpoint, the
# k most confident unlabelled sentences of each class are pseudo-labelled and
# moved into the training set, and the procedure repeats until a stopping rule
# fires. The validation F1 of the last trained model is recorded per fold.
k=100                   # pseudo-labelled sentences added per class in each round
MAX_TRAIN_SIZE = 3500   # stop once the training set has grown beyond this size
MIN_CONFIDENCE = 0.85   # the k-th best prediction of each class must reach this probability
scores=[]
for train_index, val_index in skfold.split(babe_tok['input_ids'],babe_tok['label']):

    #split for this whole selftraining iteration
    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])
    eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
    #every fold starts again from the complete unlabelled pool
    unlabelled_tok = preprocess_data(basil,tokenizer,'sentence')

    #self training
    while True:
        print("Fitting on ", len(token_train), " data")

        #retrain from the pretrained checkpoint on the current training set
        torch.cuda.manual_seed(12345)
        torch.manual_seed(12345)
        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
        trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
        trainer.train()
        #switch to inference mode now so that it also holds for the evaluation
        #after an early `break`
        model.eval()

        #stopping rules that need no predictions are checked before the
        #(expensive) pass over the unlabelled pool
        if len(token_train) > MAX_TRAIN_SIZE:
            break

        #stop when there is not enough of resources; this also covers an empty
        #pool, for which there would be no predictions to index
        if len(unlabelled_tok) < k:
            break

        #class probabilities for every remaining unlabelled sentence
        unlabelled_dataloader = DataLoader(unlabelled_tok, batch_size=BATCH_SIZE, collate_fn=data_collator)
        batch_probabilities = []
        for batch in unlabelled_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            #explicit class axis; calling softmax without `dim` is deprecated
            batch_probabilities.append(F.softmax(outputs.logits, dim=-1))
        probabilities = torch.cat(batch_probabilities)

        #k most confident predictions per class (topk returns them in descending order)
        unbiased_confidence, unbiased_topk_indices = torch.topk(probabilities[:,0],k)
        biased_confidence, biased_topk_indices = torch.topk(probabilities[:,1],k)

        #the weakest selected prediction of either class must be confident enough;
        #with two classes this also guarantees the two selections are disjoint
        if unbiased_confidence[-1] < MIN_CONFIDENCE or biased_confidence[-1] < MIN_CONFIDENCE:
            print("Uncertain, ending training...")
            break

        indices = torch.cat((unbiased_topk_indices,biased_topk_indices)).cpu().numpy()

        #create new augmentation and concat it
        selected = unlabelled_tok[indices]
        labels = [0]*len(unbiased_topk_indices) + [1]*len(biased_topk_indices)
        to_add = Dataset.from_dict({'attention_mask':selected['attention_mask'],'input_ids':selected['input_ids'],'label':labels})

        token_train = concatenate_datasets([to_add,token_train])

        #remove them from unlabelled
        remaining = np.delete(np.arange(len(unlabelled_tok)),indices)
        unlabelled_tok = Dataset.from_dict(unlabelled_tok[remaining])

        print("Current val:", compute_metrics(model,device,eval_dataloader)['f1'],"\n")

    #evaluation of the model trained in the last round of this fold
    scores.append(compute_metrics(model,device,eval_dataloader)['f1'])
    print("Final score:",scores[-1],"\n")


[ 0  2  3  5  6  7  8  9 10 11]
Fitting on  2497  data


Step,Training Loss


KeyboardInterrupt: 

In [34]:
# Average of the per-fold F1 values currently stored in `scores`.
np.mean(scores)

0.7802511679237345

In [9]:
# Baseline without self-training: fine-tune once per fold on the labelled data
# only and collect the validation F1 of each fold in `scores`.
scores=[]
folds = skfold.split(babe_tok['input_ids'],babe_tok['label'])
for train_index, val_index in folds:

    # fold-specific training and validation subsets
    token_train = Dataset.from_dict(babe_tok[train_index])
    token_valid = Dataset.from_dict(babe_tok[val_index])
    eval_dataloader = DataLoader(
        token_valid,
        collate_fn=data_collator,
        batch_size=BATCH_SIZE,
    )

    # same seeds for every fold, set before the model is instantiated
    torch.cuda.manual_seed(12345)
    torch.manual_seed(12345)
    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=token_train,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

    fold_metrics = compute_metrics(model,device,eval_dataloader)
    scores.append(fold_metrics['f1'])
        

Fitting on  2497  data


Step,Training Loss


Final score: 0.7869009980782344 

Fitting on  2497  data


Step,Training Loss


Final score: 0.7694805194805194 

Fitting on  2498  data


Step,Training Loss


Final score: 0.7797558166795913 

Fitting on  2498  data


Step,Training Loss


Final score: 0.7861863037838367 

Fitting on  2498  data


Step,Training Loss


Final score: 0.7698396935735565 



In [11]:
# Average of the per-fold F1 values currently stored in `scores`.
np.mean(scores)

0.7784326663191476

In [None]:
# Hard-coded copy of the mean F1 displayed by the preceding `np.mean(scores)` cell.
print("BASELINE:", 0.7784326663191476)

## Experiment with "final" training

In [9]:
# Keep only the first (train, validation) index pair yielded by the splitter.
train_idx,val_idx = next(skfold.split(babe_tok['input_ids'],babe_tok['label']))

In [5]:
# Show the first ten training indices of the selected fold.
print(train_idx[:10])

[ 0  2  3  5  6  7  8  9 10 11]


In [17]:
# Self-training on a single fold.
# The classifier is retrained from the pretrained checkpoint, the k most
# confident unlabelled sentences of each class are pseudo-labelled and moved
# into the training set, and the procedure repeats until a stopping rule fires.
k=100                   # pseudo-labelled sentences added per class in each round
MAX_TRAIN_SIZE = 3500   # stop once the training set has grown beyond this size
MIN_CONFIDENCE = 0.85   # the k-th best prediction of each class must reach this probability
scores=[]

#only the first fold is used; selecting it here keeps the cell independent of
#the execution order of other cells (skfold is seeded, so the split is stable)
train_idx, val_idx = next(skfold.split(babe_tok['input_ids'],babe_tok['label']))

#split for this whole selftraining iteration
token_train = Dataset.from_dict(babe_tok[train_idx])
token_valid = Dataset.from_dict(babe_tok[val_idx])
eval_dataloader = DataLoader(token_valid, batch_size=BATCH_SIZE, collate_fn=data_collator)
#start from the complete unlabelled pool
unlabelled_tok = preprocess_data(basil,tokenizer,'sentence')

#self training
while True:
    print("Fitting on ", len(token_train), " data")

    #retrain from the pretrained checkpoint on the current training set
    torch.cuda.manual_seed(12345)
    torch.manual_seed(12345)
    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2)
    trainer = Trainer(model,training_args,train_dataset=token_train,data_collator=data_collator,tokenizer=tokenizer)
    trainer.train()
    #switch to inference mode now so that it also holds for the evaluation
    #after an early `break`
    model.eval()

    #stopping rules that need no predictions are checked before the
    #(expensive) pass over the unlabelled pool
    if len(token_train) > MAX_TRAIN_SIZE:
        break

    #stop when there is not enough of resources; this also covers an empty
    #pool, for which there would be no predictions to index
    if len(unlabelled_tok) < k:
        break

    #class probabilities for every remaining unlabelled sentence
    unlabelled_dataloader = DataLoader(unlabelled_tok, batch_size=BATCH_SIZE, collate_fn=data_collator)
    batch_probabilities = []
    for batch in unlabelled_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        #explicit class axis; calling softmax without `dim` is deprecated
        batch_probabilities.append(F.softmax(outputs.logits, dim=-1))
    probabilities = torch.cat(batch_probabilities)

    #k most confident predictions per class (topk returns them in descending order)
    unbiased_confidence, unbiased_topk_indices = torch.topk(probabilities[:,0],k)
    biased_confidence, biased_topk_indices = torch.topk(probabilities[:,1],k)

    #the weakest selected prediction of either class must be confident enough;
    #with two classes this also guarantees the two selections are disjoint
    if unbiased_confidence[-1] < MIN_CONFIDENCE or biased_confidence[-1] < MIN_CONFIDENCE:
        print("Uncertain, ending training...")
        break

    indices = torch.cat((unbiased_topk_indices,biased_topk_indices)).cpu().numpy()

    #create new augmentation and concat it
    selected = unlabelled_tok[indices]
    labels = [0]*len(unbiased_topk_indices) + [1]*len(biased_topk_indices)
    to_add = Dataset.from_dict({'attention_mask':selected['attention_mask'],'input_ids':selected['input_ids'],'label':labels})

    token_train = concatenate_datasets([to_add,token_train])

    #remove them from unlabelled
    remaining = np.delete(np.arange(len(unlabelled_tok)),indices)
    unlabelled_tok = Dataset.from_dict(unlabelled_tok[remaining])

    print("Current val:", compute_metrics(model,device,eval_dataloader)['f1'],"\n")

#evaluation of the model trained in the last round
scores.append(compute_metrics(model,device,eval_dataloader)['f1'])
print("Final score:",scores[-1],"\n")


Fitting on  2497  data


Step,Training Loss


Current val: 0.7869009980782344 

Fitting on  2697  data


Step,Training Loss
500,0.4239


Current val: 0.7877955429347491 

Fitting on  2897  data


KeyboardInterrupt: 

## Experiment with early stopping

In [18]:
# Training configuration for the early-stopping experiment: up to 10 epochs,
# with evaluation every 50 steps and the best checkpoint (by 'f1') restored at
# the end of training.
training_args_pretrain = TrainingArguments(
    output_dir = './',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate, log and checkpoint on the same 50-step cadence. Restoring the
    # best model needs a checkpoint for the best evaluation step; with the
    # default save interval of 500 steps a run that stops early (the recorded
    # run ended at step 400) would have nothing to restore.
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    save_steps=50,
    save_total_limit=5,
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
    disable_tqdm = False,
    warmup_steps=0,
    weight_decay=0.1,
    learning_rate=2e-5)

ERROR! Session/line number was not unique in database. History logging moved to new session 459


In [19]:
def compute_metrics_eval(eval_preds):
    """Compute the macro-averaged F1 for one evaluation pass.

    `eval_preds` is unpacked into a `(logits, labels)` pair. The predicted
    class of every row is the argmax over the last axis of the logits, and the
    result of the "f1" metric's `compute` call is returned unchanged
    (presumably a dict keyed by 'f1' -- TODO confirm against `load_metric`).
    """
    f1_metric = load_metric("f1")
    class_scores, gold_labels = eval_preds
    predicted_labels = np.argmax(class_scores, axis=-1)
    return f1_metric.compute(
        predictions=predicted_labels,
        references=gold_labels,
        average='macro',
    )

In [20]:
# Single-fold run with periodic evaluation and early stopping.
token_train = Dataset.from_dict(babe_tok[train_idx])
token_valid = Dataset.from_dict(babe_tok[val_idx])

# seed before the model is instantiated
torch.cuda.manual_seed(12345)
torch.manual_seed(12345)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# stop after three consecutive evaluations without improvement
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = Trainer(
    model=model,
    args=training_args_pretrain,
    train_dataset=token_train,
    eval_dataset=token_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_eval,
    callbacks=[early_stopping],
)
trainer.train()
#torch.save(model.state_dict(),SUBJ_MODEL_PATH)

Step,Training Loss,Validation Loss,F1
50,0.6313,0.544651,0.728648
100,0.6015,0.502269,0.775696
150,0.5355,0.602641,0.679414
200,0.5054,0.564966,0.734828
250,0.4505,0.468531,0.792476
300,0.4343,0.49193,0.76068
350,0.3994,0.528507,0.77751
400,0.3348,0.577383,0.758299


TrainOutput(global_step=400, training_loss=0.48658166408538817, metrics={'train_runtime': 101.7822, 'train_samples_per_second': 245.328, 'train_steps_per_second': 15.425, 'total_flos': 248452377910500.0, 'train_loss': 0.48658166408538817, 'epoch': 2.55})