In [1]:
!pip install transformers
!pip install pytorch-ignite



In [0]:
import pandas as pd
from transformers import XLMRobertaConfig, XLMRobertaTokenizer, XLMRobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule

In [3]:
train_df = pd.read_csv('/content/drive/My Drive/Deep Learning Data/Multilingual Toxic Comment Classification/clean_train_df.csv', index_col = 0)
val_df = pd.read_csv('/content/drive/My Drive/Deep Learning Data/Multilingual Toxic Comment Classification/clean_val_df.csv', index_col = 0)

  mask |= (ar1 == a)


In [4]:
train_df.head()

Unnamed: 0,comment_text,toxic,length
0,Explanation\nWhy the edits made under my usern...,0.0,42
1,D'aww! He matches this background colour I'm s...,0.0,18
2,"Hey man, I'm really not trying to edit war. It...",0.0,42
3,"""\nMore\nI can't make any real suggestions on ...",0.0,112
4,"You, sir, are my hero. Any chance you remember...",0.0,13


In [0]:
import re
def clean_text(text):
    text = str(text)
    text = re.sub(r'[0-9"]', '', text)
    text = re.sub(r'#[\S]+\b', '', text)
    text = re.sub(r'@[\S]+\b', '', text)
    text = re.sub(r'https?\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
   
    return text

In [0]:
train_df['clean_text'] = train_df['comment_text'].apply(lambda x: clean_text(x))

In [7]:
train_df.head()

Unnamed: 0,comment_text,toxic,length,clean_text
0,Explanation\nWhy the edits made under my usern...,0.0,42,Explanation Why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,0.0,18,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It...",0.0,42,"Hey man, I'm really not trying to edit war. It..."
3,"""\nMore\nI can't make any real suggestions on ...",0.0,112,More I can't make any real suggestions on imp...
4,"You, sir, are my hero. Any chance you remember...",0.0,13,"You, sir, are my hero. Any chance you remember..."


In [0]:
from torch.utils.data import Dataset, DataLoader
import torch

class DatasetRetriever(Dataset):

    def __init__(self, df):
        self.comment_texts = df['clean_text'].values
        self.ids = df.index.to_numpy()
        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        self.df = df
    def get_tokens(self, text):
        encoded = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=256, 
            pad_to_max_length=True,
            return_attention_mask = True
        )
        return encoded['input_ids'], encoded['attention_mask']

    def __len__(self):
        return self.ids.shape[0]

    def __getitem__(self, idx):
        text = self.comment_texts[idx]
        
        #######################################
        # TODO TTA transforms: about it later #
        #######################################
    
        tokens, attention_mask = self.get_tokens(text)
        tokens, attention_mask = torch.tensor(tokens), torch.tensor(attention_mask)
        label = torch.full((1,1), self.df['toxic'][idx]).type(torch.float)
        return tokens, attention_mask, label

In [0]:
from collections import namedtuple
AdaptationConfig = namedtuple('AdaptationConfig',
  field_names="num_classes, dropout, initializer_range, batch_size, lr, max_norm, n_epochs,"
              "n_warmup, valid_set_prop, gradient_accumulation_steps, device,"
              "log_dir, dataset_cache")
adapt_args = AdaptationConfig(
               6          , 0.2    , 0.02             , 4        , 6.5e-5, 1.0   , 3,
               10      , 0.1           , 1, "cuda" if torch.cuda.is_available() else "cpu",
               "/content/drive/My Drive/Deep Learning Data/Multilingual Toxic Comment Classification"   , 
               "/content/drive/My Drive/Deep Learning Data/Multilingual Toxic Comment Classification/dataset_cache.bin")

In [0]:
def get_pos_weight(df):
    """return positive weights for loss function"""
    no_toxic = len(df[df['toxic']>0.5])
    no_nontoxic = len(df) - no_toxic
    pos_weight = no_nontoxic/no_toxic
    return torch.full((1,1),pos_weight).type(torch.float)

In [0]:
##Get positive weight
pos_weight = get_pos_weight(train_df)

In [0]:
##loading dataset
train_dataset = DatasetRetriever(train_df)
train_loader = DataLoader(train_dataset, batch_size = adapt_args.batch_size, num_workers = 1)

In [0]:
## Code to check 1 batch
sample_batch = next(iter(train_loader))
sample_tokens = sample_batch[0]
attention_masks = sample_batch[1]
sample_label = sample_batch[2]
assert sample_tokens.shape == attention_masks.shape

In [0]:
pretrained_model =  XLMRobertaModel.from_pretrained('xlm-roberta-base')
pretrained_config = XLMRobertaConfig.from_pretrained('xlm-roberta-base')

In [0]:
import torch.nn as nn
class ToxicModel(nn.Module):
    def __init__(self, backbone, config, num_classes):
        super().__init__()

        self.transformer = backbone
        self.config = config 
        self.dropout = nn.Dropout(adapt_args.dropout)
        self.classification_head = nn.Linear(config.hidden_size*2, num_classes)
        self.apply(self.init_weight)

    def init_weight(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_normal_(module.weight)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
    

    def forward(self, input_ids, attention_mask, clf_labels):

        hidden_states, _ = self.transformer(input_ids = input_ids, 
                                            attention_mask = attention_mask,
                                            token_type_ids = None, 
                                            )
        
        avg_hidden_state = torch.mean(hidden_states, dim = 1)
        max_hidden_state, _ = torch.max(hidden_states, dim =1)

        cat_vec = torch.cat((avg_hidden_state, max_hidden_state), 1) ## return vector of size (batch_size, 2 * hidden_state)
        cat_vec = self.dropout(cat_vec)

        clf_output = self.classification_head(cat_vec)
        clf_logits = torch.sigmoid(clf_output)

        criterion = torch.nn.BCELoss(weight = pos_weight.to(adapt_args.device))

        clf_labels = clf_labels.squeeze(-1)

        loss = criterion(clf_logits, clf_labels)/adapt_args.batch_size

        ones = torch.ones((clf_logits.shape)).to(adapt_args.device)
        zeros = torch.zeros((clf_logits.shape)).to(adapt_args.device)

        output_labels = torch.where(clf_logits > 0.5,ones, zeros)

        return loss, output_labels

model = ToxicModel(pretrained_model, pretrained_config, 1)
model = model.to(adapt_args.device)





In [35]:
### test one forward pass for the model
batch= next(iter(train_loader))
input_ids = batch[0].to(adapt_args.device)
attention_mask = batch[1].to(adapt_args.device)
labels = batch[2].to(adapt_args.device)

loss, output = model(input_ids, attention_mask, labels)
print(f'Loss value of sample batch is: {loss}')

Loss value of sample batch is: 1.6186506748199463


In [34]:
import os
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage, Accuracy
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import CosineAnnealingScheduler, PiecewiseLinear, create_lr_scheduler_with_warmup, ProgressBar



In [0]:
## Training loop
optimizer = torch.optim.Adam(model.parameters(), lr = adapt_args.lr)


##update
def update(engine, batch):

    #training mode
    model.train()

    #zero gradient
    optimizer.zero_grad()

    #unwrap data from batch
    inputs = batch[0].to(adapt_args.device)
    
    attention_mask = batch[1].to(adapt_args.device)

    labels = batch[2].to(adapt_args.device)

    ## 1 pass forward of the model
    loss, output_labels = model(inputs, attention_mask, labels)

    loss = loss/adapt_args.gradient_accumulation_steps ## accumulate loss 

    loss.backward()

    ##clipping gradieng

    torch.nn.utils.clip_grad_norm_(model.parameters(), adapt_args.max_norm)

    if engine.state.iteration % adapt_args.gradient_accumulation_steps ==0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()

trainer = Engine(update)

In [0]:
##Evaluation function, evaluator output is the input of the metrics
def inference(engine, batch):

    ##evaluation mode

    model.eval()

    inputs = batch[0].to(adapt_args.device)

    attention_mask = batch[1].to(adapt_args.device)

    labels = batch[2].to(adapt_args.device)

    with torch.no_grad():
        loss, output = model(inputs, attention_mask, labels)

    return output, labels

evaluator = Engine(inference)

In [42]:
### Attach metric to evaluator and evaluation to trainner : evaluate on val set after each epoch
Accuracy().attach(evaluator, 'Accuracy')
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_result(engine):
    evaluator.run(val_loader)
    print(f"Validation Epoch: {engine.state.epoch} Error rate: {100*(1 - evaluator.state.metrics['Accuracy'])}")

# Learning rate schedule: linearly warm-up to lr and then to zero
scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 0.0), (adapt_args.n_warmup, adapt_args.lr),
                                              (len(train_loader)*adapt_args.n_epochs, 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Save checkpoints and finetuning config
checkpoint_handler = ModelCheckpoint(adapt_args.log_dir, 'finetuning_checkpoint', save_interval=1, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': model})
torch.save(model.state_dict(), os.path.join(adapt_args.log_dir, 'fine_tuned_xlm_roberta.pt'))



In [43]:
trainer.run(train_loader, max_epochs=adapt_args.n_epochs)

HBox(children=(IntProgress(value=0, max=531436), HTML(value='')))

Current run is terminating due to exception: .
Exception ignored in: <bound method _MultiProcessingDataLoaderIter.__del__ of <torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7f0ea90b46a0>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 962, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 942, in _shutdown_workers
    w.join()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 124, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 50, in wait
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 28, in poll
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt: 
Engine run is terminating due to exception: .


KeyboardInterrupt: ignored