In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
import re
import warnings
from tqdm import tqdm
import random
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR
from transformers import AutoTokenizer, AutoModel
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import time 
import copy
import collections
from collections import defaultdict
import gc
import os

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text

In [3]:
data=pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [4]:
data['less_toxic']=data['less_toxic'].apply(text_cleaning)
data['more_toxic']=data['more_toxic'].apply(text_cleaning)

In [5]:
data.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks woo woo wooooooo,WHAT WHER IS YOUR SEXY PIC GONE FROM YOUR MAIN...
1,188,And yes people should recognize that but they ...,Daphne Guinness Top of the mornin my favourite...
2,82,Western Media Yup because every crime in the e...,Atom you don t believe actual photos of mastur...
3,347,And you removed it You numbskull I don t care ...,You seem to have sand in your vagina Might wan...
4,539,smelly vagina Bluerasberry why don t you be a ...,hey way to support nazis you racist


In [6]:
params={
    
    'DEBUG':False,
    'device':device,
    'max_len':256,
    'batch_size':8,
    'output_logits':768,
    "dropout":0.2,
    "num_workers":2,
    "BERT_MODEL":'roberta-base',
    "train_batch_size": 16,
    "valid_batch_size": 8,
    "learning_rate": 1e-4,
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "weight_decay": 1e-6,
    "n_fold": 5,
    "n_accumulate": 1,
    "num_classes": 1,
    "margin": 0.5
}

In [7]:
class jigsaw_dataset(Dataset):
    def __init__(self,data,tokenizer,max_length=params['max_len']):
        self.more_toxic = data.more_toxic.values
        self.less_toxic = data.less_toxic.values
        self.target=data.target.values
        self.tokenizer= tokenizer
        self.max_len=max_length
        self.data=data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,index):
        more_toxic = self.more_toxic[index]
        less_toxic=self.less_toxic[index]
        inputs_more_toxic = self.tokenizer.encode_plus(
                                more_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        inputs_less_toxic = self.tokenizer.encode_plus(
                                less_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        more_toxic_ids=inputs_more_toxic['input_ids']
        more_toxic_mask =inputs_more_toxic['attention_mask']
        less_toxic_ids=inputs_less_toxic['input_ids']
        less_toxic_mask =inputs_less_toxic['attention_mask']
        target=self.target[index]
        return {
            
            'More_toxic_id':torch.tensor(more_toxic_ids),
            'More_toxic_mask':torch.tensor(more_toxic_mask),
            'Less_toxic_mask':torch.tensor(less_toxic_mask),
            'Less_toxic_id':torch.tensor(more_toxic_ids),
            'Target' :torch.tensor(target)
        }
        

In [8]:
class BERT_Baseline(nn.Module):
    def __init__(self,model_name):
        super(BERT_Baseline,self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.flayer=nn.Linear(768,params['num_classes'])
    
    def forward(self,ids,mask):
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out= self.drop(out[1])
        out=self.flayer(out)
        return out

In [9]:
def criterion(outputs1, outputs2, targets):
    return nn.MarginRankingLoss(margin=0.5)(outputs1, outputs2, targets)

In [10]:
data['target']=[1]*len(data)

In [11]:
new_df=pd.DataFrame({'less_toxic':data['more_toxic'].values,'more_toxic':data['less_toxic'].values})

In [12]:
new_df['target']=[-1]*len(new_df)

In [13]:
new_df

Unnamed: 0,less_toxic,more_toxic,target
0,WHAT WHER IS YOUR SEXY PIC GONE FROM YOUR MAIN...,This article sucks woo woo wooooooo,-1
1,Daphne Guinness Top of the mornin my favourite...,And yes people should recognize that but they ...,-1
2,Atom you don t believe actual photos of mastur...,Western Media Yup because every crime in the e...,-1
3,You seem to have sand in your vagina Might wan...,And you removed it You numbskull I don t care ...,-1
4,hey way to support nazis you racist,smelly vagina Bluerasberry why don t you be a ...,-1
...,...,...,...
30103,get out my large penis,I m sorry I m not an admin I will give you thr...,-1
30104,get out my large penis,I m sorry I m not an admin I will give you thr...,-1
30105,Piss off you slant eyed gook,wow are you out of your mind how was my edit o...,-1
30106,Piss off you slant eyed gook,wow are you out of your mind how was my edit o...,-1


In [14]:
data=pd.concat([data.drop(labels='worker',axis=1),new_df],axis=0)

In [15]:
data

Unnamed: 0,less_toxic,more_toxic,target
0,This article sucks woo woo wooooooo,WHAT WHER IS YOUR SEXY PIC GONE FROM YOUR MAIN...,1
1,And yes people should recognize that but they ...,Daphne Guinness Top of the mornin my favourite...,1
2,Western Media Yup because every crime in the e...,Atom you don t believe actual photos of mastur...,1
3,And you removed it You numbskull I don t care ...,You seem to have sand in your vagina Might wan...,1
4,smelly vagina Bluerasberry why don t you be a ...,hey way to support nazis you racist,1
...,...,...,...
30103,get out my large penis,I m sorry I m not an admin I will give you thr...,-1
30104,get out my large penis,I m sorry I m not an admin I will give you thr...,-1
30105,Piss off you slant eyed gook,wow are you out of your mind how was my edit o...,-1
30106,Piss off you slant eyed gook,wow are you out of your mind how was my edit o...,-1


In [16]:
data['Fold']=[None]*len(data)

In [17]:
kfold = KFold(5)
count=0
for i,j in kfold.split(data):
    data['Fold'].iloc[j]=count
    count=count+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [18]:
def data_generator(data,fold):
    train_data=data[data['Fold']!=fold]
    valid_data=data[data['Fold']!=fold]
    tokenizer=AutoTokenizer.from_pretrained(params['BERT_MODEL'])
    train_dataset=jigsaw_dataset(train_data,tokenizer)
    valid_dataset=jigsaw_dataset(valid_data,tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=params['train_batch_size'], 
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=params['valid_batch_size'], 
                              num_workers=2, shuffle=False, pin_memory=True)
    return train_loader,valid_loader

In [19]:
def calculate_accuracy(more_toxic,less_toxic,target):
    preds=[]
    for i in range(len(more_toxic)):
        if more_toxic[i]>less_toxic[i]:
            preds.append(1)
        else:
            preds.append(-1)
    preds=np.array(preds)
    return accuracy_score(target,preds)

In [20]:
def train_one_epoch(train_dataloader,model,optimizer,device):
    model.train()
    bar = tqdm(train_dataloader)
    running_loss=0
    running_accuracy=0
    for count,data in enumerate(bar):
        optimizer.zero_grad()
        more_toxic_ids =data['More_toxic_id'].to(device, torch.long)
        more_toxic_mask =data['More_toxic_mask'].to(device, torch.long)
        less_toxic_ids =data['Less_toxic_id'].to(device, torch.long)
        less_toxic_mask =data['Less_toxic_mask'].to(device, torch.long)
        target =data['Target'].to(device, torch.long)
        more_toxic_out=model(more_toxic_ids,more_toxic_mask)
        less_toxic_out=model(less_toxic_ids,less_toxic_mask)
        loss=criterion(more_toxic_out,less_toxic_out,target)
        loss.backward()
        optimizer.step()
        running_loss=running_loss+loss.item()
        #running_accuracy =running_accuracy+calculate_accuracy(more_toxic_out,less_toxic_out,target)
    final_loss = running_loss/len(train_dataloader)
    gc.collect()
    return final_loss

In [21]:
def validate_model(valid_dataloader,model,device):
    model.eval()
    bar=tqdm(valid_dataloader)
    running_loss=0
    for count,data in enumerate(bar):
        more_toxic_ids =data['More_toxic_id'].to(device, torch.long)
        more_toxic_mask =data['More_toxic_mask'].to(device, torch.long)
        less_toxic_ids =data['Less_toxic_id'].to(device, torch.long)
        less_toxic_mask =data['Less_toxic_mask'].to(device, torch.long)
        target =data['Target'].to(device, torch.long)
        more_toxic_out=model(more_toxic_ids,more_toxic_mask)
        less_toxic_out=model(less_toxic_ids,less_toxic_mask)
        loss=criterion(more_toxic_out,less_toxic_out,target)
        running_loss=running_loss+loss.item()

    final_loss = running_loss/len(valid_dataloader)
    gc.collect()
    return final_loss
    

In [22]:
def get_accuracy(dataloader,model):
    model.eval()
    bar=tqdm(dataloader)
    running_accuracy=0
    for count,data in enumerate(bar):
        more_toxic_ids =data['More_toxic_id'].to(device, torch.long)
        more_toxic_mask =data['More_toxic_mask'].to(device, torch.long)
        less_toxic_ids =data['Less_toxic_id'].to(device, torch.long)
        less_toxic_mask =data['Less_toxic_mask'].to(device, torch.long)
        target =data['Target']
        more_toxic_out=model(more_toxic_ids,more_toxic_mask)
        less_toxic_out=model(less_toxic_ids,less_toxic_mask)
        more_toxic_out=more_toxic_out.cpu().data.numpy()
        less_toxic_out=less_toxic_out.cpu().data.numpy()
        target=target.detach().numpy()
        accuracy=calculate_accuracy(more_toxic_out,less_toxic_out,target)
        running_accuracy+=accuracy
        gc.collect()
    return running_accuracy/len(dataloader)
        

In [23]:
!mkdir FOLD-0
!mkdir FOLD-1
!mkdir FOLD-2
!mkdir FOLD-3
!mkdir FOLD-4

In [24]:
def run_training(epochs=2):
    start = time.time()
    best_epoch_loss = np.inf
    history = defaultdict(list)
    #model=BERT_Baseline(params['BERT_MODEL'])
    model=BERT_Baseline(params['BERT_MODEL'])
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'], weight_decay=params['weight_decay'])
    for i in range(5):
        train,valid=data_generator(data,i)
        model=BERT_Baseline(params['BERT_MODEL'])
        model.to(device)
        print('FOLD:'+str(i))
        for j in range(epochs):
            print('epoch:'+str(j))
            train_loss=train_one_epoch(train,model,optimizer,device)
            valid_loss=validate_model(valid,model,device)
            print('Train_loss:'+str(train_loss))
            print('valid_loss:'+str(valid_loss))

        torch.save(model.state_dict(),"FOLD-"+str(i)+'/weight.pt')
            
            
             

In [25]:

run_training()

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FOLD:0
epoch:0


100%|██████████| 3010/3010 [38:19<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:44<00:00,  7.30it/s]


Train_loss:0.49759755822709234
valid_loss:0.4985303106655595
epoch:1


100%|██████████| 3010/3010 [38:21<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:53<00:00,  7.23it/s]


Train_loss:0.4976530075073242
valid_loss:0.4985303106655595


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FOLD:1
epoch:0


100%|██████████| 3010/3010 [38:22<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:53<00:00,  7.23it/s]


Train_loss:0.4984000516492267
valid_loss:0.49862596233771594
epoch:1


100%|██████████| 3010/3010 [38:23<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:53<00:00,  7.22it/s]


Train_loss:0.4985119467953907
valid_loss:0.49862596233771594


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FOLD:2
epoch:0


100%|██████████| 3010/3010 [38:22<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:49<00:00,  7.26it/s]


Train_loss:0.4999754345496232
valid_loss:0.49931988038422065
epoch:1


100%|██████████| 3010/3010 [38:20<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:47<00:00,  7.28it/s]


Train_loss:0.5000387864156419
valid_loss:0.49931988038422065


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FOLD:3
epoch:0


100%|██████████| 3010/3010 [38:20<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:49<00:00,  7.26it/s]


Train_loss:0.5002215297119166
valid_loss:0.5014765857808893
epoch:1


100%|██████████| 3010/3010 [38:21<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:45<00:00,  7.30it/s]


Train_loss:0.4997791439591848
valid_loss:0.5014765857808893


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FOLD:4
epoch:0


100%|██████████| 3010/3010 [38:21<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:46<00:00,  7.28it/s]


Train_loss:0.49979803134436623
valid_loss:0.5018906398385278
epoch:1


100%|██████████| 3010/3010 [38:22<00:00,  1.31it/s]
100%|██████████| 6022/6022 [13:47<00:00,  7.28it/s]


Train_loss:0.4995355465780461
valid_loss:0.5018906398385278
