In [1]:
from transformers import AutoTokenizer, AutoModel, AdamW

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

import gc

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Prepare the data

In [2]:
# Load the pairwise annotations. Each row carries a `worker` id plus two
# comments, `less_toxic` and `more_toxic`, ordered by the annotator's judgement.
df = pd.read_csv("data/validation_data.csv")
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


## Get unique messages

In [3]:
# Deduplicate the messages of both columns, keeping first-appearance order.
# (Going through set() would reorder the strings differently on every kernel
# restart because str hashing is randomized, so the split could never be
# reproduced.)
messages = list(dict.fromkeys(list(df.less_toxic) + list(df.more_toxic)))
print(f"Number of samples in df: {len(df)}\nNumber of unique messages: {len(messages)}")

# Map each unique message to its position in `messages`.
index = {message: position for position, message in enumerate(messages)}

# For every comparison pair, store [less_toxic index, more_toxic index].
df["words_as_index"] = pd.Series(
    [[index[less], index[more]] for less, more in zip(df.less_toxic, df.more_toxic)],
    index=df.index,
)

# Split on unique messages (not on rows) so that a held-out message never
# appears in a training pair. The shuffle is seeded for reproducibility.
split_seed = 42
messages_index = list(range(len(messages)))
np.random.default_rng(split_seed).shuffle(messages_index)

word_train_frac = 0.9
n = len(messages_index)

# The tail of the shuffled indices is the held-out set.
test_words = messages_index[int(n*word_train_frac):]
held_out_words = set(test_words)  # set membership is O(1); the list scan was O(len(test_words)) per lookup

# folds == 0: the pair touches at least one held-out message; folds == 1: usable for training.
df["folds"] = df["words_as_index"].apply(
    lambda pair: 0 if any(e in held_out_words for e in pair) else 1
)

# Split dataset to evaluate the model
train_df = df[df.folds == 1]
# The held-out messages must be looked up through the *shuffled* indices.
# Slicing `messages` directly would return the unshuffled tail, i.e. messages
# that were not the ones excluded from train_df.
test_messages = [messages[i] for i in test_words]

Number of samples in df: 30108
Number of unique messages: 14251


# Prepare NN model

## Load the tokenizer and model from huggingface

In [4]:
# Tokenizer and encoder are loaded from the same checkpoint so the vocabulary
# always matches the weights.
# NOTE: the load log printed by this cell reports the pooler weights
# ('roberta.pooler.dense.*') as newly initialized, i.e. they are not pretrained.
checkpoint_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prepare the Custom Pytorch Module

In [5]:
# Width of the vector fed to the ranking head and number of scores it emits.
last_hidden_layer_size = 768
final_node_size = 1

class ToxicRankModel(nn.Module):
    """Backbone encoder followed by dropout and a linear scoring head.

    Args:
        model: Backbone module. It is called with the keyword arguments
            ``input_ids``, ``attention_mask`` and ``output_hidden_states`` and
            element ``[1]`` of what it returns is used as the sentence vector
            (presumably the pooled output of a Hugging Face encoder - confirm
            when swapping backbones).
        last_hidden_layer_size: Number of features of that sentence vector.
        output_size: Number of values produced by the head. Defaults to the
            module-level ``final_node_size`` (1), i.e. one score per message.
        dropout_p: Dropout probability applied before the head (default 0.2).
    """

    def __init__(self, model, last_hidden_layer_size,
                 output_size=final_node_size, dropout_p=0.2):
        super().__init__()
        self.model = model
        self.dropout = nn.Dropout(p=dropout_p)
        self.rank_head = nn.Linear(last_hidden_layer_size, output_size)

    def forward(self, ids, mask):
        """Score a batch of tokenized messages.

        Args:
            ids: Token id tensor passed to the backbone as ``input_ids``.
            mask: Attention mask tensor passed as ``attention_mask``.

        Returns:
            Output of the linear head, one row per input message.
        """
        output = self.model(input_ids=ids, attention_mask=mask,
                            output_hidden_states=False)
        # Element [1] of the backbone output is the per-message vector.
        sentence_vector = self.dropout(output[1])
        return self.rank_head(sentence_vector)
    
# Wrap the pretrained encoder with the ranking head defined above.
toxicRankModel = ToxicRankModel(
    model=model,
    last_hidden_layer_size=last_hidden_layer_size,
)

## Prepare the dataset and dataloader

In [6]:
class CustomDataset(Dataset):
    """Pairs of (more toxic, less toxic) messages, tokenized on access.

    Args:
        train_df: Frame with ``more_toxic`` and ``less_toxic`` text columns.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_length: Length every message is truncated/padded to.
    """

    def __init__(self, train_df, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.length = max_length
        self.more_toxic = train_df['more_toxic'].values
        self.less_toxic = train_df['less_toxic'].values

    def __len__(self):
        # One item per comparison pair.
        return len(self.more_toxic)

    def _encode(self, text):
        """Tokenize one message into fixed-length (ids, mask) lists."""
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.length,
            padding='max_length'
        )
        return encoded['input_ids'], encoded['attention_mask']

    def __getitem__(self, i):
        """Return the i-th pair as a dictionary of ``torch.long`` tensors."""
        more_ids, more_mask = self._encode(self.more_toxic[i])
        less_ids, less_mask = self._encode(self.less_toxic[i])

        as_long = lambda values: torch.tensor(values, dtype=torch.long)
        return {
            'tokens_more_toxic': as_long(more_ids),
            'mask_more_toxic': as_long(more_mask),
            'tokens_less_toxic': as_long(less_ids),
            'mask_less_toxic': as_long(less_mask),
        }

In [9]:
def get_loader(df, tokenizer, max_length, batch_size, shuffle=True, drop_last=True):
    """Build a DataLoader over the comparison pairs in ``df``.

    Args:
        df: Frame forwarded to ``CustomDataset`` (needs ``more_toxic`` and
            ``less_toxic`` columns).
        tokenizer: Tokenizer forwarded to ``CustomDataset``.
        max_length: Token length every message is padded/truncated to.
        batch_size: Number of pairs per batch.
        shuffle: Reshuffle the pairs every epoch. Defaults to True, the
            training behaviour; pass False for a deterministic evaluation pass.
        drop_last: Drop the final incomplete batch. Defaults to True; pass
            False when every pair must be visited.

    Returns:
        A ``DataLoader`` yielding dictionaries of batched tensors.
    """
    dataset = CustomDataset(
        df,
        tokenizer=tokenizer,
        max_length=max_length
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last)

# Every message is padded/truncated to 128 tokens; training pairs come in batches of 32.
max_length = 128
train_loader = get_loader(
    df=train_df,
    tokenizer=tokenizer,
    max_length=max_length,
    batch_size=32,
)

## Prepare loss function

In [8]:
from torch.nn import MarginRankingLoss

#Custom implementation of the MarginRankingLoss with y = 1
class CustomMarginRankingLoss(nn.Module):
    """Margin ranking loss for the case where ``x1`` should outrank ``x2``.

    Computes ``mean(relu(x2 - x1 + margin))``: a pair contributes nothing once
    ``x1`` exceeds ``x2`` by at least ``margin``.
    """

    def __init__(self, margin=0):
        super().__init__()
        self.margin = margin

    def forward(self, x1, x2):
        """Return the mean hinge penalty over all pairs."""
        violation = x2 - x1 + self.margin
        return torch.relu(violation).mean()
    
def criterion(x1, x2):
    """Margin ranking loss (margin 0.5) that wants ``x1`` scored above ``x2``."""
    ranking_loss = CustomMarginRankingLoss(margin=0.5)
    return ranking_loss(x1, x2)

## Prepare training routine

In [10]:
from tqdm import tqdm

def train_one_epoch(model, optimizer, scheduler, dataloader, device):
    """Run one pass over ``dataloader`` and update ``model`` in place.

    Each batch scores the more-toxic and the less-toxic messages with the same
    model and applies ``criterion`` so the former is ranked above the latter.
    The optimizer and the scheduler are both stepped once per batch.

    Args:
        model: Module called as ``model(ids, mask)`` returning scores.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler, stepped after every batch.
        dataloader: Yields dictionaries with the keys ``tokens_more_toxic``,
            ``mask_more_toxic``, ``tokens_less_toxic`` and ``mask_less_toxic``.
        device: Device the model and the batches are moved to.

    Returns:
        Sample-weighted mean loss over the epoch, or 0.0 when the dataloader
        produced no batch.
    """
    # Setup train mode
    model.train()
    model.to(device)

    dataset_size = 0
    running_loss = 0.0
    running_accuracy = 0.0
    # Bound up front: with an empty dataloader (e.g. fewer samples than one
    # batch combined with drop_last=True) the loop body never runs and the
    # final return would otherwise raise UnboundLocalError.
    epoch_loss = 0.0

    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc="Training")

    for i, data in progress_bar:
        more_toxic_ids = data['tokens_more_toxic'].to(device, dtype = torch.long)
        more_toxic_mask = data['mask_more_toxic'].to(device, dtype = torch.long)
        less_toxic_ids = data['tokens_less_toxic'].to(device, dtype = torch.long)
        less_toxic_mask = data['mask_less_toxic'].to(device, dtype = torch.long)

        batch_size = more_toxic_ids.size(0)

        # Clear gradients before the backward pass so that anything left over
        # (for instance from a cell interrupted between backward() and
        # zero_grad()) cannot leak into this update.
        optimizer.zero_grad()

        # Forward pass both inputs in the model
        x1 = model(more_toxic_ids, more_toxic_mask)
        x2 = model(less_toxic_ids, less_toxic_mask)

        # Compute margin ranking loss
        loss = criterion(x1, x2)
        # Fraction of pairs already ordered correctly (monitoring only).
        accuracy_measure = (x1 > x2).float().mean().item()

        # Apply backpropagation, then advance optimizer and LR schedule
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Update cumulative, sample-weighted statistics for monitoring
        running_loss += (loss.item() * batch_size)
        running_accuracy += (accuracy_measure * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size
        epoch_accuracy = running_accuracy / dataset_size

        progress_bar.set_postfix({'loss': epoch_loss, 'accuracy': epoch_accuracy}, refresh=True)

    # Garbage collector
    gc.collect()

    return epoch_loss

# Training configuration. Work on the first GPU if available, else on CPU.
optimizer_lr = 1e-4
optimizer_weight_decay = 1e-6
scheduler_T_max = 500
scheduler_eta_min = 1e-6
n_epochs = 3
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of it. eps=1e-6 keeps the default of the deprecated
# implementation so the update rule stays numerically the same.
optimizer = optim.AdamW(
    toxicRankModel.parameters(),
    lr=optimizer_lr,
    eps=1e-6,
    weight_decay=optimizer_weight_decay,
)
# train_one_epoch calls scheduler.step() once per batch, so T_max is counted
# in batches, not epochs.
# NOTE(review): the training log shows 758 batches per epoch, more than T_max,
# so the learning rate does not decay monotonically over the run - confirm
# this cyclic schedule is intended.
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=scheduler_T_max, eta_min=scheduler_eta_min)

for epoch in range(n_epochs):
    train_one_epoch(toxicRankModel, optimizer, scheduler, train_loader, device)

Training: 100%|██████████| 758/758 [15:54<00:00,  1.26s/it, loss=0.367, accuracy=0.673]
Training: 100%|██████████| 758/758 [15:54<00:00,  1.26s/it, loss=0.355, accuracy=0.689]
Training: 100%|██████████| 758/758 [15:52<00:00,  1.26s/it, loss=0.333, accuracy=0.715]


# Inference

In [11]:
class CustomInferenceDataset(Dataset):
    """Single messages, tokenized on access, for scoring with a trained model.

    Args:
        messages: Indexable collection of message strings.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_length: Length every message is truncated/padded to.
    """

    def __init__(self, messages, tokenizer, max_length):
        self.messages = messages
        self.tokenizer = tokenizer
        self.length = max_length

    def __len__(self):
        # One item per message.
        return len(self.messages)

    def __getitem__(self, i):
        """Return the i-th message as a dictionary of ``torch.long`` tensors."""
        encoded = self.tokenizer.encode_plus(
            self.messages[i],
            truncation=True,
            add_special_tokens=True,
            max_length=self.length,
            padding='max_length'
        )

        # Output key -> tokenizer field it is built from.
        fields = (
            ('tokens_message', 'input_ids'),
            ('mask_message', 'attention_mask'),
        )
        return {
            key: torch.tensor(encoded[field], dtype=torch.long)
            for key, field in fields
        }

In [48]:
def get_loader_inference(messages, tokenizer, max_length, batch_size):
    """Build a DataLoader that walks ``messages`` once, in their given order.

    Nothing is shuffled and the last partial batch is kept, so the batches
    concatenated together line up one-to-one with ``messages``.
    """
    inference_dataset = CustomInferenceDataset(
        messages,
        tokenizer=tokenizer,
        max_length=max_length
    )
    loader_options = dict(batch_size=batch_size, shuffle=False, drop_last=False)
    return DataLoader(inference_dataset, **loader_options)

# Comments to rank. NOTE: this rebinds `test_messages`, which an earlier cell
# had set to the held-out messages of the validation split.
test_messages = pd.read_csv('data/comments_to_score.csv')['text'].values
max_length = 128
test_loader = get_loader_inference(
    messages=test_messages,
    tokenizer=tokenizer,
    max_length=max_length,
    batch_size=64,
)

In [49]:
@torch.no_grad()
def get_scores(model, test_loader, device):
    """Score every message served by ``test_loader``.

    The model is switched to evaluation mode and moved to ``device``; gradient
    tracking is disabled for the whole call by the decorator.

    Returns:
        Flat list with one score per message, in loader order.
    """
    model.eval()
    model.to(device)

    ranks = []
    batches = tqdm(enumerate(test_loader), total=len(test_loader), desc="Scoring")
    for _, batch in batches:
        ids = batch['tokens_message'].to(device, dtype=torch.long)
        attention = batch['mask_message'].to(device, dtype=torch.long)

        # Bring the batch scores back to host memory and flatten them into the running list.
        batch_scores = model(ids, attention)
        ranks.extend(batch_scores.cpu().numpy().flatten())

    return ranks

In [93]:
scores = get_scores(toxicRankModel, test_loader, device)

# Key the scores by message text (identical texts collapse into one entry),
# then turn the series into a (text, score) frame sorted from most to least toxic.
score_by_text = pd.Series(dict(zip(test_messages, scores)))
mapped_score = (
    score_by_text
    .reset_index()
    .sort_values(0, ascending=False)
    .rename(columns={"index": "text", 0: "score"})
)

# Mask profanity before printing. Replacements are applied in this order.
censored_terms = (
    ("FUCK", "F***"),
    ("COCKSUCKER", "C*********"),
    ("SUCK", "S***"),
    ("FUKIN", "F****"),
)

def _censor(text):
    """Apply every replacement of `censored_terms` to `text`, in order."""
    for term, masked in censored_terms:
        text = text.replace(term, masked)
    return text

mapped_score["text"] = mapped_score["text"].apply(_censor)

# Rows labelled 4757 and 4758 are left out of the preview; the notebook does
# not record why.
texts = mapped_score.drop([4757, 4758]).head(5)
for _, row in texts.iterrows():
    print(f"toxicity score:{np.round(row.score,3)}\n", row.text[:60]+'...\n\n')

Scoring: 100%|██████████| 118/118 [00:52<00:00,  2.25it/s]

toxicity score:1.186
 F*** YOU WIKIPEIA YOU CANT STOP US!!!!!!!!!!!!!!!!
F***
F***...


toxicity score:1.184
 F*** YOU U USELESS BOT F*** YOU U USELESS BOT F*** YOU U USE...


toxicity score:1.183
 U S***!!!!
U S***!!!!'U S***!!!!U S***!!!!'U S***!!!!
U S***...


toxicity score:1.182
 YOU ARE A MOTHJER F***ER C*********! YOU ARE A MOTHJER F***E...


toxicity score:1.182
 LEAVE 4 A F**** SNAK AND I RETRN WITH A F**** BLCK YOU ARE A...





