In [3]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import get_linear_schedule_with_warmup
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import os
from nltk.corpus import wordnet
from nltk.corpus import stopwords

## Data Loading

In [4]:
!pwd
data_folder_path = './data/'

/mnt/a/OneDrive/UNSW/COMP9444/9444_toxic_comment_classification


In [5]:
df = pd.read_csv(data_folder_path + 'train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
df[df[classes[1]] == 1].sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
106846,3b5ed3ddb511bf4a,You're an 38ass fuck43 \n\n3Go shit your pants43,1,1,1,0,1,0
138822,e6d74f978f2c4e9f,"you are gay \n\nfaggot, trying to block my ass...",1,1,1,0,1,1
112288,58a7701128e55aa9,"I will flay you alive, you fking stalker. \n\n...",1,1,0,1,0,0
13705,24319497c564babc,HOCKEY STICK UP YOUR ASS \n\nHOW DARE YOU ACCU...,1,1,1,0,1,0
141992,f78f2377112245fe,"Um.... yeah, I know it is. Go fuck yourself, ...",1,1,1,0,0,0
123951,97003050132a4384,FUCK YOU DICK LICKS! \n\nYou guys will never b...,1,1,1,0,0,0
82838,dda2fc4a1475fae1,"Stop fucking doing this, you retarded kids, yo...",1,1,1,0,1,0
119822,80cd7801777396bc,DIE IN A CAR CRASH YOU FUCKIN ASSHOLE \n\nTRAI...,1,1,1,0,1,0
11699,1ef097f8808fce67,burn in hell motherfucker,1,1,1,1,1,0
136531,da687d5226bff7f3,SUCK MY COCK D SUCK MY COCK D SUCK MY COCK D S...,1,1,1,0,1,0


each comment can have more than one label.

In [8]:
test_data = pd.read_csv(data_folder_path + 'test.csv')
test_labels = pd.read_csv(data_folder_path + 'test_labels.csv')
test_data = pd.concat([test_data, test_labels], axis=1)

In [9]:
total_samples = df.shape[0]
for cls in classes:
    rate = df[cls].sum() / total_samples
    rate = np.round(rate*100, 3)
    print(cls +' rate: ', rate, "%")

toxic rate:  9.584 %
severe_toxic rate:  1.0 %
obscene rate:  5.295 %
threat rate:  0.3 %
insult rate:  4.936 %
identity_hate rate:  0.88 %


## Data cleaning

In [10]:
def data_cleaning(text):
    # seems that Uppercase words have more effect on toxicity than lowercase.
    # so I decided to keep them as they are.
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = text.replace('\t', ' ')
    text = text.replace("#" , " ")

    text = re.sub('https?://[A-Za-z0-9./]+', '', text)
    text = re.sub('http?://[A-Za-z0-9./]+', '', text)
    text = re.sub('www.[A-Za-z0-9./]+', '', text)
    encoded_string = text.encode("ascii", "ignore")
    decode_string = encoded_string.decode()
    return decode_string

In [11]:
df['clean_comment'] = df['comment_text'].apply(data_cleaning)
test_data['clean_comment'] = test_data['comment_text'].apply(data_cleaning)

## Data Augmentation

In [12]:
toxic_df = df[(df['toxic'] == 1) | (df['severe_toxic'] == 1) | (df['obscene'] == 1) | (df['threat'] == 1) | (df['insult'] == 1) | (df['identity_hate'] == 1)]
non_toxic_df = df[(df['toxic'] == 0) & (df['severe_toxic'] == 0) & (df['obscene'] == 0) & (df['threat'] == 0) & (df['insult'] == 0) & (df['identity_hate'] == 0)]
# len(toxic_df), len(non_toxic_df), len(df)

In [13]:

replacement_rate =0.7
##  synonym replacement
aug_toxic_df = toxic_df.copy(True)
for i, row in toxic_df.iterrows():
    comment = row['clean_comment']
    words = comment.split()
    new_comment = ''
    new_words = []
    for word in words:
        if word in stopwords.words('english'):
            new_words.append(word)
            continue

        random_rate = np.random.uniform(0, 1)

        if random_rate < replacement_rate:
            synonyms = []
            for syn in wordnet.synsets(word):
                for l in syn.lemmas():
                    synonyms.append(l.name())
            if len(synonyms) > 0:
                new_word = synonyms[np.random.randint(0, len(synonyms))]
                new_words.append(new_word)
            else:
                new_words.append(word)


        else:
            new_words.append(word)
    new_comment = ' '.join(new_words)
    new_row = row.copy(True)
    new_row['clean_comment'] = new_comment
    aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)

  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxic_df.append(new_row, ignore_index=True)
  aug_toxic_df = aug_toxi

In [14]:
len(aug_toxic_df)

32450

## Tokenization

In [15]:
df = pd.concat([aug_toxic_df, non_toxic_df], ignore_index=True)
seed = 42
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=seed)
valid_df.head()
# len(train_df), len(valid_df)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_comment
128273,3aa02d4d6d40c3fb,I was testing my professor's hypothesis to see...,0,0,0,0,0,0,I was testing my professor's hypothesis to see...
112912,efe66c3a93b3c3ef,"""\n\nFlan etc.\n\nHi, thanks for contributing ...",0,0,0,0,0,0,""" Flan etc. Hi, thanks for contributing to t..."
63154,5b7a4a339249c482,One-time events aren't tenants of a building. ...,0,0,0,0,0,0,One-time events aren't tenants of a building. ...
90188,ac2c81c95d47c67d,Introduction \nI think there is a bit to much ...,0,0,0,0,0,0,Introduction I think there is a bit to much o...
25255,eb2d0d0f621f689a,You base your information on opinion. You're a...,1,0,0,0,1,0,You ground your information on opinion. You're...


In [16]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

In [17]:
# encoded_comment = [tokenizer.encode(sent, add_special_tokens=True) for sent in train_df['clean_comment']]


In [18]:
# comment_len = [len(x) for x in encoded_comment]
# np.max(comment_len), np.quantile(comment_len, 0.97), np.mean(comment_len), np.median(comment_len), np.min(comment_len)

97% of the comments are less than 436 tokens, and longer comments always tend to be non-toxic. so I decided to use max_len = 436.

In [19]:
MAX_LEN = 436

In [20]:
class BertDataSet(Dataset):
    def __init__(self, dataframe):
        self.comments = dataframe['clean_comment'].values
        self.labels = dataframe[classes].to_numpy()

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        comment = self.comments[idx]
        tokenized_comment = tokenizer.encode_plus(comment,
                                                    add_special_tokens=True,
                                                    max_length = MAX_LEN,
                                                    padding='max_length',
                                                    truncation = True,
                                                    return_attention_mask = True)
        ids = torch.tensor(tokenized_comment['input_ids'], dtype=torch.long)
        mask = torch.tensor(tokenized_comment['attention_mask'], dtype=torch.long)

        labels = self.labels[idx]
        labels = torch.tensor(labels, dtype=torch.float)
        return {'ids': ids, 'mask': mask, 'labels': labels}


In [21]:
dataset_train = BertDataSet(train_df)
dataset_test = BertDataSet(valid_df)

In [22]:
len(dataset_train), len(dataset_test)

(140636, 35160)

In [23]:
for td in dataset_test:
    print(td['ids'].shape, td['mask'].shape, td['labels'].shape)
    break

torch.Size([436]) torch.Size([436]) torch.Size([6])


In [24]:
train_batch = 1
test_batch = 1

In [25]:
data_loader_train = DataLoader(dataset_train, batch_size=train_batch, shuffle=True, pin_memory = True)
data_loader_test = DataLoader(dataset_test, batch_size=test_batch, shuffle=False, pin_memory = True)

In [26]:
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 6)
gpus = torch.cuda.device_count()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if gpus > 1:
    print("Let's use", gpus, "GPUs!")
    # model = torch.nn.DataParallel(model)    # multi-gpu
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Let's use 3 GPUs!


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

loss = torch.nn.BCEWithLogitsLoss()
loss.to(device)
for batch in data_loader_train:
    ids = batch['ids'].to(device)
    mask = batch['mask'].to(device)
    outputs = model(ids, attention_mask=mask)
    outputs = outputs['logits'].squeeze(-1).to(torch.float32)
    probabilities = torch.sigmoid(outputs)
    predictions = torch.where(probabilities > 0.5, 1, 0)
    labels = batch['labels'].to(device, non_blocking=True)
    loss_value = loss(outputs, labels)
    print(loss_value.item())
    correct_predictions = torch.sum(predictions == labels)
    print(correct_predictions.item())
    break

In [27]:
loss = torch.nn.BCEWithLogitsLoss(pos_weight = torch.tensor((159571 - 35098) / 35098))
loss.to(device)
epochs = 5
LR = 2e-5 #Learning rate
optimizer = torch.optim.AdamW(model.parameters(), LR, weight_decay = 1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience = 2, verbose = True)
torch.backends.cudnn.benchmark = True

In [35]:

for i in range(epochs):
    model.train()
    correct_predictions = 0
    for batch_id, batch in enumerate(data_loader_train):
        optimizer.zero_grad()
        train_losses = []
        with torch.cuda.amp.autocast():
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            optimizer.zero_grad()
            outputs = model(ids, mask)

            outputs = outputs['logits'].squeeze(-1).to(torch.float32)
            print('outputs', outputs)
            probabilities = torch.sigmoid(outputs)
            print('probabilities', probabilities.requires_grad)
            predictions = torch.where(probabilities > 0.5, 1, 0)

            print('predictions', predictions.requires_grad)

            labels = batch['labels'].to(device, non_blocking=True)
            print('dtype_labels', labels.dtype)
            print('predictions', predictions)
            print('labels', labels.requires_grad)
            loss_value = loss(outputs, labels)
            print('loss_value', loss_value.grad_fn)
            train_losses.append(loss_value.item())
            loss_value.backward()
            correct_predictions += torch.sum(predictions == labels)
        optimizer.step()
        if batch_id % 10 == 0:
            print('Epoch: {}, Batch: {}, Loss: {}'.format(i, batch_id, np.mean(train_losses)))
    accuracy = correct_predictions/(len(dataset_train)*6)
    print('Epoch: {}, Accuracy: {}'.format(i, accuracy))
    model.eval()
    # test
    with torch.no_grad():
        correct_predictions = 0
        test_losses = []
        for batch_id, batch in enumerate(data_loader_test):
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            outputs = model(ids, mask)
            outputs = outputs['logits'].squeeze(-1).to(torch.float32)
            probabilities = torch.sigmoid(outputs)
            predictions = torch.where(probabilities > 0.5, 1, 0)
            labels = batch['labels'].to(device, non_blocking=True)

            loss_valid = loss(outputs, labels)
            test_losses.append(loss_valid.item())
            correct_predictions += torch.sum(predictions == labels)
        accuracy = correct_predictions/(len(dataset_test)*6)
        recall = recall_score(labels, predictions)
        print('Epoch: {}, Recall: {}'.format(i, recall))
        print('Epoch: {}, Validation Accuracy: {}, loss: {}'.format(i, accuracy, np.mean(test_losses)))
    torch.save(model.state_dict(), './model_save/{}.pkl'.format(epochs))

outputs tensor([[-3.7227, -4.6641, -4.3828, -4.3945, -4.0391, -4.2305]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
probabilities True
predictions False
dtype_labels torch.float32
predictions tensor([[0, 0, 0, 0, 0, 0]], device='cuda:0')
labels False
loss_value <BinaryCrossEntropyWithLogitsBackward0 object at 0x7f823dc8fd30>
Epoch: 0, Batch: 0, Loss: 0.01497495174407959
outputs tensor([[-1.7051, -4.7773, -2.8613, -4.8125, -2.7266, -3.8652]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
probabilities True
predictions False
dtype_labels torch.float32
predictions tensor([[0, 0, 0, 0, 0, 0]], device='cuda:0')
labels False
loss_value <BinaryCrossEntropyWithLogitsBackward0 object at 0x7f823d873c70>
outputs tensor([[-3.8535, -4.3906, -4.2578, -4.1328, -3.9297, -4.2852]],
       device='cuda:0', grad_fn=<ToCopyBackward0>)
probabilities True
predictions False
dtype_labels torch.float32
predictions tensor([[0, 0, 0, 0, 0, 0]], device='cuda:0')
labels False
loss_value <BinaryCrossEn

KeyboardInterrupt: 

In [None]:
print(torch.cuda.memory_summary(1))