In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import get_linear_schedule_with_warmup
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import os

## Data Loading

In [2]:
!pwd
data_folder_path = '//mnt/a/OneDrive/UNSW/COMP9444/9444_toxic_comment_classification/data/'

/home/yiwei


In [3]:
df = pd.read_csv(data_folder_path + 'train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
df[df[classes[1]] == 1].sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
128869,b1271c212b64ca18,Gay ass william at it again small penis fagbag...,1,1,1,0,1,0
18100,2fd54fdafc99da95,You swine. You vulgar little maggot. You worth...,1,1,1,0,1,0
52394,8c35385171575379,You are a fucking dick pseudomonas \n\nyou do ...,1,1,1,0,0,0
29445,4e17edc5e38b8335,"If you don't let me post, I will fuck you up",1,1,1,1,0,0
89046,ee32164a4d102aa7,"""\n\nFuck off you stupid fucking retard cunt.\...",1,1,1,0,1,0
1465,03ebb03deb920216,To Blanchardb \n\nYou dumb motherfucker I am g...,1,1,1,0,1,0
92746,f7fd19e7a6ea2209,"""How dare you filthy stinking whores disrupt m...",1,1,0,0,0,0
84948,e3591ced175a97d6,Bottomley \n\nIt's not an attack page you dick...,1,1,1,0,1,0
154553,af9e849c953035b1,You're gay you're gay you're gay you're gay yo...,1,1,0,0,1,1
48618,820493434cb9153e,"ASSHOLE! \n\nI just read this article, and man...",1,1,1,0,1,0


each comment can have more than one label.

In [6]:
test_data = pd.read_csv(data_folder_path + 'test.csv')
test_labels = pd.read_csv(data_folder_path + 'test_labels.csv')
test_data = pd.concat([test_data, test_labels], axis=1)

In [7]:
total_samples = df.shape[0]
for cls in classes:
    rate = df[cls].sum() / total_samples
    rate = np.round(rate*100, 3)
    print(cls +' rate: ', rate, "%")

toxic rate:  9.584 %
severe_toxic rate:  1.0 %
obscene rate:  5.295 %
threat rate:  0.3 %
insult rate:  4.936 %
identity_hate rate:  0.88 %


## Data cleaning

In [8]:
def data_cleaning(text):
    # seems that Uppercase words have more effect on toxicity than lowercase.
    # so I decided to keep them as they are.
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = text.replace('\t', ' ')
    text = text.replace("#" , " ")

    text = re.sub('https?://[A-Za-z0-9./]+', '', text)
    text = re.sub('http?://[A-Za-z0-9./]+', '', text)
    text = re.sub('www.[A-Za-z0-9./]+', '', text)
    encoded_string = text.encode("ascii", "ignore")
    decode_string = encoded_string.decode()
    return decode_string

In [9]:
df['clean_comment'] = df['comment_text'].apply(data_cleaning)
test_data['clean_comment'] = test_data['comment_text'].apply(data_cleaning)

## Tokenization

In [10]:
seed = 42
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=seed)
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_comment
140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0,Grandma Terri Should Burn in Trash Grandma Te...
159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0,", 9 May 2009 (UTC) It would be easiest if you ..."
60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0,""" The Objectivity of this Discussion is doubt..."
65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0,Shelly Shock Shelly Shock is. . .( )
154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0,I do not care. Refer to Ong Teng Cheong talk p...


In [11]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

In [12]:
# encoded_comment = [tokenizer.encode(sent, add_special_tokens=True) for sent in train_df['clean_comment']]


In [13]:
# comment_len = [len(x) for x in encoded_comment]
# np.max(comment_len), np.quantile(comment_len, 0.97), np.mean(comment_len), np.median(comment_len), np.min(comment_len)

97% of the comments are less than 436 tokens, and longer comments always tend to be non-toxic. so I decided to use max_len = 436.

In [14]:
MAX_LEN = 436

In [15]:
class BertDataSet(Dataset):
    def __init__(self, dataframe):
        self.comments = dataframe['clean_comment'].values
        self.labels = dataframe[classes].to_numpy()

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        comment = self.comments[idx]
        tokenized_comment = tokenizer.encode_plus(comment,
                                                    add_special_tokens=True,
                                                    max_length = MAX_LEN,
                                                    padding='max_length',
                                                    truncation = True,
                                                    return_attention_mask = True)
        ids = torch.tensor(tokenized_comment['input_ids'], dtype=torch.long)
        mask = torch.tensor(tokenized_comment['attention_mask'], dtype=torch.long)

        labels = self.labels[idx]
        labels = torch.tensor(labels, dtype=torch.float)
        return {'ids': ids, 'mask': mask, 'labels': labels}


In [16]:
dataset_train = BertDataSet(train_df)
dataset_test = BertDataSet(valid_df)

In [17]:
len(dataset_train), len(dataset_test)

(2000, 1000)

In [18]:
for td in dataset_test:
    print(td['ids'].shape, td['mask'].shape, td['labels'].shape)
    break

torch.Size([436]) torch.Size([436]) torch.Size([6])


In [19]:
train_batch = 48
test_batch = 48

In [20]:
data_loader_train = DataLoader(dataset_train, batch_size=train_batch, shuffle=True, pin_memory = True)
data_loader_test = DataLoader(dataset_test, batch_size=test_batch, shuffle=False, pin_memory = True)

In [21]:
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 6)
gpus = torch.cuda.device_count()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if gpus > 1:
    print("Let's use", gpus, "GPUs!")
    model = torch.nn.DataParallel(model)    # multi-gpu
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Let's use 3 GPUs!


DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

loss = torch.nn.BCEWithLogitsLoss()
loss.to(device)
for batch in data_loader_train:
    ids = batch['ids'].to(device)
    mask = batch['mask'].to(device)
    outputs = model(ids, attention_mask=mask)
    outputs = outputs['logits'].squeeze(-1).to(torch.float32)
    probabilities = torch.sigmoid(outputs)
    predictions = torch.where(probabilities > 0.5, 1, 0)
    labels = batch['labels'].to(device, non_blocking=True)
    loss_value = loss(outputs, labels)
    print(loss_value.item())
    correct_predictions = torch.sum(predictions == labels)
    print(correct_predictions.item())
    break

In [22]:
loss = torch.nn.BCEWithLogitsLoss()
loss.to(device)
epochs = 5
LR = 2e-5 #Learning rate
optimizer = torch.optim.AdamW(model.parameters(), LR, weight_decay = 1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience = 2, verbose = True)
torch.backends.cudnn.benchmark = True

In [23]:

for i in range(epochs):
    model.train()
    correct_predictions = 0
    for batch_id, batch in enumerate(data_loader_train):
        optimizer.zero_grad()
        train_losses = []
        with torch.cuda.amp.autocast():
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            optimizer.zero_grad()
            outputs = model(ids, mask)
            outputs = outputs['logits'].squeeze(-1).to(torch.float32)
            probabilities = torch.sigmoid(outputs)
            predictions = torch.where(probabilities > 0.5, 1, 0)
            labels = batch['labels'].to(device, non_blocking=True)
            loss_value = loss(outputs, labels)
            train_losses.append(loss_value.item())
            loss_value.backward()
            correct_predictions += torch.sum(predictions == labels)
        optimizer.step()
        if batch_id % 10 == 0:
            print('Epoch: {}, Batch: {}, Loss: {}'.format(i, batch_id, np.mean(train_losses)))
    accuracy = correct_predictions/(len(dataset_train)*6)
    print('Epoch: {}, Accuracy: {}'.format(i, accuracy))
    model.eval()
    # test
    with torch.no_grad():
        correct_predictions = 0
        test_losses = []
        for batch_id, batch in enumerate(data_loader_test):
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            outputs = model(ids, mask)
            outputs = outputs['logits'].squeeze(-1).to(torch.float32)
            probabilities = torch.sigmoid(outputs)
            predictions = torch.where(probabilities > 0.5, 1, 0)
            labels = batch['labels'].to(device, non_blocking=True)
            loss_valid = loss(outputs, labels)
            test_losses.append(loss_valid.item())
            correct_predictions += torch.sum(predictions == labels)
        accuracy = correct_predictions/(len(dataset_test)*6)
        print('Epoch: {}, Validation Accuracy: {}, loss: {}'.format(i, accuracy, np.mean(test_losses)))
        if accuracy > 0.97:
            break

Epoch: 0, Batch: 0, Loss: 0.8139429092407227
Epoch: 0, Batch: 10, Loss: 0.520084798336029
Epoch: 0, Batch: 20, Loss: 0.33398282527923584
Epoch: 0, Batch: 30, Loss: 0.1946413218975067
Epoch: 0, Batch: 40, Loss: 0.19009457528591156
Epoch: 0, Accuracy: 0.8760833144187927
Epoch: 0, Validation Accuracy: 0.9678333401679993, loss: 0.15576065154302687
Epoch: 1, Batch: 0, Loss: 0.1112305074930191
Epoch: 1, Batch: 10, Loss: 0.213715597987175
Epoch: 1, Batch: 20, Loss: 0.16956229507923126
Epoch: 1, Batch: 30, Loss: 0.1453367918729782
Epoch: 1, Batch: 40, Loss: 0.06897848099470139
Epoch: 1, Accuracy: 0.9604166746139526
Epoch: 1, Validation Accuracy: 0.9678333401679993, loss: 0.10331601010901588
Epoch: 2, Batch: 0, Loss: 0.16576775908470154
Epoch: 2, Batch: 10, Loss: 0.08768464624881744
Epoch: 2, Batch: 20, Loss: 0.07879272103309631
Epoch: 2, Batch: 30, Loss: 0.08739092946052551
Epoch: 2, Batch: 40, Loss: 0.06391213089227676
Epoch: 2, Accuracy: 0.9704999923706055
Epoch: 2, Validation Accuracy: 0.97

In [24]:
print(torch.cuda.memory_summary(1))

|                  PyTorch CUDA memory summary, device ID 1                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |    6352 MB |    4633 GB |    4633 GB |
|       from large pool |       0 B  |    6351 MB |    4619 GB |    4619 GB |
|       from small pool |       0 B  |       2 MB |      14 GB |      14 GB |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |    6352 MB |    4633 GB |    4633 GB |
|       from large pool |       0 B  |    6351 MB |    4619 GB |    4619 GB |
|       from small pool |       0 B  |       2 MB |      14 GB |      14 GB |
|---------------------------------------------------------------