In [0]:
# Colab-only: mount Google Drive at /content/gdrive (the mount point PATH is built on).
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Project root on the mounted drive; the CSVs and the model checkpoint are read from PATH + "data/".
PATH = "/content/gdrive/My Drive/vdcnn_testing/"

In [0]:
!pip3 install http://download.pytorch.org/whl/cu80/torch-0.4.1-cp36-cp36m-linux_x86_64.whl
!pip3 install torchtext
!pip3 install pyprind



In [0]:
# importing required libraries
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import datasets
from torchtext import data
import nltk
import pyprind
import math

import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from sklearn.metrics import roc_auc_score

In [0]:
class ConvolutionalBlockRes(nn.Module):
    """Two Conv1d -> BatchNorm1d -> ReLU stages followed by a stride-2 max pool.

    Despite the name, no shortcut/residual connection is applied in forward().

    Args:
        in_channels: number of channels of the input feature map.
        out_channels: number of channels produced by both convolutions.
        kernel_size: kernel width shared by both convolutions.
        padding: zero-padding applied by both convolutions. The default of 1
            keeps the sequence length unchanged for kernel_size=3.
        pool_type: stored on the instance for reference only; forward() always
            uses max pooling.
    """
    def __init__(self, in_channels, out_channels, kernel_size, padding=1, pool_type="max_pool"):
        super().__init__()
        self.pool_type = pool_type
        # Both convolutions honour the caller's `padding` (previously a
        # hard-coded 1 was used and the argument was silently ignored).
        self.conv_1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding)
        self.batch_norm_1 = nn.BatchNorm1d(out_channels)
        self.conv_2 = nn.Conv1d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding)
        self.batch_norm_2 = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        """Map [batch, in_channels, length] to [batch, out_channels, pooled_length]."""
        out = F.relu(self.batch_norm_1(self.conv_1(x)))
        out = F.relu(self.batch_norm_2(self.conv_2(out)))
        # Kernel 3 / stride 2 / padding 1 roughly halves the sequence length.
        return F.max_pool1d(out, kernel_size=3, stride=2, padding=1)

def downsample_max_pool(x, kernel_size, stride):
    """Max-pool `x` along its last dimension using a fixed padding of 1."""
    return F.max_pool1d(x, kernel_size=kernel_size, stride=stride, padding=1)


def downsample_k_max_pool(inp, k, dim):
    """Return the (values, indices) pair of the `k` largest entries of `inp` along `dim`.

    The values come back ordered from largest to smallest, not in their
    original positional order.
    """
    return torch.topk(inp, k, dim)

class VDCNN(nn.Module):
    """Convolutional text classifier over embedded token indices.

    Pipeline: embedding -> batch norm -> Conv1d(64) -> four
    ConvolutionalBlockRes stages (64, 128, 256, 512 channels, each ending in a
    stride-2 max pool) -> top-3 activations per channel -> three linear layers.

    Args:
        embedding_dim: size of each token embedding.
        vocab_size: number of rows of the embedding table.
        n_classes: width of the final linear layer (raw logits are returned).
    """
    def __init__(self, embedding_dim, vocab_size, n_classes):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.batch_norm_emb = nn.BatchNorm1d(embedding_dim)

        self.conv_64 = nn.Conv1d(in_channels=embedding_dim, out_channels=64, kernel_size=3, padding=1)
        self.batch_norm_conv_64 = nn.BatchNorm1d(64)

        # Every convolutional stage shares the same kernel/padding/pooling setup.
        stage_kwargs = dict(kernel_size=3, padding=1, pool_type="max_pool")
        self.res_64 = ConvolutionalBlockRes(in_channels=64, out_channels=64, **stage_kwargs)
        self.res_128 = ConvolutionalBlockRes(in_channels=64, out_channels=128, **stage_kwargs)
        self.res_256 = ConvolutionalBlockRes(in_channels=128, out_channels=256, **stage_kwargs)
        self.res_512 = ConvolutionalBlockRes(in_channels=256, out_channels=512, **stage_kwargs)

        # 512 channels x 3 retained activations per channel feed the classifier head.
        self.linear_1 = nn.Linear(3*512, 512)
        self.batch_norm_l1 = nn.BatchNorm1d(512)
        self.drop1 = nn.Dropout(0.3)

        self.linear_2 = nn.Linear(512, 512)
        self.batch_norm_l2 = nn.BatchNorm1d(512)
        self.drop2 = nn.Dropout(0.3)

        self.linear_3 = nn.Linear(512, n_classes)

    def forward(self, inp):
        """Return logits of shape [batch_size, n_classes] for index input [batch_size, sent_length]."""
        # [batch_size, sent_length] -> [batch_size, sent_length, emb_dim]
        embedded = self.embedding(inp)

        # Conv1d / BatchNorm1d expect channels first: [batch_size, emb_dim, sent_length]
        embedded = embedded.permute(0, 2, 1)
        embedded = self.batch_norm_emb(embedded)

        # [batch_size, 64, sent_length]
        out = self.conv_64(embedded)
        out = F.relu(self.batch_norm_conv_64(out))

        # Each stage roughly halves the length: 64 -> 128 -> 256 -> 512 channels,
        # ending at about [batch_size, 512, sent_length/16].
        for stage in (self.res_64, self.res_128, self.res_256, self.res_512):
            out = stage(out)

        # Keep the 3 largest activations per channel: [batch_size, 512, 3]
        out = downsample_k_max_pool(out, k=3, dim=2)[0]

        # Flatten to [batch_size, 512*3]
        out = out.reshape(out.shape[0], -1)

        # [batch_size, 512]
        out = self.drop1(F.relu(self.batch_norm_l1(self.linear_1(out))))

        # [batch_size, 512]
        out = self.drop2(F.relu(self.batch_norm_l2(self.linear_2(out))))

        # [batch_size, n_classes]
        return self.linear_3(out)

In [0]:
# Number of character tokens per example; passed to the text field as fix_length.
SENT_LENGTH = 1024

In [0]:
def tokenizer(text):
    """Split `text` into lower-cased single characters, keeping only the known alphabet.

    Characters outside the alphabet (after lower-casing) are dropped.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’"/|_#$%ˆ&*˜‘+=<>()[]{} '
    kept = []
    for ch in text.lower():
        if ch in allowed:
            kept.append(ch)
    return kept

In [0]:
# Input field: character tokens from `tokenizer`, mapped through a vocabulary,
# fixed to SENT_LENGTH tokens, with the batch dimension first.
text_field = data.Field(
    tokenize=tokenizer,
    sequential=True,
    use_vocab=True,
    fix_length=SENT_LENGTH,
    batch_first=True
)
# Target field: a single float per example, taken as-is (no vocabulary lookup).
label_field = data.Field(
    dtype=torch.float,
    is_target=True,
    sequential=False,
    use_vocab=False
)

In [0]:
# (column name, field) pairs matched to the CSV columns by position.
# Only the comment text and the `toxic` label get a field; the rest map to None.
csv_fields = [("id", None), ("comment_text", text_field), ("toxic", label_field)]
csv_fields += [
    (column, None)
    for column in ("severe_toxic", "threat", "obscene", "insult", "identity_hate")
]

In [0]:
# Read the training and validation sets from CSV files under PATH + "data/".
# The header row is skipped; columns are bound to fields via csv_fields.
# NOTE: the validation split is read from "test_torch.csv".
trainds, valds = data.TabularDataset.splits(
    path=PATH + "data/",
    format="csv",
    train="train_torch.csv",
    validation="test_torch.csv",
    fields=csv_fields,
    skip_header=True
)

In [0]:
# Build the character vocabulary from the training split only.
text_field.build_vocab(trainds)
# NOTE(review): label_field is declared with use_vocab=False, so this vocabulary
# looks unused — confirm nothing reads label_field.vocab before removing.
label_field.build_vocab(trainds)

In [0]:
# Mini-batch size used for both the training and the validation iterator.
BATCH_SIZE = 128

In [0]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# sort_key ranks examples by their token count, which is what BucketIterator
# uses to group examples of similar length. Returning the token list itself
# would order examples lexicographically by content instead.
traindl, valdl = data.BucketIterator.splits(
    datasets=(trainds, valds),
    batch_sizes=(BATCH_SIZE, BATCH_SIZE),
    sort_key=lambda x: len(x.comment_text),
    repeat=False,
    device=device
)
# Number of batches per epoch in each split.
len(traindl), len(valdl)

cuda


(1122, 125)

In [0]:

class BatchWrapper:
    """Adapts an iterable of attribute-style batches into (x, y) pairs.

    Args:
        dl: iterable of batch objects; must also support len().
        x_var: name of the batch attribute holding the model input.
        y_vars: name of the batch attribute holding the target (a single
            attribute name, despite the plural).
    """
    def __init__(self, dl, x_var, y_vars):
        self.dl = dl
        self.x_var = x_var
        self.y_vars = y_vars

    def __iter__(self):
        """Yield one (input, target) tuple per batch of the wrapped iterable."""
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)
            targets = getattr(batch, self.y_vars)
            yield inputs, targets

    def __len__(self):
        """Number of batches in the wrapped iterable."""
        return len(self.dl)

In [0]:
# Wrap both iterators so each batch is delivered as a (comment_text, toxic) pair.
train_dl, valid_dl = (
    BatchWrapper(batch_iter, "comment_text", "toxic")
    for batch_iter in (traindl, valdl)
)

In [0]:
def binary_accuracy(preds, y):
    """Fraction of entries whose thresholded prediction equals the label.

    `preds` are raw logits: they are passed through a sigmoid and rounded to
    0/1 before being compared with `y`. The result is a tensor holding a ratio
    (0.8 for 8 correct out of 10), not a count.
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = (hard_preds == y).float()
    return hits.sum() / float(len(hits))

def roc_auc_score_FIXED(y_true, y_pred):
    """ROC AUC with a neutral fallback when the score is undefined.

    ROC AUC needs at least one positive and one negative label. That is a
    property of the metric, not a defect in scikit-learn. When `y_true` holds
    fewer than two distinct values (a single-class batch, or no samples at
    all) the chance-level value 0.5 is returned instead of calling
    roc_auc_score.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores, aligned with `y_true`.

    Returns:
        The ROC AUC, or 0.5 when it cannot be computed.
    """
    if len(np.unique(y_true)) < 2:
        return 0.5
    return roc_auc_score(y_true, y_pred)

def get_avg_roc_value(y, output):
    """ROC AUC of a single batch.

    `output` holds raw logits; they are squashed with a sigmoid, and both
    tensors are moved to the CPU and converted to NumPy before scoring.
    """
    probs = torch.sigmoid(output).cpu().detach().numpy()
    labels = y.cpu().detach().numpy()
    return roc_auc_score_FIXED(labels, probs)

def get_avg_roc_value_2(y_fin, output_fin):
    """ROC AUC computed once over all batches pooled together.

    `y_fin` and `output_fin` are equally indexed sequences holding the labels
    and scores of each batch; both are flattened in batch order before scoring.
    """
    batch_ids = range(len(y_fin))
    pooled_scores = [score for i in batch_ids for score in output_fin[i]]
    pooled_labels = [label for i in batch_ids for label in y_fin[i]]
    return roc_auc_score_FIXED(pooled_labels, pooled_scores)

In [0]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Args:
        model: module mapping a batch of inputs to logits of shape [batch, 1].
        iterator: sized iterable yielding (x, y) pairs.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking (logits, targets).

    Returns:
        A 4-tuple: mean batch loss, mean batch accuracy, mean of the per-batch
        ROC AUC values, and the ROC AUC computed over all batches pooled.
    """
    model.train()

    n_batches = len(iterator)
    loss_sum, acc_sum, roc_sum = 0, 0, 0
    labels_per_batch = []
    probs_per_batch = []

    bar = pyprind.ProgBar(n_batches, bar_char='█')
    for x, y in iterator:
        optimizer.zero_grad()

        logits = model(x).squeeze(1)
        loss = criterion(logits, y)
        acc = binary_accuracy(logits, y)
        roc = get_avg_roc_value(y, logits)

        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        acc_sum += acc.item()
        roc_sum += roc

        # Keep probabilities and labels so a pooled ROC AUC can be computed afterwards.
        probs_per_batch.append(torch.sigmoid(logits).cpu().detach().numpy())
        labels_per_batch.append(y.cpu().detach().numpy())

        bar.update()

    pooled_roc = get_avg_roc_value_2(labels_per_batch, probs_per_batch)
    return loss_sum / n_batches, acc_sum / n_batches, roc_sum / n_batches, pooled_roc

def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any weights.

    Args:
        model: module mapping a batch of inputs to logits of shape [batch, 1].
        iterator: sized iterable yielding (x, y) pairs.
        criterion: loss taking (logits, targets).

    Returns:
        A 4-tuple: mean batch loss, mean batch accuracy, mean of the per-batch
        ROC AUC values, and the ROC AUC computed over all batches pooled.
    """
    model.eval()

    n_batches = len(iterator)
    loss_sum, acc_sum, roc_sum = 0, 0, 0
    labels_per_batch = []
    probs_per_batch = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        bar = pyprind.ProgBar(n_batches, bar_char='█')
        for x, y in iterator:
            logits = model(x).squeeze(1)
            loss = criterion(logits, y)
            acc = binary_accuracy(logits, y)
            roc = get_avg_roc_value(y, logits)

            loss_sum += loss.item()
            acc_sum += acc.item()
            roc_sum += roc

            probs_per_batch.append(torch.sigmoid(logits).cpu().detach().numpy())
            labels_per_batch.append(y.cpu().detach().numpy())

            bar.update()

    pooled_roc = get_avg_roc_value_2(labels_per_batch, probs_per_batch)
    return loss_sum / n_batches, acc_sum / n_batches, roc_sum / n_batches, pooled_roc

In [0]:
# Model hyper-parameters.
embedding_dim = 16
# len(vocab) counts the tokens the vocabulary actually holds. vocab.stoi is a
# defaultdict, so looking up an unseen token can add a key and inflate
# len(stoi); re-running this cell would then build an embedding table whose
# size no longer matches the saved checkpoint.
vocab_size = len(text_field.vocab)
# A single logit: the task is binary (toxic vs. not toxic).
n_classes = 1

model = VDCNN(embedding_dim, vocab_size, n_classes)

In [0]:
# Count the scalar weights of every parameter that will receive gradients.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
print("Number of trainable parameters in the model are : {}".format(params))

Number of trainable parameters in the model are : 2635169


In [0]:
# Display the module hierarchy of the network.
model

VDCNN(
  (embedding): Embedding(68, 16)
  (batch_norm_emb): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_64): Conv1d(16, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (batch_norm_conv_64): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (res_64): ConvolutionalBlockRes(
    (conv_1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (batch_norm_1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (batch_norm_2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (res_128): ConvolutionalBlockRes(
    (conv_1): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (batch_norm_1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (bat

In [0]:
# Move the model to its device BEFORE building the optimizer, so the optimizer
# is constructed over the parameters that will actually be trained.
model.to(device)

# The model emits raw logits; BCEWithLogitsLoss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
criterion.to(device)

BCEWithLogitsLoss()

In [0]:
# File the best checkpoint is written to and restored from.
MODEL_PATH =PATH + "data/vdcnn_just_toxic_model_mukesh.tar"
def save_checkpoint(state, is_best, filename):
    """Write `state` to `filename` with torch.save, but only when `is_best` is truthy.

    A one-line status message is printed in either case.
    """
    if not is_best:
        print ("=> Validation roc did not improve")
        return
    print ("=> Saving a new best")
    torch.save(state, filename)

def load_check_point(model, model_path):
    """Restore model weights from a checkpoint file.

    The checkpoint must be a dict with the keys 'epoch', 'best_dev_accuracy'
    and 'state_dict'.

    Args:
        model: module whose parameters are overwritten in place.
        model_path: path of the checkpoint file.

    Returns:
        The same `model` instance, with the stored weights loaded.
    """
    resume_weights = model_path
    # Deserialize every tensor onto the CPU so a checkpoint saved from a GPU
    # run also loads on a CPU-only runtime; load_state_dict then copies the
    # values into the model's parameters on whatever device they live on.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    checkpoint = torch.load(resume_weights, map_location=lambda storage, loc: storage)
    trained_epochs = checkpoint['epoch']
    best_accuracy = checkpoint['best_dev_accuracy']
    model.load_state_dict(checkpoint['state_dict'])
    print("Best Dev Accuracy is {}".format(best_accuracy))
    print("=> loaded checkpoint '{}' (trained for {} epochs)".format(resume_weights, trained_epochs))
    return model

# Resume from the stored checkpoint; the file at MODEL_PATH must already exist.
model = load_check_point(model, MODEL_PATH)

Best Dev Accuracy is 0.9618139533996583
=> loaded checkpoint '/content/gdrive/My Drive/vdcnn_testing/data/vdcnn_just_toxic_model_mukesh.tar' (trained for 9 epochs)


In [0]:
N_EPOCHS = 20
# Pooled validation ROC AUC a new epoch has to beat before its weights are saved.
base_dev_roc = 0.9745
for epoch in range(N_EPOCHS):

    train_loss, train_acc, train_roc, train_roc_main = train(model, train_dl, optimizer, criterion)
    valid_loss, valid_acc, valid_roc, valid_roc_main = evaluate(model, valid_dl, criterion)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train ROC: {train_roc*100:.2f} | Train Acc: {train_acc*100:.2f}%')
    print(f'| Epoch: {epoch+1:02} | Val. Loss: {valid_loss:.3f} | Val. ROC: {valid_roc*100:.2f} | Val. Acc: {valid_acc*100:.2f}% |')
    print(f'| Train Main ROC: {train_roc_main*100:.2f} | Val. Main ROC: {valid_roc_main*100:.2f} ')

    # A plain boolean: a stray trailing comma here would create the tuple
    # (True,), which only works because non-empty tuples are truthy.
    is_best = bool(base_dev_roc < valid_roc_main)
    if is_best:
        base_dev_roc = valid_roc_main

    # save_checkpoint only writes the file when is_best is true.
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_loss': valid_loss,
        'best_dev_accuracy': valid_acc
    }, is_best, MODEL_PATH)

0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:09:29
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 01 | Train Loss: 0.100 | Train ROC: 97.77 | Train Acc: 96.32%
| Epoch: 01 | Val. Loss: 0.101 | Val. ROC: 97.02 | Val. Acc: 96.30% |
| Train Main ROC: 97.59 | Val. Main ROC: 97.43 
=> Validation roc did not improve



Total time elapsed: 00:00:28
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:12:09
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 02 | Train Loss: 0.096 | Train ROC: 98.02 | Train Acc: 96.40%
| Epoch: 02 | Val. Loss: 0.105 | Val. ROC: 96.82 | Val. Acc: 96.09% |
| Train Main ROC: 97.83 | Val. Main ROC: 97.47 
=> Saving a new best



Total time elapsed: 00:00:29
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:11:53
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 03 | Train Loss: 0.094 | Train ROC: 98.16 | Train Acc: 96.51%
| Epoch: 03 | Val. Loss: 0.103 | Val. ROC: 96.56 | Val. Acc: 96.19% |
| Train Main ROC: 97.97 | Val. Main ROC: 97.27 
=> Validation roc did not improve



Total time elapsed: 00:00:29
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:11:55
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 04 | Train Loss: 0.092 | Train ROC: 98.19 | Train Acc: 96.53%
| Epoch: 04 | Val. Loss: 0.105 | Val. ROC: 97.11 | Val. Acc: 96.09% |
| Train Main ROC: 98.03 | Val. Main ROC: 97.56 
=> Saving a new best



Total time elapsed: 00:00:29
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:11:40
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 05 | Train Loss: 0.091 | Train ROC: 98.32 | Train Acc: 96.56%
| Epoch: 05 | Val. Loss: 0.112 | Val. ROC: 96.50 | Val. Acc: 96.11% |
| Train Main ROC: 98.16 | Val. Main ROC: 97.15 
=> Validation roc did not improve



Total time elapsed: 00:00:27
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:11:49
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 06 | Train Loss: 0.089 | Train ROC: 98.40 | Train Acc: 96.63%
| Epoch: 06 | Val. Loss: 0.105 | Val. ROC: 96.61 | Val. Acc: 96.24% |
| Train Main ROC: 98.24 | Val. Main ROC: 97.42 
=> Validation roc did not improve



Total time elapsed: 00:00:28
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:11:47
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 07 | Train Loss: 0.088 | Train ROC: 98.47 | Train Acc: 96.60%
| Epoch: 07 | Val. Loss: 0.105 | Val. ROC: 96.95 | Val. Acc: 96.21% |
| Train Main ROC: 98.32 | Val. Main ROC: 97.45 
=> Validation roc did not improve



Total time elapsed: 00:00:29
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:11:34
0% [██████████████████████████████] 100% | ETA: 00:00:00

| Epoch: 08 | Train Loss: 0.086 | Train ROC: 98.53 | Train Acc: 96.72%
| Epoch: 08 | Val. Loss: 0.106 | Val. ROC: 96.41 | Val. Acc: 96.26% |
| Train Main ROC: 98.38 | Val. Main ROC: 97.20 
=> Validation roc did not improve



Total time elapsed: 00:00:29
0% [██████████████                ] 100% | ETA: 00:06:14

In [0]:
# NOTE(review): `output` is not defined anywhere in the visible notebook, so this
# looks like leftover debugging that raises NameError on a fresh kernel — confirm and remove.
output.shape