In [51]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
!pip install -q transformers > /dev/null
!pip install -q clean-text > /dev/null
!pip install torch-summary > /dev/null

In [10]:
import pandas as pd, numpy as np
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.optim import Adam
from cleantext import clean
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sklearn.metrics

**Load the training dataset in a pandas dataframe**

In [54]:
train_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
train = pd.read_csv(train_path)

**Analyse the dataset**

In [52]:
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [12]:
train.isnull().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

**Define the output classes**

In [13]:
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[classes].values

**Perform dataset cleaning such as removing stop words, emojis, punctuations etc. using clean-text library**

In [14]:
def clean_string(s): return clean(s, no_line_breaks=True, no_urls=True, no_punct=True)

**Custom Dataset loader**

In [15]:
class MyData(torch.utils.data.Dataset):
    def __init__(self, data, label_cols):
        self.data = data
        self.label_cols = label_cols

    def __getitem__(self, item):
        comment = clean_string(self.data.comment_text[item])
        toxic = self.data.toxic[item]
        severe_toxic = self.data.severe_toxic[item]
        obscene = self.data.obscene[item]
        threat = self.data.threat[item]
        insult = self.data.insult[item]
        identity_hate = self.data.identity_hate[item]
#         return comment, torch.FloatTensor([toxic, severe_toxic, obscene, threat, insult, identity_hate])
        return comment, torch.Tensor([toxic, severe_toxic, obscene, threat, insult, identity_hate])
    
    def __len__(self):
        return len(self.data)

In [16]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# train['none'] = 1-train[label_cols].max(axis=1)
# label_cols.append('none')
COMMENT = 'comment_text'
label_cols.append(COMMENT)

**Set device for PyTorch Training**

In [17]:
gpu = 0
device = torch.device(gpu if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.set_device(gpu)
print(device)

cuda:0


**Model parameters and hyper-parameters**

In [18]:
MAX_SEQ_LEN = 128
BATCH_SIZE = 64*2
WARMUP_STEPS = 4
EPOCHS = 1
LEARNING_RATE = 5e-3
model_name = 'bert-base-uncased' #'google/electra-small-discriminator' 'gpt2'
lstm_units = 50

**Define the tokenizer** 

This is dependent on the transformer model passed as input in model_name. Also specify the max sentence length after tokenizing.


In [19]:
import transformers
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=MAX_SEQ_LEN, do_lower_case=True, add_special_tokens=True,
                                                max_length=MAX_SEQ_LEN, pad_to_max_length=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




0

**Load the training data into torch Dataset and perform train-validation set split.**

In [20]:
train_set = MyData(train, label_cols)
train_set, val_set = torch.utils.data.random_split(train_set, [int(0.9*len(train_set)), len(train_set)-int(0.9*len(train_set))] )
train_set = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_set = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

**Define custom nn model**

In [21]:
class AutoNet(nn.Module):
  def __init__(self, seqLength=MAX_SEQ_LEN, numClasses=6, model_name=model_name, lstm_units=50):
    super(AutoNet, self).__init__()
    self.config = AutoConfig.from_pretrained(model_name, hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
    self.config.output_hidden_states = False
    self.model = AutoModel.from_pretrained(model_name, config=self.config)
    for param in self.model.parameters():
      param.requires_grad=False
    if model_name=='google/electra-small-discriminator':
        self.lstm = nn.LSTM(256, lstm_units, dropout=0.1,
                        num_layers=1, bidirectional=True, batch_first=True, bias=False)
    else:
        self.lstm = nn.LSTM(768, lstm_units, dropout=0.1,
                        num_layers=1, bidirectional=True, batch_first=True)
    self.l1 = nn.Linear(lstm_units * 2, 50)
    self.d1 = nn.Dropout(0.2)
    self.l2 = nn.Linear(50, numClasses)
    self.sigmoid = nn.Sigmoid()

  def forward(self,input_ids, attention_mask):
    x = self.model(input_ids, attention_mask=attention_mask)[0]
    x, (hidden, cell) = self.lstm(x)
    x,_ = torch.max(x, dim=1)
    x = self.l1(x)
    x = self.d1(x)
    x = self.l2(x)
    x = self.sigmoid(x)
    return x

**Initialize the model, optimizer and the loss function used.**

In [23]:
model = AutoNet()
model = model.to(device)
model.train()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
loss_criteria = nn.BCELoss()
loss_criteria = loss_criteria.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




  "num_layers={}".format(dropout, num_layers))


In [24]:
from torchsummary import summary
print(summary(model))

Layer (type:depth-idx)                   Param #
├─BertModel: 1-1                         --
|    └─BertEmbeddings: 2-1               --
|    |    └─Embedding: 3-1               (23,440,896)
|    |    └─Embedding: 3-2               (393,216)
|    |    └─Embedding: 3-3               (1,536)
|    |    └─LayerNorm: 3-4               (1,536)
|    |    └─Dropout: 3-5                 --
|    └─BertEncoder: 2-2                  --
|    |    └─ModuleList: 3-6              (85,054,464)
|    └─BertPooler: 2-3                   --
|    |    └─Linear: 3-7                  (590,592)
|    |    └─Tanh: 3-8                    --
├─LSTM: 1-2                              328,000
├─Linear: 1-3                            5,050
├─Dropout: 1-4                           --
├─Linear: 1-5                            306
├─Sigmoid: 1-6                           --
Total params: 109,815,596
Trainable params: 333,356
Non-trainable params: 109,482,240
Layer (type:depth-idx)                   Param #
├─BertModel: 1-

**Training the Model**

In [43]:
model.train()
from tqdm.notebook import tqdm
for epoch in tqdm(range(EPOCHS)):
  count = 0
  total_loss =0 
  model.train()
  for i,data in enumerate(train_set):
    optimizer.zero_grad()
    enc = tokenizer.batch_encode_plus(list(data[0]), pad_to_max_length=True, max_length=MAX_SEQ_LEN, 
                                return_tensors='pt', add_special_tokens=True, return_attention_mask=True,
                                return_token_type_ids=False, )
    input_ids = enc['input_ids'].to(device)
    attention_mask = enc['attention_mask'].to(device)
    labels = torch.tensor(data[1]).to(device)
    out = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = loss_criteria(out, labels)
    loss.backward()
    optimizer.step()
    total_loss += loss.detach().data
    if (i+1)%256==0:
        print(f"Epoch: {epoch}, batch: {i+1}, loss: {total_loss/(BATCH_SIZE)}")
        total_loss = 0
  torch.save(model.state_dict(), f'{model_name}NetBCE_{epoch}.pt')

  model.eval()
  all_pred = []
  all_gold = []
  with torch.no_grad():
      for i,data in enumerate(val_set):
        enc = tokenizer.batch_encode_plus(list(data[0]), pad_to_max_length=True, max_length=MAX_SEQ_LEN, return_tensors='pt')
        input_ids = enc['input_ids'].to(device)
        attention_mask = enc['attention_mask'].to(device)
        labels = torch.tensor(data[1]).to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        all_pred.extend(1*(out>0.9).clone().detach().cpu().numpy())
        all_gold.extend((labels.type(torch.LongTensor).detach().cpu().numpy()))

  count=0
  for i in range(len(all_gold)):
    if (all_gold[i]==all_pred[i]).all():
      count+=1
  print("Validation accuracy:", count/len(all_gold))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

  


Epoch: 0, batch: 256, loss: 0.10895722359418869
Epoch: 0, batch: 512, loss: 0.11123456805944443
Epoch: 0, batch: 768, loss: 0.11127224564552307
Epoch: 0, batch: 1024, loss: 0.10830515623092651




Validation accuracy: 0.9091991477628776



**Code cell for demo and some error analysis**

In [44]:
sample_text = "You are such an amazing person"
clean_txt = clean_string(sample_text)
out1 = tokenizer.encode_plus(list(clean_txt), pad_to_max_length=True, max_length=MAX_SEQ_LEN, return_tensors='pt')
input_ids = out1['input_ids'].to(device)
attention_mask = out1['attention_mask'].to(device)
model.eval()
with torch.no_grad():
    preds = model(input_ids=input_ids, attention_mask=attention_mask)
print(preds)

tensor([[0.1249, 0.0051, 0.0325, 0.0118, 0.0502, 0.0076]], device='cuda:0')


In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(all_gold, all_pred)

0.9072565484396541

**Load Test data, preprocess and analyse**

In [45]:
test_labels_path = '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'
test_labels = pd.read_csv(test_labels_path)

In [46]:
test_labels = test_labels.replace(to_replace=-1,value=0)
test_labels.sample(20)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
130079,d96a653932db1804,0,0,0,0,0,0
79187,8426e4aef1a856fb,0,0,0,0,0,0
55237,5be811149a5acb1e,0,0,0,0,0,0
91958,994ea229d0438f99,0,0,0,0,0,0
147191,f60c1082ee56b0ae,0,0,0,0,0,0
51712,55d5519dabb96d3c,1,0,1,0,1,0
124063,cf31575e00845e2e,0,0,0,0,0,0
85496,8e9eb9b80342d650,0,0,0,0,0,0
45089,4ac4bf0c55c922d2,0,0,0,0,0,0
152810,ff63cdbc6195fc5b,0,0,0,0,0,0


In [47]:
test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

test_set = test.merge(test_labels, left_index=True, right_index=True)
test_set = test_set[["id_x", "comment_text", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
test_set = test_set.reset_index(drop=True)
test_set = test_set.rename(columns={"id_x": "id"})
test_set.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0,0,0,0,0,0
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0,0,0,0,0,0
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0,0,0,0,0,0
3,00017563c3f7919a,":If you have a look back at the source, the in...",0,0,0,0,0,0
4,00017695ad8997eb,I don't anonymously edit articles at all.,0,0,0,0,0,0


**Perform test data cleaning and load into Dataloader**

In [50]:
test_set['comment_text'] = test_set['comment_text'].apply(clean_string)
ids = test_set['id']
test_set = MyData(test_set, classes)
test_set = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

**Model evaluation**

In [49]:
model.eval()
all_pred = []
all_gold = []
with torch.no_grad():
    for i,data in enumerate(test_set):
        enc = tokenizer.batch_encode_plus(list(data[0]), pad_to_max_length=True, max_length=MAX_SEQ_LEN, return_tensors='pt')
        input_ids = enc['input_ids'].to(device)
        attention_mask = enc['attention_mask'].to(device)
        labels = torch.tensor(data[1]).to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        all_pred.extend(out.clone().detach().cpu().numpy())
        all_gold.extend((labels.type(torch.LongTensor).detach().cpu().numpy()))




TypeError: new(): invalid data type 'str'

**Test performance with 0.98 as the threshold score**

In [62]:
accuracy_score(all_gold, (np.array(all_pred) > 0.98))

0.9454440991355672

**Create CSV file for Kaggle Submission**

In [36]:
target_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [37]:
ids = pd.Series(ids)
y_preds = pd.DataFrame(all_pred, columns=target_columns)
final_submission = pd.concat([ids, y_preds], axis=1)
final_submission.head()
final_submission.to_csv('submission_bert_2_bce_epochs.csv', index=False)