In [1]:
import random
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
#from apex import amp

# ensure reproducability
#torch.seed(25)
torch.manual_seed(25)
torch.cuda.manual_seed_all(25)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(25)
random.seed(25)


In [2]:
# Initialization
opt_level = 'O1'
#apex.amp.initialize(model, optimizer, opt_level=opt_level)

torch.cuda.amp.GradScaler(
    init_scale=65536.0, growth_factor=2.0, backoff_factor=0.5,
    growth_interval=2000, enabled=True
)


<torch.cuda.amp.grad_scaler.GradScaler at 0x2071b463160>

In [3]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device = 'cuda'

In [4]:
print(device)

cuda


source
https://colab.research.google.com/drive/1d8N-ZLDS4FcmBDDG19gxbt_lSKglnwKq#scrollTo=mgX-jLdjw3Hl <br>
https://github.com/kaushaltrivedi/bert-toxic-comments-multilabel/blob/master/toxic-bert-multilabel-classification.ipynb

<br>
optimized with: <br>
https://pytorch.org/docs/stable/notes/amp_examples.html

In [5]:
df = pd.read_csv("./data/train.csv")
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'list']].copy()
new_df.head()

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [6]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:

# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()
# Initializing a model from the bert-base-uncased style configuration


In [8]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        # inputs = self.tokenizer.encode_plus(
        #    comment_text,
        #    None,
        #    add_special_tokens=True,
        #    truncation=True,
        #    max_length=self.max_len,
        #    pad_to_max_length=True,
        #    #padding=True,
        #    #padding='longest',
        #    return_token_type_ids=True
        # )
        inputs = tokenizer.encode_plus(
                    comment_text, 
                    add_special_tokens = True,    
                    truncation = True, 
                    padding = "max_length", 
                    return_attention_mask = True, 
                    #return_tensors = "pt"
                    return_token_type_ids=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
new_df[1:1000]

Unnamed: 0,comment_text,list
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"
5,"""\n\nCongratulations from me as well, use the ...","[0, 0, 0, 0, 0, 0]"
...,...,...
995,""" Hi, Writingrights, Welcome to Wikipedia! \n...","[0, 0, 0, 0, 0, 0]"
996,It is common knowledge that Karaims (but not K...,"[0, 0, 0, 0, 0, 0]"
997,", 12 April 2006 (UTC)\nThen rewrite and expand...","[0, 0, 0, 0, 0, 0]"
998,"""I was trying to inject some humour (as eviden...","[0, 0, 0, 0, 0, 0]"


In [10]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8

new_df2 = new_df[1:200]
train_dataset=new_df2.sample(frac=train_size,random_state=200)
test_dataset=new_df2.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df2.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (199, 2)
TRAIN Dataset: (159, 2)
TEST Dataset: (40, 2)


In [11]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [12]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

#class BERTClass(torch.nn.Module):
class BertClass(transformers.PreTrainedModel):
    def __init__(self, config):
        super(BertClass, self).__init__(config)
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        #print(ids)
        #print(mask)
        _, output_1= self.l1(input_ids =ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BertClass(configuration)
# Accessing the model configuration
configuration = model.config
model.to(device)





BertClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [13]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [14]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [15]:
def train(epoch):
    scaler = torch.cuda.amp.GradScaler()
    model.train()

    for _,data in enumerate(training_loader, 0):
        optimizer.zero_grad()
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.half)
            #print(targets)
        with torch.cuda.amp.autocast():
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        loss = loss.detach().cpu().numpy()
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    #torch.cuda.empty_cache()
        #loss.backward()
        #optimizer.step()

In [16]:
for epoch in range(EPOCHS):
    torch.cuda.empty_cache()
    train(epoch)

Epoch: 0, Loss:  0.7017356753349304
Epoch: 0, Loss:  0.15899017453193665
Epoch: 1, Loss:  0.1477043330669403
Epoch: 1, Loss:  0.10058040916919708
Epoch: 2, Loss:  0.07676959037780762
Epoch: 2, Loss:  0.05006703361868858
Epoch: 3, Loss:  0.07001161575317383
Epoch: 3, Loss:  0.03317493200302124
Epoch: 4, Loss:  0.025030270218849182
Epoch: 4, Loss:  0.022473668679594994


In [17]:
 torch.cuda.empty_cache()

In [18]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [30]:
col = ['toxic', 'severe_toxic', 'obscene','threat', 'insult', 'indentity_hate']
#col = [0,1,2,3,4,5]

In [20]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    
    ml_cm = metrics.multilabel_confusion_matrix(targets,outputs, labels=col)

Accuracy Score = 0.85
F1 Score (Micro) = 0.48
F1 Score (Macro) = 0.273015873015873
Accuracy Score = 0.85
F1 Score (Micro) = 0.48
F1 Score (Macro) = 0.273015873015873
Accuracy Score = 0.85
F1 Score (Micro) = 0.48
F1 Score (Macro) = 0.273015873015873
Accuracy Score = 0.85
F1 Score (Micro) = 0.48
F1 Score (Macro) = 0.273015873015873
Accuracy Score = 0.85
F1 Score (Micro) = 0.48
F1 Score (Macro) = 0.273015873015873


In [21]:
type(ml_cm)
print(ml_cm)

[[[34  0]
  [ 3  3]]

 [[38  1]
  [ 1  0]]

 [[36  0]
  [ 3  1]]

 [[39  0]
  [ 1  0]]

 [[35  0]
  [ 3  2]]

 [[39  0]
  [ 1  0]]]


In [27]:
    outputs, targets = validation(epoch)


In [34]:
targets

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
 [1.0, 1.0, 1.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 1.0, 1.0, 1.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
 [0.0, 0.0