In [1]:
#Importing Dependencies

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader 
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
#Setting up device

if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.has_mps:
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print(f"Current device: {device}")

Current device: cuda


In [3]:
#Key Parameters

MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 1e-05

In [4]:
#Loading Training data

train_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
print(f"Total Samples : {len(train_data)}")

train_data.head()

Total Samples : 159571


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
#Removing id column and preparing labels into the single list column

train_data.drop(['id'], inplace=True, axis=1)
train_data['labels'] = train_data.iloc[:, 1:].values.tolist()
train_data.drop(train_data.columns.values[1:-1].tolist(), inplace=True, axis=1)
train_data.head()

Unnamed: 0,comment_text,labels
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [6]:
# Data Cleaning
# Lower case
# Remove extra space

train_data["comment_text"] = train_data["comment_text"].str.lower()
train_data["comment_text"] = train_data["comment_text"].str.replace("\xa0", " ", regex=False).str.split().str.join(" ")
train_data['labels'] = train_data['labels'].apply(lambda x: np.array(x))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train_data['comment_text'], train_data['labels'],test_size=0.2)

In [8]:
y_train = np.array(y_train.tolist())
y_test = np.array(y_test.tolist())
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())

In [9]:
y_train.sum(axis=0)

array([12259,  1296,  6778,   385,  6340,  1131])

In [10]:
# Creating a dataset class that outputs the token id, token mask, and token type id of the required sentence

class MultiLabelDataset(Dataset):

    def __init__(self, X, y, tokenizer, max_len):
        self.text = X
        self.target = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        target = self.target[index]
        text = str(self.text[index])
        # text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        output = {
            'ids': torch.tensor(ids, dtype=torch.float32),
            'mask': torch.tensor(mask, dtype=torch.float32),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.float32),
            'target': torch.tensor(target, dtype=torch.float32)
         }
                
        return output

In [11]:
# Creating tokenizer object
# Creating dataset object
#Creating dataloader for the dataset
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

train_set = MultiLabelDataset(X_train, y_train, tokenizer, MAX_LEN)
train_loader = DataLoader(train_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

test_set = MultiLabelDataset(X_test, y_test, tokenizer, MAX_LEN)
test_loader = DataLoader(test_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [12]:
#Creating the custom DistilBERT model class

class DistilBERTClass(nn.Module):
    
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [13]:
#Creating model object

model = DistilBERTClass()
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [14]:
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [15]:
def train(epochs):
    mean_loss = 0
    mean_val_loss = 0
    step = 0
    val_step = 1
    print(f'Training for {epochs} epoch(s)')
    for epoch in range(1, epochs+1):
        model.train()
        for data in tqdm(train_loader):
            step+=1
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['target'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            
            mean_loss += loss.item()
            
        model.eval()
        with torch.inference_mode():
            for val_data in tqdm(test_loader):
                val_step+=1
                val_ids = val_data['ids'].to(device, dtype = torch.long)
                val_mask = val_data['mask'].to(device, dtype = torch.long)
                val_token_type_ids = val_data['token_type_ids'].to(device, dtype = torch.long)
                val_targets = val_data['target'].to(device, dtype = torch.float)

                val_outputs = model(val_ids, val_mask, val_token_type_ids)

                val_loss = loss_fn(val_outputs, val_targets)
                mean_val_loss+=val_loss.item()
                
        mean_loss /= step
        mean_val_loss /= val_step
        print(f'Epoch: {epoch}, Loss:  {mean_loss}, Val_Loss:  {mean_val_loss}')
        torch.save(model.state_dict(), 'model_checkpoint.pth')
        
    torch.save(model.state_dict(), f'model_{mean_val_loss:.3f}.pth')

In [16]:
train(EPOCHS)

Training for 2 epoch(s)


  0%|          | 0/3990 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 3990/3990 [31:36<00:00,  2.10it/s]
100%|██████████| 998/998 [03:41<00:00,  4.50it/s]


Epoch: 1, Loss:  0.05238986257393669, Val_Loss:  0.03887622743167703


100%|██████████| 3990/3990 [31:37<00:00,  2.10it/s]
100%|██████████| 998/998 [03:40<00:00,  4.52it/s]


Epoch: 2, Loss:  0.01756992314089262, Val_Loss:  0.01843887561265496


In [19]:
print('five')

five
