<a href="https://colab.research.google.com/github/kshitijahande/Hate-Detection/blob/main/3_classifier_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qq ipynb
!pip install -qq plotly
!pip install -qq import-ipynb
!pip install -qq icecream
!pip install -qq transformers
!pip install -qq datasets


  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.6 MB 8.4 MB/s 
[K     |████████████████████████████████| 636 kB 59.3 MB/s 
[K     |████████████████████████████████| 895 kB 59.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 60.0 MB/s 
[K     |████████████████████████████████| 264 kB 8.2 MB/s 
[K     |████████████████████████████████| 243 kB 65.9 MB/s 
[K     |████████████████████████████████| 118 kB 68.9 MB/s 
[K     |████████████████████████████████| 76 kB 6.5 MB/s 
[?25h

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import random as rn
from icecream import ic 
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import os

In [None]:
# Choose the compute device: CUDA when torch reports it available, else CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# Seed the PyTorch (CPU and, when present, CUDA) and NumPy generators
# with one shared value.
seed = 1
torch.manual_seed(seed)
if cuda_available:
    torch.cuda.manual_seed(seed)
    # cuDNN benchmark mode is switched on only when a GPU is in use.
    torch.backends.cudnn.benchmark = True
np.random.seed(seed)

# Echo the chosen configuration.
ic(torch.cuda.is_available(), device, seed)

ic| torch.cuda.is_available(): True
    device: device(type='cuda')
    seed: 1


(True, device(type='cuda'), 1)

# Load the dataset with the Hugging Face `datasets` library

In [None]:
# Fetch the 'train' split of the 'hate_speech_offensive' dataset.
dataset = load_dataset(path='hate_speech_offensive', split='train')


Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/823 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset hate_speech_offensive/default (download: 2.43 MiB, generated: 3.06 MiB, post-processed: Unknown size, total: 5.49 MiB) to /root/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5...


Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

Dataset hate_speech_offensive downloaded and prepared to /root/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5. Subsequent calls will reuse this data.


In [None]:
# Label scheme of the 'class' column (Task 1 - multiclass classification):
#   0: "hate speech"
#   1: "offensive language"
#   2: "neither"
#
# Kept as comments rather than a bare string literal so the cell's value is
# the row count from the final ic() call, not the text of this note.

#Columns in dataset
ic(dataset[1]['tweet'])
ic(dataset[1])
ic(dataset.column_names)
ic(dataset.num_rows)

ic| dataset[1]['tweet']: ('!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the '
                          '1st place!!')
ic| dataset[1]: {'class': 1,
                 'count': 3,
                 'hate_speech_count': 0,
                 'neither_count': 0,
                 'offensive_language_count': 3,
                 'tweet': '!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe '
                          'in the 1st place!!'}
ic| dataset.column_names: ['count',
                           'hate_speech_count',
                           'offensive_language_count',
                           'neither_count',
                           'class',
                           'tweet']
ic| dataset.num_rows: 24783


24783

In [None]:
#Shuffle and split train and test dataset randomly
train_test_split = dataset.train_test_split(shuffle=True)

In [None]:
#Get train and set data from original dataset
test = train_test_split.get('test')
train = train_test_split.get('train')

#convert to data frame
test_df = test.to_pandas()
train_full_df = train.to_pandas()

# split train dev/validation set
train_df, val_df = train_test_split(train_full_df, test_size=0.2)


In [None]:
# split train dev/validation set
# Importing here guarantees `train_test_split` refers to the sklearn function
# when this cell runs. random_state makes the partition identical on every
# execution of the cell instead of depending on the global NumPy RNG state.
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(train_full_df, test_size=0.2, random_state=seed)


In [None]:
# Match total rows in each data frame
ic(train_df.shape[0])
ic(test_df.shape[0])
ic(val_df.shape[0])
total = train_df.shape[0] + test_df.shape[0] + val_df.shape[0]
ic(total)
# Enforce the check instead of relying on a visual comparison of the printout.
assert total == dataset.num_rows, (
    "train/val/test rows ({}) do not add up to the dataset size ({})".format(
        total, dataset.num_rows))
ic(dataset.num_rows)

ic| train_df.shape[0]: 14869
ic| test_df.shape[0]: 6196
ic| val_df.shape[0]: 3718
ic| total: 24783
ic| dataset.num_rows: 24783


24783

In [None]:
# Training examples per value of the 'class' column, rarest label first
class_counts = train_df['class'].value_counts(ascending=True)
class_counts


0      869
2     2552
1    11448
Name: class, dtype: int64

# Dataset and Model

In [None]:
#AVERAGE LOSS
class AverageMeter:
    """Track the latest value and the running weighted mean of a series.

    Attributes:
        val: the value passed to the most recent ``update`` call.
        sum: total of ``val * n`` over all updates since the last reset.
        count: total of ``n`` over all updates since the last reset.
        avg: ``sum / count`` as of the most recent update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as having been observed ``n`` times."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count
        
# CUSTOM DATASET CLASS
class MyCustomDataset(Dataset): #dataset from torch
    """Torch dataset that tokenizes the 'tweet' column of a DataFrame on access.

    Args:
        dataframe: pandas DataFrame with a 'class' (label) column and a
            'tweet' (text) column. Its index may be arbitrary, e.g. the
            shuffled index left behind by a train/validation split.
        basenet: name passed to ``AutoTokenizer.from_pretrained``.
        max_len: length every example is padded or truncated to.

    Each item is a dict with keys 'input_id', 'attention_mask_id',
    'segment_id' and 'true_label'. The first three carry a leading singleton
    dimension (presumably shape (1, max_len), as produced by
    ``return_tensors='pt'`` for a single text).
    """

    def __init__(self, dataframe, basenet= 'bert-base-uncased', max_len= 128):
        super(MyCustomDataset, self).__init__()
        self.data = dataframe #train_df
        self.labels = self.data['class']
        self.text = self.data['tweet']
        self.tokenizer = AutoTokenizer.from_pretrained(basenet)
        self.max_len = max_len
        # NOTE(review): every token is assigned segment id 1. BERT-style
        # single-sentence inputs conventionally use 0 - confirm this is intended.
        self.segment_id = torch.tensor([1] * self.max_len).view(1, -1)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # A DataLoader passes positions 0..len-1. Use .iloc so the lookup is
        # positional; plain [] on a Series is label-based and raises KeyError
        # (or returns the wrong row) when the DataFrame index is not 0..len-1.
        label = torch.tensor(self.labels.iloc[idx])
        text = self.text.iloc[idx]

        encoded_input = self.tokenizer(text= text, # sentence
                                       add_special_tokens= True, # [CLS] [SEP]
                                       max_length= self.max_len,
                                       padding= 'max_length',
                                       return_attention_mask= True,
                                       return_tensors = 'pt',  # return torch tensors
                                       truncation= True
                                       )
        item = {"input_id": encoded_input['input_ids'],
                "attention_mask_id": encoded_input['attention_mask'],
                "segment_id": self.segment_id,
                "true_label": label
                }

        return item

In [None]:
#MODEL ARCHITECTURE CLASS
class MyClassifier(torch.nn.Module):
    """Pretrained encoder followed by a two-layer fully connected head.

    Args:
        basenet: name passed to ``AutoModel.from_pretrained``.
        n_outputs: number of logits produced per example.
        n_hidden: width of the hidden layer in the classification head.

    ``forward`` returns the raw head outputs (no softmax or sigmoid applied),
    one row per example and one column per output.
    """

    def __init__(self, basenet='bert-base-uncased', n_outputs=2, n_hidden=256):
        super(MyClassifier, self).__init__()
        self.encoder = AutoModel.from_pretrained( basenet, output_hidden_states = False)
        # Size the head from the encoder instead of hard-coding 768; fall back
        # to 768 when the config does not expose hidden_size.
        encoder_dim = getattr(self.encoder.config, 'hidden_size', 768)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(in_features= encoder_dim, out_features= n_hidden),
            torch.nn.ReLU(),
            torch.nn.Dropout(p= 0.2),
            # in_features must follow n_hidden; a literal 256 here breaks
            # every n_hidden other than the default.
            torch.nn.Linear(in_features= n_hidden, out_features= n_outputs)
        )
        # Not used by forward(); kept so the attribute remains available.
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_id, attention_mask_id, token_type_id):
        embedding = self.encoder(input_ids= input_id,
                                 attention_mask= attention_mask_id,
                                 token_type_ids= token_type_id)['pooler_output']
        out = self.classifier(embedding)  # [batch size, n_outputs]
        return out

# Train and Test functions

In [None]:
# TRAINING FUNCTION
def train(model, 
          dl, 
          optimizer, 
          device,
          criterion = torch.nn.CrossEntropyLoss()
          ):
    """Run one training epoch and return the example-weighted mean loss.

    Args:
        model: module called as ``model(input_id, attention_mask_id, segment_id)``
            that returns one row of class logits per example.
        dl: iterable of batches; each batch is a dict with keys 'true_label',
            'input_id', 'attention_mask_id' and 'segment_id'.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        criterion: loss called as ``criterion(logits, integer_targets)``.

    Returns:
        The average loss over all examples seen in the epoch.
    """
    model.train()
    loss_avg = AverageMeter()

    for batch in dl: #iterate in batch
        targets = batch['true_label'].to(device).view(-1).long()
        # The inputs arrive with a singleton middle dimension
        # ([batch, 1, max_len]). squeeze(1) removes only that dimension; a
        # bare squeeze() would also drop the batch dimension when a batch
        # holds a single example.
        input_id = batch['input_id'].to(device).squeeze(1)
        attention_mask_id = batch['attention_mask_id'].to(device).squeeze(1)
        segment_id = batch['segment_id'].to(device).squeeze(1)

        outputs = model(input_id, attention_mask_id, segment_id) # [batch size, model n_outputs]

        # Use every logit column. Slicing to the first two columns would make
        # any label >= 2 an out-of-range target for the loss.
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_avg.update(loss.item(), targets.shape[0])

    return loss_avg.avg


# TESTING/EVALUATION FUNCTION

from sklearn.metrics import f1_score, accuracy_score

def test(model, 
         dl, 
         device, 
         criterion = torch.nn.CrossEntropyLoss(),
         average = None):
    """Evaluate ``model`` on ``dl`` without gradient tracking.

    Args:
        model: module called as ``model(input_id, attention_mask_id, segment_id)``
            that returns one row of class logits per example.
        dl: iterable of batches; each batch is a dict with keys 'true_label',
            'input_id', 'attention_mask_id' and 'segment_id'.
        device: device the batch tensors are moved to.
        criterion: loss called as ``criterion(logits, integer_targets)``.
        average: value forwarded to ``f1_score(average=...)``. When None it is
            'binary' if no label or prediction exceeds 1 and 'macro' otherwise;
            the 'binary' default of f1_score is rejected for targets with more
            than two classes.

    Returns:
        ``(mean_loss, [f1, accuracy])`` computed over every example in ``dl``.

    Raises:
        ValueError: if ``dl`` yields no batches.
    """
    model.eval()
    loss_avg = AverageMeter()
    pred_chunks = []    # predicted class ids, one tensor per batch
    target_chunks = []  # true class ids, one tensor per batch

    with torch.no_grad():
        for batch in dl:
            targets = batch['true_label'].to(device).view(-1).long()
            # squeeze(1) drops only the singleton middle dimension, so a
            # batch holding a single example keeps its batch dimension.
            input_id = batch['input_id'].to(device).squeeze(1)
            attention_mask_id = batch['attention_mask_id'].to(device).squeeze(1)
            segment_id = batch['segment_id'].to(device).squeeze(1)

            outputs = model(input_id, attention_mask_id, segment_id)
            # All logit columns take part in the loss and the argmax, so
            # labels beyond the first two classes are handled.
            loss = criterion(outputs, targets)

            pred_chunks.append(torch.argmax(outputs, dim=1))
            target_chunks.append(targets)
            loss_avg.update(loss.item(), targets.shape[0])

    if not target_chunks:
        raise ValueError("test() received a dataloader that produced no batches")

    y_pred = torch.cat(pred_chunks).cpu().numpy()
    y_true = torch.cat(target_chunks).cpu().numpy()

    if average is None:
        average = 'binary' if max(y_true.max(), y_pred.max()) <= 1 else 'macro'

    fscore = f1_score(y_true, y_pred, average=average)
    accuracy = accuracy_score(y_true, y_pred)

    return loss_avg.avg, [fscore, accuracy]


# Data Loader

In [None]:
# LOAD DATASET
# Cap the worker processes at the CPUs the machine reports: asking for 32 on a
# smaller machine triggers the DataLoader "worker processes" warning.
loader_workers = min(32, os.cpu_count() or 1)

# Only the training loader shuffles. The validation and test metrics (mean
# loss, F1, accuracy) do not depend on example order.
train_dl = torch.utils.data.DataLoader(
    dataset=MyCustomDataset(dataframe=train_df, basenet='bert-base-uncased'),
    batch_size=32, shuffle=True, num_workers=loader_workers, pin_memory=True)
val_dl = torch.utils.data.DataLoader(
    dataset=MyCustomDataset(dataframe=val_df, basenet='bert-base-uncased'),
    batch_size=128, shuffle=False, num_workers=loader_workers, pin_memory=True)
test_dl = torch.utils.data.DataLoader(
    dataset=MyCustomDataset(dataframe=test_df, basenet='bert-base-uncased'),
    batch_size=128, shuffle=False, num_workers=loader_workers, pin_memory=True)


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

  cpuset_checked))


# Model Selection

In [None]:
# MODEL SELECTION
# One output logit per distinct label in the 'class' column. The column holds
# the labels 0, 1 and 2, so a two-output head cannot represent label 2.
n_classes = int(train_df['class'].nunique())
model = MyClassifier('bert-base-uncased', n_outputs=n_classes) #experiment - bert with fc

# Wrap the model for multi-GPU execution when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Start Training

In [None]:
# START TRAINING
learning_rate = 2e-5
start_epoch = 1
epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_loss_arr = []  # mean training loss per epoch
val_loss_arr = []    # mean validation loss per epoch
metrics_arr = [] #[f1 , accuracy]

for ep in range(start_epoch, start_epoch + epochs):
  train_loss = train(model, train_dl, optimizer, device)
  val_loss, metrics = test(model, val_dl, device)
  train_loss_arr.append(train_loss)
  val_loss_arr.append(val_loss)
  metrics_arr.append(metrics)
  # Log the epoch's numbers; a bare ic() only prints the call site.
  ic(ep, train_loss, val_loss, metrics)

# SAVE WEIGHTS

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell will fail elsewhere - confirm before reusing.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pwd
%cd /content/drive/MyDrive/Colab Notebooks/Spring 21/Research Methodology (RM)
!ls
save_dir = os.path.join('weights')
if not os.path.exists(save_dir):
  print("Creating directory {}".format(save_dir))
  os.makedirs(save_dir)



/content/drive/MyDrive/Colab Notebooks/Spring 21/Research Methodology (RM)
/content/drive/MyDrive/Colab Notebooks/Spring 21/Research Methodology (RM)
 2-classifier-tfidf-pos-logistic-regression.ipynb  'Copy of my-bert-main.ipynb'
