In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/MyDrive/NLP\ Project

Mounted at /content/gdrive
/content/gdrive/MyDrive/NLP Project


In [None]:
! pip install -qq transformers

[K     |████████████████████████████████| 3.4 MB 12.6 MB/s 
[K     |████████████████████████████████| 895 kB 40.4 MB/s 
[K     |████████████████████████████████| 61 kB 475 kB/s 
[K     |████████████████████████████████| 596 kB 42.5 MB/s 
[K     |████████████████████████████████| 3.3 MB 44.1 MB/s 
[?25h

In [None]:
import numpy as np
import pickle
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader

# Import Data

In [None]:
with open("HateXPlainData/trainHateXplain", "rb") as file:
  train_data = pickle.load(file)

with open("HateXPlainData/valHateXplain", "rb") as file:
  val_data = pickle.load(file)

with open("HateXPlainData/testHateXplain", "rb") as file:
  test_data = pickle.load(file)

# Data Pre-Processing

In [48]:
BERT_MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 100
BATCH_SIZE = 30

tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL_NAME)
labels = {'offensive': 2, 'hatespeech': 1, 'normal': 0}
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [49]:
class DataSet:
  def __init__(self, data, tokenizer, class_labels):
    
    self.text, self.labels = zip(*data)
    self.tokenizer = tokenizer
    self.classes = class_labels
      
  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):

    text_encoding = self.tokenizer.encode_plus(" ".join(self.text[idx]), 
                                               add_special_tokens=True,
                                               truncation=True,
                                               max_length=MAX_LEN, 
                                               return_token_type_ids=False, 
                                               padding='max_length', 
                                               return_attention_mask=True, 
                                               return_tensors='pt'
                                              )

    return {'text': " ".join(self.text[idx]),
            'input_ids': text_encoding['input_ids'].flatten(),
            'attention_mask': text_encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.classes[self.labels[idx]])
            }

In [50]:
train = DataSet(train_data, tokenizer, labels)
val = DataSet(val_data, tokenizer, labels)
test = DataSet(test_data, tokenizer, labels)
train_dataloader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test, batch_size=BATCH_SIZE)

In [51]:
data = next(iter(train_dataloader))
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['label'].shape)

torch.Size([30, 100])
torch.Size([30, 100])
torch.Size([30])


# Tweet Classification Model w/ DistilBERT

In [96]:
class TweetClassifier(nn.Module):
  def __init__(self, num_classes):
    super(TweetClassifier, self).__init__()

    self.bert = DistilBertModel.from_pretrained(BERT_MODEL_NAME)
    self.drop_layer = nn.Dropout(p=0.3)
    self.out_layer = nn.Linear(self.bert.config.hidden_size, num_classes)
    self.out_act = nn.Softmax(dim=1)
    self.loss_fn = nn.CrossEntropyLoss()

  def forward(self, input, mask):

    bert_output = self.bert(input_ids=input, attention_mask=mask)
    p_out = bert_output[0][:, 0]
    drop_out = self.drop_layer(p_out)
    output = self.out_layer(drop_out)
    return self.out_act(output)

  def fit(self, train_dl, val_dl, epochs, optim):

    train_data = train_dl
    train_size = len(train_data.dataset.text)

    optimizer = optim

    for i in range(epochs):
      self.train()
      train_loss = 0
      train_acc = 0
      print(f'Epochs: {i + 1}')

      for data in tqdm(train_data):
        inputs = data['input_ids'].to(device)
        masks = data['attention_mask'].to(device)
        targets = data['label'].to(device)

        outputs = self.forward(inputs, masks) 

        batch_loss = self.loss_fn(outputs, targets)
        train_loss += batch_loss.item()

        acc = (outputs.argmax(dim=1) == targets).sum().item()
        train_acc += acc

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

      print(f'Train Loss: {train_loss / train_size: .3f} | Train Accuracy: {train_acc / train_size: .3f}')

      self.evaluate(val_dl, calc_loss=True)

  def evaluate(self, eval_dl, calc_loss=False):
    self.eval()

    eval_data = eval_dl
    eval_size = len(eval_data.dataset.text)
    eval_acc = 0
    if calc_loss:
      eval_loss = 0

    predictions = torch.tensor([])

    with torch.no_grad():
      
      for data in tqdm(eval_data):
        inputs = data['input_ids'].to(device)
        masks = data['attention_mask'].to(device)
        targets = data['label'].to(device)

        outputs = self.forward(inputs, masks)

        if calc_loss:
          batch_loss = self.loss_fn(outputs, targets)
          eval_loss += batch_loss.item()

        batch_pred = outputs.argmax(dim=1)
        predictions = torch.cat((predictions, batch_pred.to(device="cpu")), 0)
        acc = (batch_pred == targets).sum().item()
        eval_acc += acc

    if calc_loss:
      print(f'Eval Loss: {eval_loss / eval_size: .3f} | Eval Accuracy: {eval_acc / eval_size: .3f}')
    else:
      print(f'Eval Accuracy: {eval_acc / eval_size: .3f}')

    return predictions


In [103]:
tmodel = TweetClassifier(len(labels)).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [104]:
optim = Adam(tmodel.parameters(), lr=1e-4)

tmodel.fit(train_dataloader, val_dataloader, 5, optim)

Epochs: 1


100%|██████████| 513/513 [04:16<00:00,  2.00it/s]


Train Loss:  0.030 | Train Accuracy:  0.625


100%|██████████| 65/65 [00:13<00:00,  4.83it/s]


Eval Loss:  0.030 | Eval Accuracy:  0.647
Epochs: 2


100%|██████████| 513/513 [04:16<00:00,  2.00it/s]


Train Loss:  0.030 | Train Accuracy:  0.659


100%|██████████| 65/65 [00:13<00:00,  4.84it/s]


Eval Loss:  0.030 | Eval Accuracy:  0.649
Epochs: 3


100%|██████████| 513/513 [04:16<00:00,  2.00it/s]


Train Loss:  0.029 | Train Accuracy:  0.689


100%|██████████| 65/65 [00:13<00:00,  4.85it/s]


Eval Loss:  0.030 | Eval Accuracy:  0.663
Epochs: 4


100%|██████████| 513/513 [04:16<00:00,  2.00it/s]


Train Loss:  0.028 | Train Accuracy:  0.710


100%|██████████| 65/65 [00:13<00:00,  4.83it/s]


Eval Loss:  0.030 | Eval Accuracy:  0.661
Epochs: 5


100%|██████████| 513/513 [04:16<00:00,  2.00it/s]


Train Loss:  0.028 | Train Accuracy:  0.712


100%|██████████| 65/65 [00:13<00:00,  4.83it/s]

Eval Loss:  0.030 | Eval Accuracy:  0.647





In [112]:
y_true = np.array([labels[label] for label in test.labels])
y_pred = tmodel.evaluate(test_dataloader)

100%|██████████| 65/65 [00:15<00:00,  4.31it/s]

Eval Accuracy:  0.652





In [113]:
print(y_pred)
print(y_true)
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))

tensor([0., 0., 2.,  ..., 2., 1., 1.])
[0 0 2 ... 2 1 1]
              precision    recall  f1-score   support

           0       0.70      0.73      0.71       782
           1       0.71      0.73      0.72       594
           2       0.51      0.46      0.48       548

    accuracy                           0.65      1924
   macro avg       0.64      0.64      0.64      1924
weighted avg       0.65      0.65      0.65      1924

[[572  66 144]
 [ 61 431 102]
 [188 109 251]]
