In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/MyDrive/NLP\ Project

Mounted at /content/gdrive
/content/gdrive/MyDrive/NLP Project


In [12]:
! pip install -qq transformers

In [44]:
import numpy as np
import pickle
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader

# Import Data

In [7]:
with open("HateXPlainData/trainHateXplain", "rb") as file:
  train_data = pickle.load(file)

with open("HateXPlainData/valHateXplain", "rb") as file:
  val_data = pickle.load(file)

with open("HateXPlainData/testHateXplain", "rb") as file:
  test_data = pickle.load(file)

In [30]:
print(len(train_data), len(val_data), len(test_data))

15383 1922 1924


# Data Pre-Processing

In [32]:
BERT_MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 100
BATCH_SIZE = 240

tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL_NAME)
labels = {'offensive': 2, 'hatespeech': 1, 'normal': 0}

In [42]:
class DataSet:
  def __init__(self, data):
    self.labels = []
    self.text = []

    for post in data:
      self.labels.append(labels[post[1]])
      self.text.append(tokenizer.encode_plus(" ".join(post[0]), 
                                             add_special_tokens=True,
                                             truncation=True, 
                                             max_length=MAX_LEN, 
                                             return_token_type_ids=False, 
                                             padding='max_length', 
                                             return_attention_mask=True, 
                                             return_tensors='pt'
                                             )
                      )
      
  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    batch_texts = self.text[idx]
    batch_y = np.array(self.labels[idx])

    return batch_texts, batch_y

In [45]:
train, val, test = DataSet(train_data), DataSet(val_data), DataSet(test_data)
train_dataloader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test, batch_size=BATCH_SIZE)

In [61]:
data = next(iter(train_dataloader))
print(data)

[{'input_ids': tensor([[[  101, 22091,  1996,  ...,     0,     0,     0]],

        [[  101, 12461,  2024,  ...,     0,     0,     0]],

        [[  101,  4827,  2003,  ...,     0,     0,     0]],

        ...,

        [[  101,  3398,  2009,  ...,     0,     0,     0]],

        [[  101,  2043, 10643,  ...,     0,     0,     0]],

        [[  101,  2017,  2828,  ...,     0,     0,     0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]])}, tensor([2, 1, 0, 1, 0, 2, 2, 2, 2, 1, 0, 0, 2, 0, 2, 0, 0, 1, 0, 1, 0, 1, 2, 0,
        0, 2, 2, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 1, 0, 2, 0, 1, 2, 0, 0, 2, 1,
        1, 0, 2, 0, 0, 1, 1, 1, 1, 2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 0, 1, 1, 1,
        0, 1, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 2, 0, 1, 0, 1, 0, 2, 0,
        2, 2, 1, 0, 2, 1, 0,

In [62]:
model = DistilBertModel.from_pretrained(BERT_MODEL_NAME)
input = data[0]['input_ids'].squeeze(1)
mask = data[0]['attention_mask']
last_hidden_state, pooled_output = model(input_ids=input, attention_mask=mask)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: ignored

# Tweet Classification Model w/ DistilBERT

In [63]:
class TweetClassifier(nn.Module):
  def __init__(self, num_classes):
    super(TweetClassifier, self).__init__()

    self.bert = DistilBertModel.from_pretrained(BERT_MODEL_NAME)
    self.drop_layer = nn.Dropout(p=0.3)
    self.out_layer = nn.Linear(self.bert.config.hidden_size, num_classes)
    self.out_act = nn.Softmax(dim=1)

  def forward(self, input, mask):

    _, out = self.bert(input_ids=input, attention_mask=mask, return_dict=False)
    drop_out = self.drop_layer(out)
    output = self.out_layer(drop_out)
    return self.out_act(output)

In [64]:
tmodel = TweetClassifier(len(labels))

tmodel(data[0]['input_ids'].squeeze(1), data[0]['attention_mask'])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: ignored