# Data Loading

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('NLP_ass_train.tsv', sep='\t')
val = pd.read_csv('NLP_ass_valid.tsv', sep='\t')
test = pd.read_csv('NLP_ass_test.tsv', sep='\t')

train.columns = ['text', 'label']
val.columns = ['text', 'label']
test.columns = ['text', 'label']

In [3]:
train.head()

Unnamed: 0,text,label
0,nawt yall niggers ignoring me,normal
1,<user> i am bit confused coz chinese ppl can n...,hatespeech
2,and this is why i end up with nigger trainee d...,hatespeech
3,nogs jews and dykes how enriching,offensive
4,guilty until proven innocent unless you are a ...,hatespeech


In [4]:
test.head()

Unnamed: 0,text,label
0,laura loomer raped me while screaming at me in...,hatespeech
1,<user> what did the old lady do was she a nazi...,normal
2,as much as i appreciate a format of open discu...,normal
3,sex be so good a bitch be slow stroking and cr...,offensive
4,<user> <user> <user> with a room full of white...,normal


# Data Preprocessing

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
import re

def preprocess(text):
  clean_text = re.sub(r'[^a-zA-Z0-9\s\!\.\?,]', '', text)
  tokens = clean_text.split()
  # stop_words = set(stopwords.words('english'))
  # tokens = [token for token in tokens if token.lower() not in stop_words]
  filtered_text = ' '.join(tokens)
  return filtered_text

In [7]:
train['text'] = train['text'].apply(preprocess)
val['text'] = val['text'].apply(preprocess)
test['text'] = test['text'].apply(preprocess)

In [8]:
train['text']

0                           nawt yall niggers ignoring me
1       user i am bit confused coz chinese ppl can not...
2       and this is why i end up with nigger trainee d...
3                       nogs jews and dykes how enriching
4       guilty until proven innocent unless you are a ...
                              ...                        
7934    in new york city a new law is coming that woul...
7935    news for tacha fans who support pride violence...
7936                           user i saw happy thank you
7937    i had said before and how most fast we could g...
7938    listen of all the things i googled for a book ...
Name: text, Length: 7939, dtype: object

# Model Creation

In [9]:
!pip install transformers[torch] datasets

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transforme

In [10]:
import torch
from transformers import AutoTokenizer

In [11]:
model_id = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Dataset

In [13]:
from torch.utils.data import Dataset, DataLoader

label2id = {
    'normal': 0,
    'hatespeech': 1,
    'offensive': 2
}


class HateSpeechDataset(Dataset):
  def __init__(self, data, tokenizer, max_length):
    self.text = list(data['text'])
    self.labels = list(data['label'].apply(lambda x: label2id[x]))
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.text)

  def __getitem__(self, idx):
    text = self.text[idx]
    label = self.labels[idx]

    encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'label': torch.tensor(label, dtype=torch.long)
    }

In [14]:
max_length = 512
train_dataset = HateSpeechDataset(train, tokenizer, max_length)
val_dataset = HateSpeechDataset(val, tokenizer, max_length)
test_dataset = HateSpeechDataset(test, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Model Class

In [21]:
import torch.nn as nn
from transformers import BertForSequenceClassification

class BERTModel(nn.Module):
  def __init__(self, num_labels, model_name = 'bert-base-uncased'):
    super(BERTModel, self).__init__()
    self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

  def forward(self, input_ids, attention_mask):
    out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    return out.logits

In [22]:
num_labels = len(label2id)
model = BERTModel(num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
lr = 1e-5
epochs = 5

In [24]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

optimizer = optim.Adam(model.parameters(), lr=lr)

loss_fn = nn.CrossEntropyLoss()

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)

BERTModel(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=

In [29]:
prev_val_acc = -1
patience = 3
k = 0

for epoch in range(epochs):
    train_loss = 0.0
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()

    for batch_idx, batch in tqdm(enumerate(train_dataloader)):
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['label'].to(device)
        # labels = batch['label'].unsqueeze(0)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask)

        # labels = labels.type_as(outputs)

        loss = loss_fn(outputs, labels)
        loss.backward()

        optimizer.step()

        train_loss += loss.item()
        # print(loss.item())

    train_loss /= len(train_dataloader)
    model.eval()

    val_loss = 0.0
    val_samples = 0
    val_correct = 0

    with torch.no_grad():
      for batch_idx, batch in tqdm(enumerate(val_dataloader)):
          input_ids = batch['input_ids'].squeeze(1).to(device)
          attention_mask = batch['attention_mask'].squeeze(1).to(device)
          labels = batch['label'].to(device)
          # labels = batch['label'].unsqueeze(0)

          outputs = model(
              input_ids=input_ids,
              attention_mask=attention_mask)

          # labels = labels.type_as(outputs)

          loss = loss_fn(outputs, labels)
          val_loss += loss.item()

          predictions = torch.argmax(outputs, axis=1)

          num_correct = sum(1 for a, b in zip(predictions, labels) if a == b)
          num_samples = predictions.shape[0]

          val_samples += num_samples
          val_correct += num_correct

    val_acc = val_correct / val_samples
    val_loss /= len(val_dataloader)

    if val_acc < prev_val_acc:
      k += 1

    if k == patience:
      break

    prev_val_acc = val_acc

    print(f"Train loss : {train_loss}, Val loss : {val_loss}, Val acc : {val_acc}")

Epoch 1/5


497it [11:06,  1.34s/it]
121it [00:59,  2.03it/s]


Train loss : 0.4185925514945082, Val loss : 0.9390342416287947, Val acc : 0.6684018740239459
Epoch 2/5


497it [11:08,  1.35s/it]
121it [00:59,  2.03it/s]


Train loss : 0.3284333016371103, Val loss : 1.1473595122537337, Val acc : 0.6673607496095784
Epoch 3/5


497it [11:12,  1.35s/it]
121it [00:59,  2.03it/s]


Train loss : 0.24020632186256186, Val loss : 1.1273991323945936, Val acc : 0.6668401874023946
Epoch 4/5


497it [11:07,  1.34s/it]
121it [00:59,  2.03it/s]


In [32]:
from sklearn.metrics import f1_score

test_samples = 0
test_correct = 0

total_labels = []
total_predictions = []

with torch.no_grad():
  for batch_idx, batch in tqdm(enumerate(test_dataloader)):
      input_ids = batch['input_ids'].squeeze(1).to(device)
      attention_mask = batch['attention_mask'].squeeze(1).to(device)
      labels = batch['label'].to(device)
      # labels = batch['label'].unsqueeze(0)

      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask)

      # labels = labels.type_as(outputs)

      predictions = torch.argmax(outputs, axis=1).cpu()

      total_predictions.extend(predictions)
      total_labels.extend(labels.cpu())

      num_correct = sum(1 for a, b in zip(predictions, labels) if a == b)
      num_samples = predictions.shape[0]

      test_samples += num_samples
      test_correct += num_correct

macro_f1 = f1_score(total_labels, total_predictions, average='macro')
test_acc = test_correct/test_samples

print(f"Test acc : {test_acc:.4f}")
print(f"Test macro-F1 : {macro_f1:.4f}")

121it [00:59,  2.04it/s]

Test acc : 0.6417
Test macro-F1 : 0.5898





In [33]:
def find_common_strings(sentences1, sentences2):
  sent1 = set(sentences1)
  sent2 = set(sentences2)

  return len(sent1.intersection(sent2))

In [34]:
train_text = list(train['text'])
val_text = list(val['text'])
test_text = list(test['text'])

print("Number of common sentences (train, test): ", find_common_strings(train_text, test_text))
print("Number of common sentences (val, test): ", find_common_strings(val_text, test_text))

Number of common sentences (train, test):  2
Number of common sentences (val, test):  1
