In [1]:
#!pip install transformers
#!git clone https://github.com/joseph1723/CS376_Final_Project.git

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification, BertModel

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
#'murali1996/bert-base-cased-spell-correction'
model = BertForSequenceClassification.from_pretrained('bert-base-cased')

if torch.cuda.is_available() :
  print("CUDA")
  device = torch.device("cuda")
  model.to(device)
elif torch.backends.mps.is_available() :
  print("M1-mps")
  device = torch.device("mps")
  model.to(device)
else :
  print("CPU")
  device = torch.device("cpu")
  model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

M1-mps


In [4]:
class TestDataset(Dataset) :
  #Dataset - English/typo-added/labeled
  def __init__(self, df) :
    self.df = df
  
  def __len__(self) :
    return len(self.df)
  
  def __getitem__(self, idx):
    text = self.df.iloc[idx, 0]
    label = self.df.iloc[idx, 1]
    return text, label

Hyperparameters

In [5]:
train_rate, test_rate = 0.5, 0.05
itr = 1
p_itr = 100
epochs = 5
batch = 10

In [6]:
total_df = pd.read_csv('augmented_data/Dataset_aug_complex_6513_.csv', sep=',')
total_df.dropna(inplace=True)
total_df = total_df[["text", "label"]]
total_df["label"] = [1 if i == "nothate" else 0 for i in total_df["label"]]
print(total_df)
total_dataset = TestDataset(total_df)
total_loader = DataLoader(total_dataset, batch_size=batch, shuffle=True)
max_len = 0

for text, _ in total_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    l = max(len(e) for e in encoded_list)
    max_len = l if l>max_len else max_len

print(max_len)

                                                    text  label
0                            gypsies don't do shit wrong      1
1                            gypsies don't do aYit wrong      1
2                Hello all sorts of nasty shit, I'm dad!      1
3                Hello all sorts of nasty sMi6, I'm dad!      1
4       Don't be such a pussy or I will call you a homo!      0
...                                                  ...    ...
13021  We're full up, fjnt, f7vk off back to whicheve...      0
13022  Yeah didn't he hear? Black people aren't allow...      1
13023  Yeah didn't he hear? Black people aren't allow...      1
13024  i have realised and found out people are reall...      1
13025  i have realised and found out people are reall...      1

[13026 rows x 2 columns]
137


In [7]:
#Train Set
test_df, train_df, _ = np.split(total_df, [int(test_rate*len(total_df)), int(test_rate*len(total_df) + train_rate*len(total_df))])
print(len(test_df), len(train_df))
train_dataset = TestDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True)

651 6513


In [8]:
class CustomBERTModel(nn.Module):
    def __init__(self):
          super(CustomBERTModel, self).__init__()
          self.bert = BertModel.from_pretrained('bert-base-cased')
          # add your additional layers here, for example a dropout layer followed by a linear classification head
          self.dropout = nn.Dropout(0.3)
          self.out = nn.Linear(768, 1)

    def forward(self, ids, mask=None, token_type_ids=None, labels=None):
          sequence_output, pooled_output = self.bert(
               ids, 
               attention_mask=mask,
               token_type_ids=token_type_ids,
               return_dict=False
          )

          # we apply dropout to the pooled output, tensor has shape (batch_size, 1, 768)
          #sequence_output = self.dropout(sequence_output)
          pooled_output = self.dropout(pooled_output)
    
          # next, we apply the linear layer. (which applies a linear transformation)
          # (batch_size, 1, 768) -> (batch_size, 1, 1)
          logits = self.out(pooled_output)

          loss_fct = nn.MSELoss()
          loss = loss_fct(logits.view(-1).to(torch.float32), labels.view(-1).to(torch.float32))

          return loss, logits

# Test & Evaluation
Possible improvements
1. Classification-layer structure : pooling, dropout, linear, fully-connected, loss function, etc.
2. Dataset size, test/train split, batch size, Epoch's, etc.
3. Hyperparameters - learning rate, dropout, etc.

In [9]:
optimizer = Adam(model.parameters(), lr=1e-6)
model_custom = CustomBERTModel()
model_custom.to(device)

total_loss = 0
total_len = 0
total_correct = 0
X=1

model_custom.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        MAX_LEN = max(len(e) for e in encoded_list)
        padded_list =  [e + [0] * (MAX_LEN-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model_custom(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=X)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0
        itr+=1

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  labels = torch.tensor(label)
  pred = torch.argmax(F.softmax(logits), dim=X)


NotImplementedError: The operator 'aten::native_layer_norm_backward' is not current implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

In [None]:
model.eval()

total_loss = 0
total_len = 0
total_correct = 0

for text, label in test_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    MAX_LEN = max(len(e) for e in encoded_list)
    padded_list =  [e + [0] * (MAX_LEN-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model_custom(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)