Test #1. Pre-Trained BERT Classifier / Misspelled Text

In [95]:
!pip install pytorch-transformers



In [96]:
!pip install nlpaug



In [97]:
!git clone https://github.com/joseph1723/CS376_Final_Project.git

fatal: destination path 'CS376_Final_Project' already exists and is not an empty directory.


In [98]:
import nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.word.context_word_embs as nawcwe
import nlpaug.augmenter.word.word_embs as nawwe
import nlpaug.augmenter.word.spelling as naws

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F

In [99]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
if torch.cuda.is_available() :
  device = torch.device("cuda")
  model.to(device)
else :
  device = torch.device("cpu")
  model.to(device)

In [100]:
class TestDataset(Dataset) :
  #Dataset - English/typo-added/labeled
  def __init__(self, df) :
    self.df = df
  
  def __len__(self) :
    return len(self.df)
  
  def __getitem__(self, idx):
    text = self.df.iloc[idx, 0]
    label = self.df.iloc[idx, 1]
    return text, label

Hyperparameters

In [110]:
train_size = 1000
test_size = 100
itr = 1
p_itr = 100
epochs = 1

In [112]:
total_df = pd.read_csv('/content/CS376_Final_Project/2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv', sep=',')
total_df.dropna(inplace=True)

#Train Set - 50
train_df = total_df.sample(train_size, random_state=999)
train_df = train_df[["text", "label"]]
train_df["label"] = [1 if i == "nothate" else 0 for i in train_df["label"]]

#Test Set - 10
test_df = total_df.sample(test_size, random_state=999)
test_df = test_df[["text", "label"]]
test_df["label"] = [1 if i == "nothate" else 0 for i in test_df["label"]]


In [113]:
train_dataset = TestDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2)

test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=2)

Data Augmentation / Generating Typo using nlpaug

In [114]:
def augment_data(aug, dataframe, key, N):
  dataframe_copy = dataframe.copy()
  rows = []
  for i, row in dataframe.iterrows() :
    line = row[key]
    line_augmented = aug.augment(line)
    row.iloc[0] = line_augmented
    row_list = list(row)
    rows.append(row_list)
  rows = pd.DataFrame(rows, columns = ['text', 'label'])
  dataframe_copy = dataframe_copy.append(pd.DataFrame(rows), ignore_index=True)
  return dataframe_copy

In [115]:
aug = naw.SpellingAug()
n = 1
key = "text"

train_aug_df = augment_data(aug, train_df, key, n)
test_aug_df = augment_data(aug, test_df, key, n)

In [116]:
train_dataset = TestDataset(train_aug_df)
test_dataset = TestDataset(test_aug_df)
#train_dataset = TestDataset(train_df)
#test_dataset = TestDataset(test_df)


train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=2)

Test & Evaluation

*주의해야하는 것은, 학습 샘플의 인풋이 (batch_size, sequence_length)로 들어간다는 것이다. 따라서 zero-padding을 직접 해줘서 model의 forward에 넣어줘야한다.

In [117]:
optimizer = Adam(model.parameters(), lr=1e-6)

total_loss = 0
total_len = 0
total_correct = 0


model.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        MAX_LEN = max(len(e) for e in encoded_list)
        padded_list =  [e + [0] * (MAX_LEN-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0
        itr+=1






[Epoch 1/1] Iteration 100 -> Train Loss: 0.7016, Accuracy: 0.500
[Epoch 1/1] Iteration 200 -> Train Loss: 0.6677, Accuracy: 0.630
[Epoch 1/1] Iteration 300 -> Train Loss: 0.6135, Accuracy: 0.710
[Epoch 1/1] Iteration 400 -> Train Loss: 0.6781, Accuracy: 0.550
[Epoch 1/1] Iteration 500 -> Train Loss: 0.6552, Accuracy: 0.600
[Epoch 1/1] Iteration 600 -> Train Loss: 0.6684, Accuracy: 0.580
[Epoch 1/1] Iteration 700 -> Train Loss: 0.6541, Accuracy: 0.610
[Epoch 1/1] Iteration 800 -> Train Loss: 0.6474, Accuracy: 0.590
[Epoch 1/1] Iteration 900 -> Train Loss: 0.6539, Accuracy: 0.610
[Epoch 1/1] Iteration 1000 -> Train Loss: 0.6247, Accuracy: 0.640
[Epoch 1/1] Iteration 1100 -> Train Loss: 0.6393, Accuracy: 0.630
[Epoch 1/1] Iteration 1200 -> Train Loss: 0.6162, Accuracy: 0.660
[Epoch 1/1] Iteration 1300 -> Train Loss: 0.6365, Accuracy: 0.620
[Epoch 1/1] Iteration 1400 -> Train Loss: 0.6280, Accuracy: 0.650
[Epoch 1/1] Iteration 1500 -> Train Loss: 0.5784, Accuracy: 0.690
[Epoch 1/1] Iterati

In [118]:
model.eval()

total_loss = 0
total_len = 0
total_correct = 0

for text, label in test_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    MAX_LEN = max(len(e) for e in encoded_list)
    padded_list =  [e + [0] * (MAX_LEN-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)

  del sys.path[0]


Test accuracy:  0.7


TEST RESULTS :

No Augmentation - 0.78

Augmentation - 0.7