In [1]:
!pip install transformers
# !pip install nlpaug
!git clone https://github.com/joseph1723/CS376_Final_Project.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification, BertModel

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

if torch.cuda.is_available() :
  print("CUDA")
  device = torch.device("cuda")
  model.to(device)
else :
  print("CPU")
  device = torch.device("cpu")
  model.to(device)

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

CUDA


In [4]:
class TestDataset(Dataset) :
  #Dataset - English/typo-added/labeled
  def __init__(self, df) :
    self.df = df
  
  def __len__(self) :
    return len(self.df)
  
  def __getitem__(self, idx):
    text = self.df.iloc[idx, 0]
    label = self.df.iloc[idx, 1]
    return text, label

Hyperparameters

In [5]:
train_rate, test_rate = 0.9, 0.09
itr = 1
p_itr = 100
epochs = 5
batch = 10

In [6]:
total_df = pd.read_csv('/content/CS376_Final_Project/augmented_data/Dataset_aug_complex_10424_.csv', sep=',')
#total_df = pd.read_csv('augmented_data/Dataset_aug_complex_10424_original.csv', sep=',')

total_df.dropna(inplace=True)
total_df = total_df[["text", "label"]]
total_df["label"] = [1 if i == "nothate" else 0 for i in total_df["label"]]
print(total_df)
total_dataset = TestDataset(total_df)
total_loader = DataLoader(total_dataset, batch_size=batch, shuffle=True)

                                                    text  label
0         black widows focr up their spider husbands lol      1
1      hey this is a group of Pierced nipples . pleas...      1
2      hey buddy its the necrophiles but dont focus o...      1
3      every retard day i hear people chatting shitbl...      1
4                           going to the theatre is 8hit      1
...                                                  ...    ...
10419                         So happy I'm not a fag hag      0
10420                          you are a racist XrseTol4      1
10421                   bloody h$l., you bhckihg gossrr!      1
10422  since lockdown I have no money and there are n...      1
10423  every fVUoing day i hear people making kucKihg...      0

[10424 rows x 2 columns]


In [7]:
#Train Set
test_df, train_df, _ = np.split(total_df, [int(test_rate*len(total_df)), int(test_rate*len(total_df) + train_rate*len(total_df))])
print(len(test_df), len(train_df))
train_dataset = TestDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True)

938 9381


In [10]:
optimizer = Adam(model.parameters(), lr=1e-6)

total_loss = 0
total_len = 0
total_correct = 0
total_preds = []
X=1

model.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        MAX_LEN = max(len(e) for e in encoded_list)
        padded_list =  [e + [0] * (MAX_LEN-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels, return_dict=False)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=X)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()

        total_preds.append(pred)
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0
        itr+=1



[Epoch 1/5] Iteration 100 -> Train Loss: 0.4951, Accuracy: 0.546
[Epoch 1/5] Iteration 200 -> Train Loss: 0.6865, Accuracy: 0.558
[Epoch 1/5] Iteration 300 -> Train Loss: 0.6894, Accuracy: 0.557
[Epoch 1/5] Iteration 400 -> Train Loss: 0.6891, Accuracy: 0.530
[Epoch 1/5] Iteration 500 -> Train Loss: 0.6753, Accuracy: 0.565
[Epoch 1/5] Iteration 600 -> Train Loss: 0.6823, Accuracy: 0.563
[Epoch 1/5] Iteration 700 -> Train Loss: 0.6760, Accuracy: 0.582
[Epoch 1/5] Iteration 800 -> Train Loss: 0.6683, Accuracy: 0.593
[Epoch 1/5] Iteration 900 -> Train Loss: 0.6752, Accuracy: 0.585
[Epoch 2/5] Iteration 1000 -> Train Loss: 0.6701, Accuracy: 0.597
[Epoch 2/5] Iteration 1100 -> Train Loss: 0.6657, Accuracy: 0.592
[Epoch 2/5] Iteration 1200 -> Train Loss: 0.6608, Accuracy: 0.606
[Epoch 2/5] Iteration 1300 -> Train Loss: 0.6673, Accuracy: 0.607
[Epoch 2/5] Iteration 1400 -> Train Loss: 0.6658, Accuracy: 0.599
[Epoch 2/5] Iteration 1500 -> Train Loss: 0.6628, Accuracy: 0.604
[Epoch 2/5] Iterati

In [11]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.9.1-py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 5.1 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.9.1


In [16]:
from torchmetrics import F1Score

model.eval()

total_loss = 0
total_len = 0
total_correct = 0
total_preds = []

for text, label in test_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    MAX_LEN = max(len(e) for e in encoded_list)
    padded_list =  [e + [0] * (MAX_LEN-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels, return_dict=False)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    total_preds.append(pred)

    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

labels = torch.tensor(test_df['label']).to(device)
total_preds = torch.cat(total_preds).to(device)
f1 = F1Score(num_classes=2).to(device)
F1_Score = f1(total_preds, labels)
print('F1 Score : ', F1_Score)

print('Test accuracy: ', total_correct / total_len)

  app.launch_new_instance()


F1 Score :  tensor(0.5320, device='cuda:0')
Test accuracy:  0.7302771855010661


In [None]:
!pip install torchmetrics

In [None]:
labels = 
F1_score = 

# Accuracy (epoch=5) : 0.76
# F1-Score : 0.532