In [1]:
!pip install torchinfo
!pip install torchmetrics
!pip install transformers

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Collecting torchmetrics
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.1 torchmetrics-1.3.1


In [2]:
import os
import time
import warnings
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary
from torchmetrics.classification import Accuracy, AUROC
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [4]:
warnings.filterwarnings("ignore")

In [5]:
%cd "/content/drive/MyDrive/大專生計畫/data"

/content/drive/MyDrive/大專生計畫/data


In [6]:
train_path = "processed_train.jsonl"
dev_path = "processed_dev.jsonl"
train_ex_path = "mem_train.json"
dev_ex_path = "mem_test.json"

In [7]:
train_df = pd.read_json(train_path, lines=True)
dev_df = pd.read_json(dev_path, lines=True)
ex_train_df = pd.read_json(train_ex_path)
ex_dev_df = pd.read_json(dev_ex_path)

train_df.head()

Unnamed: 0,id,img,label,text,caption
0,42953,img/42953.png,0,its their character not their color that matters,a man with a bald head and a bald face
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,a man and woman kissing in front of a lake
2,13894,img/13894.png,0,putting bows on your pet,a cat with a red bow on its neck
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,"a dog with a capt that says, exitouris ha sures"
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",a man in a suit and tie is standing in front o...


In [8]:
ex_train_df["text"] = train_df["text"]
ex_dev_df["text"] = dev_df["text"]
ex_train_df["caption"] = train_df["caption"]
ex_dev_df["caption"] = dev_df["caption"]

ex_dev_df.head()

Unnamed: 0,clean_sent,org_sent,bert_tokens,label,img,race,entity,ent_tokens,race_tokens,attack,text,caption
0,white people is this a shooting range,white people is this a shooting range,"[101, 2317, 2111, 2003, 2023, 1037, 5008, 2846...",1,08291.png,Black Male Black Male Black Male,Peanut allergy Family Parent Health Gender ide...,"[101, 21443, 2035, 24395, 2155, 6687, 2740, 59...","[101, 2304, 3287, 2304, 3287, 2304, 3287, 102]","[1, 0, 0, 0, 0]",white people is this a shooting range,two children sitting on the ground eating fruit
1,bravery at its finest,bravery at its finest,"[101, 16534, 2012, 2049, 10418, 102]",1,46971.png,White Male Black Male Black Male Black Male Bl...,Fried chicken Chicken glasses,"[101, 13017, 7975, 7975, 7877, 102]","[101, 2317, 3287, 2304, 3287, 2304, 3287, 2304...","[1, 0, 0, 0, 0]",bravery at its finest,a man with a beard and a shirt on
2,your order comes to 3750 and your white privil...,your order comes to $37.50 and your white priv...,"[101, 2115, 2344, 3310, 2000, 18034, 2692, 199...",1,03745.png,White Female White Female White Female,unk,"[101, 4895, 2243, 102]","[101, 2317, 2931, 2317, 2931, 2317, 2931, 102]","[1, 0, 0, 0, 0]",your order comes to $37.50 and your white priv...,a woman sitting at a desk with a laptop
3,it is time to send these parasites back to the...,it is time.. to send these parasites back to t...,"[101, 2009, 2003, 2051, 2000, 4604, 2122, 2399...",1,83745.png,unk,Crusades Middle Ages Knight First Crusade Knig...,"[101, 16282, 2015, 2690, 5535, 5000, 2034, 162...","[101, 4895, 2243, 102]","[1, 0, 0, 0, 0]",it is time.. to send these parasites back to t...,a person holding a bottle of beer
4,mississippi wind chime,mississippi wind chime,"[101, 5900, 3612, 9610, 4168, 102]",1,80243.png,unk,World War II 20 July plot Nazism Execution Ges...,"[101, 2088, 2162, 2462, 2322, 2251, 5436, 1315...","[101, 4895, 2243, 102]","[1, 0, 0, 0, 0]",mississippi wind chime,a black and white photo of a group of men walk...


In [23]:
class FinetunedDataset(Dataset):

    def __init__(self, df, tokenizer, **configs):
        self.dataset = []

        for i, row in df.iterrows():
            meta_data = {
                "guid": i,
                "label": row["label"],
            }
            sentences = [row["text"]] + [row[column_name] for column_name in configs.values()]
            meta_data["inputs"] = tokenizer(" [SEP] ".join(sentences), padding="max_length", truncation=True)

            self.dataset.append(meta_data)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        item = {
            k: torch.tensor(v) for k, v in self.dataset[index]["inputs"].items()
        }
        item['labels'] = torch.tensor(self.dataset[index]["label"])
        return item

In [35]:
use_cuda = True
batch_size = 8
n_epochs = 3
lr = 1e-5
loss_fn = torch.nn.CrossEntropyLoss()

In [46]:
plm_name = "GroNLP/hateBERT"
n_class = 2
configs = {"feat_1": "caption", "feat_2": "race", "feat_3": "entity"}

In [47]:
model = AutoModelForSequenceClassification.from_pretrained(plm_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(plm_name)

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [38]:
# dataset
train_dataset = FinetunedDataset(ex_train_df, tokenizer, **configs)
dev_dataset = FinetunedDataset(ex_dev_df, tokenizer, **configs)

# data_loader
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)
dev_loader = DataLoader(
    dev_dataset,
    batch_size=batch_size,
    shuffle=False
)

In [48]:
if use_cuda:
    model = model.cuda()

In [49]:
no_decay = ['bias', 'LayerNorm.weight']
# it's always good practice to set no decay to biase and LayerNorm parameters
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
lr_schedular = CosineAnnealingLR(optimizer, 150)

for epoch in range(n_epochs):
    tot_loss = 0
    for step, batch in enumerate(tqdm(train_loader, total=len(train_loader))):
        model.train()
        # zero gradients
        optimizer.zero_grad()

        if use_cuda:
            batch = {k: v.cuda() for k, v in batch.items()}

        logits = model(**batch).logits
        loss = loss_fn(logits, batch["labels"])
        loss.backward()
        tot_loss += loss.item()

        optimizer.step()

        if step % 250 == 1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)
        lr_schedular.step()

  0%|          | 0/1063 [00:00<?, ?it/s]

Epoch 0, average loss: 0.7072583138942719
Epoch 0, average loss: 0.6554658543022852
Epoch 0, average loss: 0.6540730001916923
Epoch 0, average loss: 0.6525099138392413
Epoch 0, average loss: 0.6509516078554941


  0%|          | 0/1063 [00:00<?, ?it/s]

Epoch 1, average loss: 0.7452305257320404
Epoch 1, average loss: 0.6423709359433916
Epoch 1, average loss: 0.6309201506623234
Epoch 1, average loss: 0.6263230353435303
Epoch 1, average loss: 0.624056902117358


  0%|          | 0/1063 [00:00<?, ?it/s]

Epoch 2, average loss: 0.6242082417011261
Epoch 2, average loss: 0.5611503022530723
Epoch 2, average loss: 0.5557011464736851
Epoch 2, average loss: 0.5528222828746793
Epoch 2, average loss: 0.5532534041120383


In [50]:
total_probs = []
total_targets = []

model.eval()
with torch.no_grad():
    for batch in tqdm(dev_loader, total=len(dev_loader)):
        if use_cuda:
            batch = {k: v.cuda() for k, v in batch.items()}
        logits = model(**batch).logits
        loss = loss_fn(logits, batch["labels"])
        total_probs.extend(list(torch.softmax(logits, dim=-1).detach().cpu()))
        total_targets.extend(list(batch["labels"].detach().cpu()))

  0%|          | 0/63 [00:00<?, ?it/s]

In [51]:
metrics = {
    "acc": Accuracy(task="multiclass", num_classes=2),
    "auroc": AUROC(task="multiclass", num_classes=2)
}
print("  Acc:", metrics['acc'](torch.stack(total_probs), torch.stack(total_targets)))
print("Auroc:", metrics['auroc'](torch.stack(total_probs), torch.stack(total_targets)))

  Acc: tensor(0.5600)
Auroc: tensor(0.6242)


In [52]:
import gc
gc.collect()
torch.cuda.empty_cache()