In [1]:
import time
import torch.utils.data as Data
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from EDA.DataReader import DataReader
import numpy as np
from tqdm import tqdm
from torch.optim import AdamW
from transformers import get_scheduler
import torch 
from torcheval.metrics.functional import multiclass_f1_score
import warnings


In [2]:
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

In [3]:
def custom_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-separated pieces of ``text``; an empty list
        when ``text`` is empty or contains only whitespace.
    """
    tokens = text.split()
    return tokens

In [4]:
# Pre-processing: load the corpus and pull out the frames the notebook works on.
# Alternative corpus (foody_raw): DataReader("")
dataset = DataReader("UIT_VFSC")  # UIT
df_train, df_test, df_total = dataset.df_train, dataset.df_test, dataset.df_total

# Number of classes = largest label value + 1.
# NOTE(review): this assumes labels are consecutive 0-based ids — confirm against DataReader.
largest_label = df_total["label"].max()
n_labels = int(largest_label.item() + 1)

In [5]:
# setup model
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=n_labels)
optimizer = AdamW(model.parameters(), lr=5e-5)

# The linear schedule decays the learning rate to zero over `num_training_steps`
# scheduler steps and keeps it at zero afterwards. The training loop calls
# `lr_scheduler.step()` once per batch, so the horizon has to be the real number
# of optimizer steps; a hard-coded 1000 is shorter than 10 epochs over the
# training set, which would leave the tail of training running with LR = 0.
# These two values must match the DataLoader batch size and the epoch count
# used by the training loop further down.
train_batch_size = 64
num_epochs = 10
steps_per_epoch = (len(df_train) + train_batch_size - 1) // train_batch_size  # ceil division
num_training_steps = steps_per_epoch * num_epochs
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Row positions: the first len(df_train) rows of df_total are treated as the
# training rows, everything after them as the test rows.
n_train_rows = len(df_train)
n_total_rows = len(df_total)
train_range = range(n_train_rows)
test_range = range(n_train_rows, n_total_rows)

# Boolean selectors over the rows of df_total, built directly as bool arrays.
training_mask = np.zeros(n_total_rows, dtype=np.bool_)
training_mask[:n_train_rows] = True

test_mask = np.zeros(n_total_rows, dtype=np.bool_)
test_mask[n_train_rows:n_total_rows] = True

In [7]:
# 
# Tokenize every document of df_total in one call (padded, truncated to 256 tokens).
inputs = tokenizer(
    df_total["corpus"].tolist(),
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
input_ids_total, attention_mask_total = inputs['input_ids'], inputs['attention_mask']
input_ids, attention_mask = input_ids_total, attention_mask_total

# Only the rows selected by training_mask are served by the training loader.
train_dataset = Data.TensorDataset(
    input_ids_total[training_mask],
    attention_mask_total[training_mask],
)
dataloader = Data.DataLoader(train_dataset, batch_size=64)

# Positional row ids for df_total.
indices = np.arange(len(df_total))

In [9]:
# split batch
# Build a (num_batches, batch_size) table holding the labels of each training
# batch, in the same order the (unshuffled) dataloader yields them. The last,
# shorter batch is right-padded with 0 so every row has the same width; the
# consumer is expected to trim the padding using the real batch length.
label_list = df_total["label"].tolist()  # materialize once, not once per index
loader_batch_size = dataloader.batch_size  # keep the row width in sync with the loader
batch_labels = []
# Loop variables are deliberately NOT called `input_ids` / `attention_mask`, so
# the notebook-level tensors with those names are not overwritten by a batch.
for i, (batch_input_ids, _batch_attention_mask) in tqdm(enumerate(dataloader)):
    batch_start = i * loader_batch_size
    batch_end = batch_start + len(batch_input_ids)
    batch_indices = indices[batch_start:batch_end]
    labels = [label_list[idx] for idx in batch_indices]
    if len(labels) < loader_batch_size:
        labels += [0] * (loader_batch_size - len(labels))
    batch_labels.append(labels)
batch_labels = torch.tensor(batch_labels)
print(batch_labels.shape)

179it [00:01, 129.11it/s]

torch.Size([179, 64])





In [10]:
# evaluate model
def evaluate(model, input_ids, attention_mask, labels=None, batch_size=64):
    """Score ``model`` on the given encoded inputs.

    The inputs are pushed through the model in mini-batches (instead of one
    single forward pass over everything) so that peak memory does not grow
    with the size of the evaluation set.

    Args:
        model: Sequence-classification model; called as
            ``model(input_ids, attention_mask)`` and expected to return an
            object with a ``logits`` attribute.
        input_ids: Tensor of token ids, one row per example.
        attention_mask: Attention mask rows aligned with ``input_ids``.
        labels: Optional gold labels aligned with ``input_ids``. When omitted,
            the labels of the rows selected by the notebook-level
            ``test_mask`` in ``df_total`` are used (the original behaviour).
        batch_size: Number of examples per forward pass.

    Returns:
        Tuple ``(acc, mf1, wf1)``: accuracy as a Python float, followed by the
        macro- and weighted-average results of ``multiclass_f1_score``.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    if labels is None:
        labels = df_total["label"][test_mask].tolist()
    labels = torch.as_tensor(labels)

    model.eval()
    batch_preds = []
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            output = model(input_ids[start:stop], attention_mask[start:stop])
            batch_preds.append(torch.argmax(output.logits, dim=1))
    preds = torch.cat(batch_preds) if batch_preds else torch.empty(0, dtype=torch.long)

    # Guard against silently broadcasting mismatched predictions and labels.
    if len(preds) != len(labels):
        raise ValueError(
            f"got {len(preds)} predictions for {len(labels)} labels; "
            "pass `labels` explicitly when not evaluating the test split"
        )

    correct = torch.sum(preds == labels)
    acc = correct.item() * 1.0 / len(labels)
    mf1 = multiclass_f1_score(preds.type(torch.long), labels.type(torch.long), num_classes=n_labels, average='macro')
    wf1 = multiclass_f1_score(preds.type(torch.long), labels.type(torch.long), num_classes=n_labels, average='weighted')
    return acc, mf1, wf1

In [11]:
# Fine-tune for 10 epochs, evaluating on the test split after every epoch.
dur = []      # wall-clock seconds per epoch, recorded from epoch 3 onwards
max_acc = 0   # best test accuracy seen so far
max_f1 = 0    # largest F1 value (macro or weighted) seen so far
for epoch in range(10):
    # The first three epochs are excluded from the timing statistics.
    if epoch >= 3:
        t0 = time.time()
    model.train()
    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch:05d}"):
        input_ids_train, attention_mask_train = batch
        # Rows of `batch_labels` are zero-padded to the full batch width, so
        # trim them to the real size of this batch before computing the loss.
        labels_train = batch_labels[i][:len(input_ids_train)].long()
        output = model(input_ids_train, attention_mask_train)
        # Use the model's logits directly; they already have one row per example.
        loss = F.cross_entropy(output.logits.to(torch.float32), labels_train)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    if epoch >= 3:
        dur.append(time.time() - t0)

    acc, mf1, wf1 = evaluate(model, input_ids_total[test_mask], attention_mask_total[test_mask])
    # No epoch has been timed yet during the first three epochs: report NaN
    # explicitly instead of letting np.mean([]) emit a RuntimeWarning.
    mean_dur = np.mean(dur) if dur else float("nan")
    # The reported loss is the loss of the last batch of the epoch.
    print(f"Epoch {epoch:05d} | Loss {loss.item():.4f} | Test Acc {acc:.4f} | mF1 {mf1:.4f} | wF1 {wf1:.4f}| Time(s) {mean_dur:.4f}")

    if acc > max_acc:
        max_acc = acc
    max_f1 = max(max_f1, wf1, mf1)
    # NOTE(review): the checkpoint is overwritten after every epoch, not only
    # when accuracy improves — confirm whether "best model" saving was intended.
    torch.save(model.state_dict(), "model.pth")

Epoch 00000:   6%|▌         | 11/179 [36:01<9:10:16, 196.53s/it] 

KeyboardInterrupt

