In [None]:
import random

import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW, get_scheduler
from vncorenlp import VnCoreNLP

import loss
from CustomSoftmaxModel import CustomModelSoftmax
from metrics import metric
from preprocessing.NewsPreprocessing import Preprocess
from utils import pred_to_label, update_model
from visualization import Visualization

In [None]:
seed = 19133022
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
rdrsegmenter = VnCoreNLP("preprocessing/vncorenlp/VnCoreNLP-1.1.1.jar",
                         annotators="wseg", max_heap_size='-Xmx500m')
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
# Load datasets & Preprocess
preprocess = Preprocess(tokenizer, rdrsegmenter)
inputs = {'train': r"./data/training_data/train_datasets.csv",
          'test': r"./data/training_data/test_datasets.csv"}
tokenized_datasets = preprocess.run(load_dataset('csv', data_files=inputs))

In [None]:
num_epochs = 10
learning_rate = 5e-5
batch_size = 32

In [None]:
# Data loader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, collate_fn=data_collator, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator)
# Model
phobert = CustomModelSoftmax("vinai/phobert-base")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
phobert.to(device)
# Optimizer
optimizer = AdamW(phobert.parameters(), lr=learning_rate)
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
def evaluation(model, val_dataloader):
    valid = metric()
    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            inputs = {'input_ids': batch['input_ids'].to(device),
                      'attention_mask': batch['attention_mask'].to(device)}
            outputs_classifier, outputs_regressor = model(**inputs)
            # loss
            classifier_loss = loss.classifier(outputs_classifier, batch['labels_classifier'].to(device).float())
            softmax_loss = loss.softmax(outputs_regressor, batch['labels_regressor'].to(device).float(), device)
            mix_loss = classifier_loss + softmax_loss
            outputs_classifier = outputs_classifier.cpu().numpy()
            outputs_regressor = outputs_regressor.cpu().numpy()
            outputs_regressor = outputs_regressor.argmax(axis=-1) + 1
            outputs = pred_to_label(outputs_classifier, outputs_regressor)
            # update loss
            y_true = batch['labels_regressor'].numpy()
            valid.classifier_loss.update(classifier_loss.item())
            valid.regressor_loss.update(softmax_loss.item())
            valid.loss.update(mix_loss.item())
            valid.acc.update(np.round(outputs), y_true)
            valid.f1_score.update(np.round(outputs), y_true)
            valid.r2_score.update(np.round(outputs), y_true)
    return valid

In [None]:
train_log = Visualization()
val_log = Visualization()
best_score = -1
for epoch in range(num_epochs):
    train_metrics = metric()
    phobert.train()
    for batch in train_dataloader:
        inputs = {'input_ids': batch['input_ids'].to(device),
                  'attention_mask': batch['attention_mask'].to(device)}
        outputs_classifier, outputs_regressor = phobert(**inputs)
        sigmoid_focal_loss = loss.sigmoid_focal(outputs_classifier, 
                                                batch['labels_classifier'].to(device).float(),
                                                alpha=-1, gamma=1, reduction='mean')
        softmax_loss = loss.softmax(outputs_regressor, 
                                    batch['labels_regressor'].to(device).float(), device)
        mix_loss = 10 * sigmoid_focal_loss + softmax_loss
        optimizer.zero_grad()
        mix_loss.backward()
        optimizer.step()
        with torch.no_grad():
            outputs_classifier = outputs_classifier.cpu().numpy()
            outputs_regressor = outputs_regressor.cpu().numpy()
            outputs_regressor = outputs_regressor.argmax(axis=-1) + 1
            outputs = pred_to_label(outputs_classifier, outputs_regressor)
            y_true = batch['labels_regressor'].numpy()
            train_metrics.sigmoid_focal_loss.update(sigmoid_focal_loss.item())
            train_metrics.regressor_loss.update(softmax_loss.item())
            train_metrics.loss.update(mix_loss.item())
            train_metrics.acc.update(np.round(outputs), y_true)
            train_metrics.f1_score.update(np.round(outputs), y_true)
            train_metrics.r2_score.update(np.round(outputs), y_true)


In [17]:
output = {
        "review": "sentence",
        "results": {}
    }

factors = {"category": ["Không xác định",
                        "Nông sản lúa, gạo, nếp... hoặc từ sản phẩm lúa", 
                        "Nông sản cà phê", "Nông sản cao su"],
           "price": ["Không xác định", "Giảm", "Ổn định", "Tăng"], 
           "market":["Không xác định", 
                     "Nguồn cung lớn hơn nhu cầu", 
                     "Nguồn cung và cầu ổn định", 
                     "Nhu cầu lớn hơn Nguồn cung"], 
           "polices":["Không xác định", 
                      "Chính sách", "Hiệp định", "Khác"], 
           "internal":["Không xác định", "Liên quan đến sản lượng nông sản", 
                       "Liên quan đến chất lượng nông sản.", "Chi phí sản suất liên quan"], 
           "external":["Không xác định", "Dịch bệnh", "Thiên tai", "Khủng hoảng"]
           }
predict_results = [0,1,2,3,0,1]
for i, r in enumerate(factors):
    output["results"][r] = factors[r][int(predict_results[i])]
output['category']

KeyError: 'category'