In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
import random

# from torchsummary import summary

In [3]:
# Number of classes in MNIST
num_classes = 4

# Number of training epochs (arbitrary)
epochs = 5

# Training parameters
learning_rate = 1e-6
batch_size = 16
display_step = 100

# Model path
# checkpoint = 'model.pth'

# device: cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
pip install transformers

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
# Install the vncorenlp python wrapper
!pip install vncorenlp

# Download VnCoreNLP-1.1.1.jar & its word segmentation component (i.e. RDRSegmenter)
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

Collecting vncorenlp
  Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: vncorenlp
  Building wheel for vncorenlp (setup.py) ... [?25ldone
[?25h  Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645932 sha256=842f2a38230f339772786a74627145cce4c88996b654e1ec3a896388d478693e
  Stored in directory: /root/.cache/pip/wheels/5d/d9/b3/41f6c6b1ab758561fd4aab55dc0480b9d7a131c6aaa573a3fa
Successfully built vncorenlp
Installing collected packages: vncorenlp
Successfully installed vncorenlp-1.0.3
--2024-06-02 14:26:37--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubuse

In [7]:
from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP("/kaggle/working/vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

text = "Đại học Bách Khoa Hà Nội."

word_segmented_text = rdrsegmenter.tokenize(text)
print(word_segmented_text)

[['Đại_học', 'Bách_Khoa', 'Hà_Nội', '.']]


In [8]:
import re
import pandas as pd

train_path = 'https://raw.githubusercontent.com/lavibula/SentimentAnalysis-with-Vietnamese-reviews/main/absa_data/train.csv'
test_path = 'https://raw.githubusercontent.com/lavibula/SentimentAnalysis-with-Vietnamese-reviews/main/absa_data/val.csv'

train_set = pd.read_csv(train_path)
test_set = pd.read_csv(test_path)

train_set['Sentence'] = train_set['Sentence'].apply(rdrsegmenter.tokenize).apply(lambda x: ' '.join(x[0]))
test_set['Sentence'] = test_set['Sentence'].apply(rdrsegmenter.tokenize).apply(lambda x: ' '.join(x[0]))

train_set = pd.concat([train_set, test_set]).reset_index().drop(columns = 'index')

In [9]:
df = pd.concat([train_set, test_set]).reset_index().drop(columns = 'index')
df

Unnamed: 0,Sentence,FACILITY,LECTURER,OTHERS,TRAINING_PROGRAM
0,slide giáo_trình đầy_đủ,0,0,0,1
1,nhiệt_tình giảng_dạy gần_gũi với sinh_viên,0,1,0,0
2,đi học đầy_đủ full điểm chuyên_cần,0,0,0,2
3,chưa áp_dụng công_nghệ_thông_tin và các thiết_...,0,2,0,0
4,thầy giảng bài hay có nhiều bài_tập ví_dụ ngay...,0,1,0,0
...,...,...,...,...,...
14587,hướng_dẫn lab mơ_hồ,0,2,0,0
14588,thầy cho chúng_em những bài_tập mang tính thực...,0,1,0,0
14589,thầy không dạy nhiều chủ_yếu cho sinh_viên tự ...,0,2,0,0
14590,em muốn đổi tên môn_học vì tên môn là lập_trìn...,0,0,0,2


In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Sample data for illustration
# data = train_set[['FACILITY', 'LECTURER', 'OTHERS', 'TRAINING_PROGRAM']].copy()
data = df.copy()

# Create DataFrame
df = pd.DataFrame(data)

# Select specific columns
selected_columns = ['FACILITY', 'LECTURER', 'OTHERS', 'TRAINING_PROGRAM']
data_to_encode = df[selected_columns]

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit and transform the data
encoded_data = encoder.fit_transform(data_to_encode)

# Convert the result to a DataFrame for better readability
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(selected_columns))

# If you want to list the values
encoded_values_list = encoded_df.values
# print(encoded_values_list)



In [11]:
encoded_values_list

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [12]:
import torch
from transformers import AutoModel, AutoTokenizer

phobert = AutoModel.from_pretrained("vinai/phobert-base-v2")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [13]:
# INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
sentence = 'Chúng_tôi là những nghiên_cứu_viên .'  

input_ids = torch.tensor([tokenizer.encode(sentence)])

with torch.no_grad():
    features = phobert(input_ids)  # Models outputs are now tuples

In [14]:
features[0].shape

torch.Size([1, 7, 768])

In [15]:
features[0][0].shape

torch.Size([7, 768])

In [16]:
features.last_hidden_state[:, 0, :].shape

torch.Size([1, 768])

In [17]:
X = train_set['Sentence'][10]
X

'thầy rất tận_tình và đi dạy rất đúng giờ'

In [18]:
encoded_values_list.shape

(14592, 16)

In [19]:
# Tách features (câu đánh giá) và labels (cảm xúc)
X = df['Sentence']
y = encoded_values_list

# # Chuyển đổi dữ liệu text sang vector đặc trưng bằng CountVectorizer
# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(X)

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tính toán embeddings cho một câu đánh giá
def compute_embeddings(sentence):
    # Tokenize câu đánh giá và thêm token đặc biệt
    # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
#     sentence = 'Chúng_tôi là những nghiên_cứu_viên .'  

    input_ids = torch.tensor([tokenizer.encode(sentence)])

    with torch.no_grad():
        features = phobert(input_ids)  # Models outputs are now tuples
    
    # Lấy embeddings từ lớp cuối cùng (CLS token)
    cls_embedding = features.last_hidden_state[:, 0, :]
    
    return cls_embedding

# # Tính toán embeddings cho tất cả các câu đánh giá trong tập huấn luyện và tập kiểm tra
X_train_embeddings = torch.cat([compute_embeddings(sentence) for sentence in X_train], dim=0)
X_test_embeddings = torch.cat([compute_embeddings(sentence) for sentence in X_test], dim=0)

# # Tính toán embeddings cho tất cả các câu đánh giá trong tập huấn luyện và tập kiểm tra
# from tqdm import tqdm
# X_train_embeddings = []
# for sentence in tqdm(X_train[0:10]):
#     X_train_embeddings.append(compute_embeddings(sentence))

In [20]:
y_train = y[:len(X_train_embeddings)]

In [21]:
y_test = y[len(X_train_embeddings):]

In [22]:
X_test_embeddings.shape

torch.Size([2919, 768])

In [23]:
y_test.shape

(2919, 16)

In [24]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
train_labels = torch.tensor(y_train)
train_data = TensorDataset(X_train_embeddings, train_labels)
# train_sampler = SequentialSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle = True)

In [25]:
val_labels = torch.tensor(y_test)
val_data = TensorDataset(X_test_embeddings, val_labels)
# val_sampler = SequentialSampler(val_data)
# val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

In [119]:
import wandb

# Log in to wandb programmatically (if not already logged in via CLI)
wandb.login(key="8b8539216f9db012b822d7152559a24eb9f66ac2")

# Initialize a new wandb run
run = wandb.init(project="ABSA-Dense-project")



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Eval/Accuracy,▁▂▃▄▆▇█
Eval/Loss,█▇▆▄▃▂▁
Train/Accuracy,▁▂▃▅▆▇█
Train/Loss,█▇▅▄▃▂▁

0,1
Eval/Accuracy,0.62547
Eval/Loss,0.10976
Train/Accuracy,0.52377
Train/Loss,0.10993


In [27]:
# Define the ANN model
class ANN(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(hidden_dim2, output_dim)
        self.sigmoid = nn.Sigmoid()  # For multi-label classification
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

In [120]:
# Hyperparameters
input_dim = X_train_embeddings.shape[1]
hidden_dim1 = 600
hidden_dim2 = 300
output_dim = 16  # Assuming 16 output classes (4 labels, each with 4 classes)

# Initialize the model
model = ANN(input_dim, hidden_dim1, hidden_dim2, output_dim).to(device)

In [29]:
import torch
from torch import nn
import torch.nn.functional as F

class CapsuleLoss(nn.Module):

    def __init__(self, smooth=0.1, lamda=0.6):
        super(CapsuleLoss, self).__init__()
        self.smooth = smooth
        self.lamda = lamda

    def forward(self, input, target):
        one_hot = torch.zeros_like(input).to(input.device)
        one_hot = one_hot.scatter(1, target.unsqueeze(-1), 1)
        a = torch.max(torch.zeros_like(input).to(input.device), 1 - self.smooth - input)
        b = torch.max(torch.zeros_like(input).to(input.device), input - self.smooth)
        loss = one_hot * a * a + self.lamda * (1 - one_hot) * b * b
        loss = loss.sum(dim=1, keepdim=False)
        return loss.mean()

# CrossEntropyLoss for Label Smoothing Regularization
class CrossEntropyLoss_LSR(nn.Module):
    def __init__(self, para_LSR=0.2):
        super(CrossEntropyLoss_LSR, self).__init__()
        self.para_LSR = para_LSR
        self.logSoftmax = nn.LogSoftmax(dim=-1)

    def _toOneHot_smooth(self, label, batchsize, classes):
        prob = self.para_LSR * 1.0 / classes
        one_hot_label = torch.zeros(batchsize, classes) + prob
        for i in range(batchsize):
            index = label[i]
            one_hot_label[i, index] += (1.0 - self.para_LSR)
        return one_hot_label

    def forward(self, pre, label, size_average=True):
        b, c = pre.size()
        one_hot_label = self._toOneHot_smooth(label, b, c).to(pre.device)
        loss = torch.sum(-one_hot_label * self.logSoftmax(pre), dim=1)
        if size_average:
            return torch.mean(loss)
        else:
            return torch.sum(loss)

class SmoothCrossEntropy(nn.Module):

    def __init__(self, smooth=0.08):
        super(SmoothCrossEntropy, self).__init__()
        self.kldiv = nn.KLDivLoss()
        self.smooth = smooth

    def forward(self, input, target):
        one_hot = torch.zeros_like(input).to(input.device)
        one_hot = one_hot.scatter(1, target.unsqueeze(-1), 1)
        target = (1 - self.smooth) * one_hot + self.smooth / (input.size(1) - 1) * (1 - one_hot)
        # target = target + torch.rand_like(target).to(target.device) * 0.001
        input = input - input.max(dim=1, keepdim=True)[0]
        loss = -target * F.log_softmax(input, dim=-1)
        return loss.mean()

In [121]:
import random
from tqdm import tqdm_notebook
import torch
import torch.nn.functional as F
from transformers import AdamW

device = 'cuda'
epochs = 9

param_optimizer = list(model.named_parameters())
no_decay = []
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

criterion = CapsuleLoss()
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-8, correct_bias=False)


for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    
    model.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0
    total_loss = 0
    total_samples = 0
    correct_samples = 0
    
    for step, batch in tqdm_notebook(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
        b_labels = batch[1].to(device)

        model.zero_grad()
        outputs = model(b_input_ids)

        logits = outputs.clone()  # keep this tensor on the GPU
        logits = logits.view(-1, 4, 4)
        label_ids = b_labels.view(-1, 4, 4)
        
        # Convert one-hot encoded labels to class indices
        label_ids_indices = torch.argmax(label_ids, dim=-1)

        # Reshape logits and label_ids
        batch_size, num_capsules, num_classes = logits.shape
        logits_flat = logits.view(batch_size * num_capsules, num_classes)
        label_ids_flat = label_ids_indices.view(batch_size * num_capsules)

        # Calculate loss
        loss = criterion(logits_flat, label_ids_flat)
        total_loss += batch_size * loss.item()
        total_samples += batch_size * num_classes

        pred = logits_flat.argmax(dim=1)
        correct_samples += (label_ids_flat == pred).long().sum().item()
#         print(correct_samples)
#         print(total_samples)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
#         break
        
    train_loss = total_loss / total_samples
    train_accuracy = correct_samples / total_samples

    print(f" Accuracy: {train_accuracy:.4f}")
    print(f" Average training loss: {train_loss:.4f}")

    print("Running Validation...")
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_f1 = 0
    total_loss = 0
    total_samples = 0
    correct_samples = 0
    
    for batch in tqdm_notebook(val_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids)
            
            logits = outputs.clone()  # keep this tensor on the GPU
            logits = logits.view(-1, 4, 4)
            label_ids = b_labels.view(-1, 4, 4)

            # Convert one-hot encoded labels to class indices
            label_ids_indices = torch.argmax(label_ids, dim=-1)

            # Reshape logits and label_ids
            batch_size, num_capsules, num_classes = logits.shape
            logits_flat = logits.view(batch_size * num_capsules, num_classes)
            label_ids_flat = label_ids_indices.view(batch_size * num_capsules)

            # Calculate loss
            loss = criterion(logits_flat, label_ids_flat)
            total_loss += batch_size * loss.item()
            total_samples += batch_size * num_classes

            pred = logits_flat.argmax(dim=1)
            correct_samples += (label_ids_flat == pred).long().sum().item()
#             break
            
    eval_loss = total_loss / total_samples
    eval_accuracy = correct_samples / total_samples

    print(f" Accuracy: {eval_accuracy:.4f}")
    print(f" Average evaluating loss: {eval_loss:.4f}")
    
    wandb.log({"Train/Accuracy": train_accuracy, 
               "Train/Loss": train_loss,
               "Eval/Accuracy": eval_accuracy,
               "Eval/Loss": eval_loss,
              })

    save_path = f"state_dict_model_{epoch_i + 1}.pt"
    torch.save(model.state_dict(), save_path)

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_dataloader)):


0it [00:00, ?it/s]

 Accuracy: 0.2766
 Average training loss: 0.1113
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(val_dataloader):


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.2775
 Average evaluating loss: 0.1108
Training...


0it [00:00, ?it/s]

 Accuracy: 0.3177
 Average training loss: 0.1106
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.3232
 Average evaluating loss: 0.1102
Training...


0it [00:00, ?it/s]

 Accuracy: 0.3670
 Average training loss: 0.1100
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.4103
 Average evaluating loss: 0.1096
Training...


0it [00:00, ?it/s]

 Accuracy: 0.4188
 Average training loss: 0.1095
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.5044
 Average evaluating loss: 0.1091
Training...


0it [00:00, ?it/s]

 Accuracy: 0.4695
 Average training loss: 0.1090
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.5919
 Average evaluating loss: 0.1086
Training...


0it [00:00, ?it/s]

 Accuracy: 0.5264
 Average training loss: 0.1086
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.6570
 Average evaluating loss: 0.1082
Training...


0it [00:00, ?it/s]

 Accuracy: 0.5784
 Average training loss: 0.1081
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.6955
 Average evaluating loss: 0.1077
Training...


0it [00:00, ?it/s]

 Accuracy: 0.6225
 Average training loss: 0.1076
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.7121
 Average evaluating loss: 0.1072
Training...


0it [00:00, ?it/s]

 Accuracy: 0.6569
 Average training loss: 0.1071
Running Validation...


  0%|          | 0/183 [00:00<?, ?it/s]

 Accuracy: 0.7178
 Average evaluating loss: 0.1067


## Evaluate

In [31]:
import re
import pandas as pd

# train_path = 'https://raw.githubusercontent.com/lavibula/SentimentAnalysis-with-Vietnamese-reviews/main/absa_data/train.csv'
test_path = 'https://raw.githubusercontent.com/lavibula/SentimentAnalysis-with-Vietnamese-reviews/main/absa_data/test.csv'

# train_set = pd.read_csv(train_path)
test_set = pd.read_csv(test_path)

# train_set['Sentence'] = train_set['Sentence'].apply(rdrsegmenter.tokenize).apply(lambda x: ' '.join(x[0]))
test_set['Sentence'] = test_set['Sentence'].apply(rdrsegmenter.tokenize).apply(lambda x: ' '.join(x[0]))

# train_set = pd.concat([train_set, test_set]).reset_index().drop(columns = 'index')

In [32]:
df = test_set.copy()

In [33]:
df

Unnamed: 0,Sentence,FACILITY,LECTURER,OTHERS,TRAINING_PROGRAM
0,nói tiếng anh lưu_loát,0,1,0,0
1,giáo_viên rất vui_tính,0,1,0,0
2,cô max có_tâm,0,1,0,0
3,giảng bài thu_hút dí_dỏm,0,1,0,0
4,giáo_viên không giảng_dạy kiến_thức hướng_dẫn ...,0,2,0,0
...,...,...,...,...,...
3161,các slide khó hiểu ngôn_ngữ trong slide phức_t...,0,2,0,0
3162,giáo_viên giảng_dạy có tâm_huyết,0,1,0,0
3163,chia_sẻ cho em nhiều điều hay,0,1,0,0
3164,em tiếp_thu chậm,0,2,0,0


In [34]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Sample data for illustration
# data = train_set[['FACILITY', 'LECTURER', 'OTHERS', 'TRAINING_PROGRAM']].copy()
data = df.copy()

# Create DataFrame
df = pd.DataFrame(data)

# Select specific columns
selected_columns = ['FACILITY', 'LECTURER', 'OTHERS', 'TRAINING_PROGRAM']
data_to_encode = df[selected_columns]

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse=False)

# Fit and transform the data
encoded_data = encoder.fit_transform(data_to_encode)

# Convert the result to a DataFrame for better readability
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(selected_columns))

# If you want to list the values
encoded_values_list = encoded_df.values
# print(encoded_values_list)



In [35]:
encoded_values_list.shape

(3166, 16)

In [36]:
import torch
from transformers import AutoModel, AutoTokenizer

phobert = AutoModel.from_pretrained("vinai/phobert-base-v2")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
sentence = 'Chúng_tôi là những nghiên_cứu_viên .'  

input_ids = torch.tensor([tokenizer.encode(sentence)])

with torch.no_grad():
    features = phobert(input_ids)  # Models outputs are now tuples

In [38]:
# Tách features (câu đánh giá) và labels (cảm xúc)
X = df['Sentence']
y = encoded_values_list

# Tính toán embeddings cho một câu đánh giá
def compute_embeddings(sentence):
    # Tokenize câu đánh giá và thêm token đặc biệt
    # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
#     sentence = 'Chúng_tôi là những nghiên_cứu_viên .'  

    input_ids = torch.tensor([tokenizer.encode(sentence)])

    with torch.no_grad():
        features = phobert(input_ids)  # Models outputs are now tuples
    
    # Lấy embeddings từ lớp cuối cùng (CLS token)
    cls_embedding = features.last_hidden_state[:, 0, :]
    
    return cls_embedding

# # Tính toán embeddings cho tất cả các câu đánh giá trong tập huấn luyện và tập kiểm tra
inputs = torch.cat([compute_embeddings(sentence) for sentence in X], dim=0)


In [39]:
inputs.shape

torch.Size([3166, 768])

In [122]:
device = 'cuda'

model.eval()
# eval_loss, eval_accuracy = 0, 0
# nb_eval_steps, nb_eval_examples = 0, 0
# eval_f1 = 0
# total_loss = 0
# total_samples = 0
# correct_samples = 0

# b_input_ids, b_input_mask = inputs['input_ids'], inputs['attention_mask']
b_labels = torch.tensor(y)
# b_input_ids = b_input_ids.to(device)
# b_input_mask = b_input_mask.to(device)
b_labels = b_labels.to(device)
inputs = inputs.to(device)
logits = []
for i in range(len(b_labels)):
    with torch.no_grad():
        outputs = model(inputs[i])
        logit = outputs.clone()  # keep this tensor on the GPU
        logits.append(list(np.array(logit.cpu())))
#         if i == 2:
#             break

In [53]:
# logits

In [126]:
def correct_sample(logits, b_labels):
    logits = torch.tensor(logits).reshape(-1,4,4)
    label_ids = b_labels.view(-1, 4, 4)

    # Convert one-hot encoded labels to class indices
    label_ids_indices = torch.argmax(label_ids, dim=-1).cpu()

    # Reshape logits and label_ids
    batch_size, num_capsules, num_classes = logits.shape
    logits_flat = logits.view(batch_size * num_capsules, num_classes)
    print(label_ids_indices)
    label_ids_flat = label_ids_indices.view(batch_size * num_capsules)

    # Calculate loss
    # loss = criterion(logits_flat, label_ids_flat)
    # total_loss += batch_size * loss.item()
    # total_samples += batch_size * num_classes

    pred = logits_flat.argmax(dim=1)
    correct_samples = (label_ids_flat == pred).long().sum().item()
    return correct_samples

correct_sample(logits, b_labels)

tensor([[0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        ...,
        [0, 1, 0, 0],
        [0, 2, 0, 0],
        [0, 0, 0, 3]])


9074

In [56]:
(len(b_labels) * 4)

12664

In [123]:
def return_preds_and_labels(logits, b_labels):
    logits = torch.tensor(logits).reshape(-1,4,4)
    label_ids = b_labels.view(-1, 4, 4)

    # Convert one-hot encoded labels to class indices
    label_ids_indices = torch.argmax(label_ids, dim=-1).cpu()

    # Reshape logits and label_ids
    batch_size, num_capsules, num_classes = logits.shape
    logits_flat = logits.view(batch_size * num_capsules, num_classes)
    label_ids_flat = label_ids_indices.view(batch_size * num_capsules)

    # Calculate loss
    # loss = criterion(logits_flat, label_ids_flat)
    # total_loss += batch_size * loss.item()
    # total_samples += batch_size * num_classes
    pred = logits_flat.argmax(dim=1)
    return pred, label_ids_flat

pred, label = return_preds_and_labels(logits, b_labels)

In [124]:
from sklearn.metrics import classification_report
print(classification_report(label, pred))
# 0: None, 1: 'Positive', 2: 'Negative', 3: 'Normal'

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      9498
           1       0.47      0.04      0.08      1590
           2       0.22      0.29      0.25      1409
           3       0.02      0.16      0.04       167

    accuracy                           0.72     12664
   macro avg       0.40      0.35      0.32     12664
weighted avg       0.77      0.72      0.72     12664



In [125]:
# topic
# 'FACILITY', 'LECTURER', 'OTHERS', 'TRAINING_PROGRAM'
print(classification_report(label.reshape(-1,4).argmax(dim = 1), pred.reshape(-1,4).argmax(dim = 1)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       145
           1       0.73      1.00      0.84      2290
           2       0.26      0.03      0.06       159
           3       0.00      0.00      0.00       572

    accuracy                           0.72      3166
   macro avg       0.25      0.26      0.22      3166
weighted avg       0.54      0.72      0.61      3166



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
