In [1]:
import json
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

In [2]:
!pip install einops



In [3]:
with open('/kaggle/input/data-ds200/data_labeled_soft_cleaned.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

def build_text(item):
    parts = [item.get("post_content", "")]
    if 'comment' in item:
        parts += item['comment'].get('parent_comment_texts', [])
        if item['comment'].get('comment_text'):
            parts.append(item['comment']['comment_text'])
    img_descs = [desc['image_description'] for desc in item.get("image_descriptions", [])]
    parts += img_descs
    return "\n".join(parts)

In [4]:
data = []
for item in raw_data:
    text = build_text(item)
    aspects = [item.get("Aspect_1", "Other"), item.get("Aspect_2") or "null"]
    sentiments = item.get("Sentiment", ["null", "null"])
    data.append({
        "text": text,
        "aspect_1": aspects[0],
        "aspect_2": aspects[1],
        "sentiment_1": sentiments[0],
        "sentiment_2": sentiments[1] if aspects[1] != "null" else "null"
    })

In [5]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,text,aspect_1,aspect_2,sentiment_1,sentiment_2
0,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Other,negative,negative
1,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Other,negative,negative
2,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Health,negative,negative
3,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Health,negative,negative
4,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Other,negative,negative


In [6]:
all_aspects = pd.concat([df['aspect_1'], df['aspect_2']]).unique()
aspect_encoder = LabelEncoder()
aspect_encoder.fit(all_aspects)

df['aspect_1_enc'] = aspect_encoder.transform(df['aspect_1'])
df['aspect_2_enc'] = aspect_encoder.transform(df['aspect_2'])

In [7]:
print(aspect_encoder.classes_)

['Art' 'Fashion' 'Food' 'Health' 'Law' 'Other' 'Sport' 'null']


In [8]:
# sentiment_map = {"positive": 1.0, "neutral": 0.0, "negative": -1.0}
# df["sentiment_1_score"] = df["sentiment_1"].map(sentiment_map)
# df["sentiment_2_score"] = df["sentiment_2"].map(sentiment_map)

In [9]:
all_aspects = pd.concat([df['sentiment_1'], df['sentiment_2']]).unique()
sentiment_encoder = LabelEncoder()
sentiment_encoder.fit(all_aspects)

df['sentiment_1_enc'] = sentiment_encoder.transform(df['sentiment_1'])
df['sentiment_2_enc'] = sentiment_encoder.transform(df['sentiment_2'])

In [10]:
print(sentiment_encoder.classes_)

['negative' 'neutral' 'null' 'positive']


In [11]:
from collections import Counter
import numpy as np
import torch

def compute_class_weights(name, combined_labels):
    counts = Counter(combined_labels)
    total = sum(counts.values())
    print(f"\n{name} distribution:")
    for k, v in sorted(counts.items()):
        print(f"  Class {k}: {v} ({v/total:.2%})")

    num_classes = len(counts)
    weights = [total / (num_classes * counts[i]) for i in range(num_classes)]
    print(f"{name} weights: {np.round(weights, 3)}")
    return torch.tensor(weights, dtype=torch.float32)


In [12]:
# Gộp nhãn aspect
combined_aspect = pd.concat([df["aspect_1_enc"], df["aspect_2_enc"]]).tolist()
w_aspect = compute_class_weights("Aspect", combined_aspect)

# Gộp nhãn sentiment
combined_sentiment = pd.concat([df["sentiment_1_enc"], df["sentiment_2_enc"]]).tolist()
w_sentiment = compute_class_weights("Sentiment", combined_sentiment)



Aspect distribution:
  Class 0: 579 (4.85%)
  Class 1: 344 (2.88%)
  Class 2: 809 (6.77%)
  Class 3: 1894 (15.86%)
  Class 4: 2399 (20.09%)
  Class 5: 3294 (27.58%)
  Class 6: 624 (5.22%)
  Class 7: 2001 (16.75%)
Aspect weights: [2.579 4.34  1.845 0.788 0.622 0.453 2.393 0.746]

Sentiment distribution:
  Class 0: 6488 (54.32%)
  Class 1: 885 (7.41%)
  Class 2: 2001 (16.75%)
  Class 3: 2570 (21.52%)
Sentiment weights: [0.46  3.374 1.492 1.162]


In [13]:
df.head()

Unnamed: 0,text,aspect_1,aspect_2,sentiment_1,sentiment_2,aspect_1_enc,aspect_2_enc,sentiment_1_enc,sentiment_2_enc
0,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Other,negative,negative,4,5,0,0
1,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Other,negative,negative,4,5,0,0
2,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Health,negative,negative,4,3,0,0
3,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Health,negative,negative,4,3,0,0
4,TÒA ĐÃ TUYÊN PHẠT BỐ RUỘT 6 NĂM TÙ Bị cáo và...,Law,Other,negative,negative,4,5,0,0


In [14]:
texts = df["text"].tolist()

y_aspect = df[["aspect_1_enc", "aspect_2_enc"]].values
y_sentiment = df[["sentiment_1_enc", "sentiment_2_enc"]].values

texts_temp, texts_test, y_a_temp, y_a_test, y_s_temp, y_s_test = train_test_split(
    texts, y_aspect, y_sentiment, test_size=0.1, random_state=42
)

texts_train, texts_val, y_a_train, y_a_val, y_s_train, y_s_val = train_test_split(
    texts_temp, y_a_temp, y_s_temp, test_size=0.2222, random_state=42
)

In [15]:
class MultiTaskClassifierWithEmbedding(nn.Module):
    def __init__(self, model_name, lstm_hidden_dim, lstm_layers, hidden_dims,
                 num_aspects, num_sentiments, task_specific_dims=None, device='cuda'):
        super().__init__()
        self.device = device
        
        # Tokenizer + embedding model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.embedding_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        self.embedding_model.to(device)
        self.embedding_model.eval()
        for p in self.embedding_model.parameters():
            p.requires_grad = False

        input_dim = self.embedding_model.config.hidden_size

        # BiLSTM
        self.bilstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True
        )

        # Shared MLP
        prev_dim = lstm_hidden_dim * 2
        layers = []
        for hdim in hidden_dims:
            layers += [nn.Linear(prev_dim, hdim), nn.ReLU(), nn.Dropout(0.3), nn.BatchNorm1d(hdim)]
            prev_dim = hdim
        self.shared = nn.Sequential(*layers)

        if task_specific_dims is None:
            task_specific_dims = [prev_dim // 2, prev_dim // 4]

        self.aspect_1_layers = self._create_task_head(prev_dim, task_specific_dims, num_aspects)
        self.aspect_2_layers = self._create_task_head(prev_dim, task_specific_dims, num_aspects)
        self.sentiment_1_layers = self._create_task_head(prev_dim, task_specific_dims, num_sentiments)
        self.sentiment_2_layers = self._create_task_head(prev_dim, task_specific_dims, num_sentiments)

    def _create_task_head(self, input_dim, hidden_dims, output_dim):
        layers = []
        for hdim in hidden_dims:
            layers += [nn.Linear(input_dim, hdim), nn.ReLU(), nn.Dropout(0.2)]
            input_dim = hdim
        layers.append(nn.Linear(input_dim, output_dim))
        return nn.Sequential(*layers)

    def forward(self, text_batch):
        chunk_size = 512
        all_embeddings = []
        seq_lengths = []

        with torch.no_grad():
            for text in text_batch:
                tokens = self.tokenizer.encode(text, truncation=False)
                chunks = []
                for i in range(0, len(tokens), chunk_size):
                    chunk = tokens[i:i+chunk_size]
                    chunk += [self.tokenizer.pad_token_id] * (chunk_size - len(chunk))
                    input_ids = torch.tensor([chunk]).to(self.device)
                    attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
        
                    outputs = self.embedding_model(input_ids=input_ids, attention_mask=attention_mask)
                    token_embs = outputs.last_hidden_state.squeeze(0)  # (T_chunk, D)
                    chunks.append(token_embs)
        
                full_sequence = torch.cat(chunks, dim=0)  # (total_tokens, D)
                all_embeddings.append(full_sequence)
                seq_lengths.append(full_sequence.size(0))

        # Pad to (B, T_max, D)
        padded_input = pad_sequence(all_embeddings, batch_first=True)  # (B, T, D)
        padded_input = padded_input.to(torch.float32)  # Ensure float32
        padded_input = padded_input.to(self.device)

        # Cho vào BiLSTM (không cần pack/unpack)
        lstm_out, _ = self.bilstm(padded_input)  # (B, T, 2H)

        # Mean pooling theo chiều thời gian
        pooled = torch.mean(lstm_out, dim=1)  # (B, 2H)

        # Shared MLP
        shared_features = self.shared(pooled)

        # Multi-task outputs
        a1 = self.aspect_1_layers(shared_features)
        a2 = self.aspect_2_layers(shared_features)
        s1 = self.sentiment_1_layers(shared_features)
        s2 = self.sentiment_2_layers(shared_features)

        return a1, a2, s1, s2

In [16]:
class Trainer:
    def __init__(self, model, train_data, val_data, device='cuda',
                 lr=1e-3, batch_size=64, num_epochs=10, step_size=10, gamma=0.5,
                 class_weights=None):
        self.model = model.to(device)
        self.train_texts, self.train_y_a, self.train_y_s = train_data
        self.val_texts, self.val_y_a, self.val_y_s = val_data
        self.device = device
        self.lr = lr
        self.batch_size = batch_size
        self.num_epochs = num_epochs

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=step_size, gamma=gamma)

        self.ce_a1 = nn.CrossEntropyLoss(weight=class_weights['a1'].to(device) if class_weights and 'a1' in class_weights else None)
        self.ce_a2 = nn.CrossEntropyLoss(weight=class_weights['a2'].to(device) if class_weights and 'a2' in class_weights else None)
        self.ce_s1 = nn.CrossEntropyLoss(weight=class_weights['s1'].to(device) if class_weights and 's1' in class_weights else None)
        self.ce_s2 = nn.CrossEntropyLoss(weight=class_weights['s2'].to(device) if class_weights and 's2' in class_weights else None)

    def get_batches(self, texts, y_aspect, y_sentiment):
        n = len(texts)
        for i in range(0, n, self.batch_size):
            yield (
                texts[i:i+self.batch_size],
                torch.tensor(y_aspect[i:i+self.batch_size].tolist(), dtype=torch.long),
                torch.tensor(y_sentiment[i:i+self.batch_size].tolist(), dtype=torch.long)
            )

    def train_epoch(self):
        self.model.train()
        total_loss = 0
        total_batches = 0

        for text_batch, yb_a, yb_s in self.get_batches(self.train_texts, self.train_y_a, self.train_y_s):
            yb_a = yb_a.to(self.device)
            yb_s = yb_s.to(self.device)

            self.optimizer.zero_grad()
            out_a1, out_a2, out_s1, out_s2 = self.model(text_batch)

            loss_a1 = self.ce_a1(out_a1, yb_a[:, 0])
            loss_a2 = self.ce_a2(out_a2, yb_a[:, 1])
            loss_s1 = self.ce_s1(out_s1, yb_s[:, 0])
            loss_s2 = self.ce_s2(out_s2, yb_s[:, 1])

            loss = loss_a1 + loss_a2 + loss_s1 + loss_s2
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()
            total_batches += 1

        return total_loss / total_batches

    def eval_epoch(self):
        self.model.eval()
        total_loss = 0
        total_batches = 0
        y_true_a1, y_pred_a1 = [], []
        y_true_a2, y_pred_a2 = [], []
        y_true_s1, y_pred_s1 = [], []
        y_true_s2, y_pred_s2 = [], []

        with torch.no_grad():
            for text_batch, yb_a, yb_s in self.get_batches(self.val_texts, self.val_y_a, self.val_y_s):
                yb_a = yb_a.to(self.device)
                yb_s = yb_s.to(self.device)

                out_a1, out_a2, out_s1, out_s2 = self.model(text_batch)

                loss_a1 = self.ce_a1(out_a1, yb_a[:, 0])
                loss_a2 = self.ce_a2(out_a2, yb_a[:, 1])
                loss_s1 = self.ce_s1(out_s1, yb_s[:, 0])
                loss_s2 = self.ce_s2(out_s2, yb_s[:, 1])

                loss = loss_a1 + loss_a2 + loss_s1 + loss_s2
                total_loss += loss.item()
                total_batches += 1

                y_true_a1 += yb_a[:, 0].cpu().tolist()
                y_pred_a1 += out_a1.argmax(dim=1).cpu().tolist()
                y_true_a2 += yb_a[:, 1].cpu().tolist()
                y_pred_a2 += out_a2.argmax(dim=1).cpu().tolist()
                y_true_s1 += yb_s[:, 0].cpu().tolist()
                y_pred_s1 += out_s1.argmax(dim=1).cpu().tolist()
                y_true_s2 += yb_s[:, 1].cpu().tolist()
                y_pred_s2 += out_s2.argmax(dim=1).cpu().tolist()

        f1_a1 = f1_score(y_true_a1, y_pred_a1, average="macro")
        f1_a2 = f1_score(y_true_a2, y_pred_a2, average="macro")
        f1_s1 = f1_score(y_true_s1, y_pred_s1, average="macro")
        f1_s2 = f1_score(y_true_s2, y_pred_s2, average="macro")
        avg_f1 = (f1_a1 + f1_a2 + f1_s1 + f1_s2) / 4

        print(f"F1 Scores — Aspect1: {f1_a1:.4f}, Aspect2: {f1_a2:.4f}, Sent1: {f1_s1:.4f}, Sent2: {f1_s2:.4f} | Avg: {avg_f1:.4f}")
        return total_loss / total_batches

    def train(self):
        for epoch in range(1, self.num_epochs + 1):
            train_loss = self.train_epoch()
            val_loss = self.eval_epoch()
            self.scheduler.step()

            current_lr = self.scheduler.get_last_lr()[0]
            print(f"Epoch {epoch}/{self.num_epochs} | LR: {current_lr:.6f} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

In [17]:
def get_batch(X, y_a, y_s, batch_size):
    for i in range(0, len(X), batch_size):
        yield X[i:i+batch_size], y_a[i:i+batch_size], y_s[i:i+batch_size]

In [18]:
num_aspects = len(aspect_encoder.classes_)
num_sentiments = len(sentiment_encoder.classes_)

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [20]:
train_data = (texts_train, y_a_train, y_s_train)
val_data = (texts_val, y_a_val, y_s_val)

In [21]:
model = MultiTaskClassifierWithEmbedding(
    model_name="jinaai/jina-embeddings-v3",
    lstm_hidden_dim=256,
    lstm_layers=2,
    hidden_dims=[1024, 512, 256, 128],
    num_aspects=num_aspects,
    num_sentiments=num_sentiments,
    task_specific_dims=[128, 64, 32],
    device=device
)

trainer = Trainer(
    model=model,
    train_data=train_data,
    val_data=val_data,
    device=device,
    lr=4e-3,
    batch_size=32,
    num_epochs=50,
    step_size=5,
    gamma=0.75,
    class_weights={
        'a1': w_aspect,
        'a2': w_aspect,
        's1': w_sentiment,
        's2': w_sentiment
    }
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_xlm_roberta.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py: 0.00B [00:00, ?B/s]

modeling_xlm_roberta.py: 0.00B [00:00, ?B/s]

rotary.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py: 0.00B [00:00, ?B/s]

mlp.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


stochastic_depth.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


mha.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- mlp.py
- stochastic_depth.py
- mha.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- rotary.py
- xlm_padding.py
- block.py
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_lora.py
- modeling_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
2025-07-10 06:51:16.615214: E external/local_x

model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

In [22]:
trainer.train()

F1 Scores — Aspect1: 0.2413, Aspect2: 0.1236, Sent1: 0.4244, Sent2: 0.3376 | Avg: 0.2817
Epoch 1/50 | LR: 0.004000 | Train Loss: 5.6440 | Val Loss: 5.3837
F1 Scores — Aspect1: 0.2743, Aspect2: 0.1377, Sent1: 0.4279, Sent2: 0.3088 | Avg: 0.2872
Epoch 2/50 | LR: 0.004000 | Train Loss: 5.3242 | Val Loss: 5.0758
F1 Scores — Aspect1: 0.2757, Aspect2: 0.2062, Sent1: 0.4238, Sent2: 0.2933 | Avg: 0.2997
Epoch 3/50 | LR: 0.004000 | Train Loss: 5.1660 | Val Loss: 5.0802
F1 Scores — Aspect1: 0.2618, Aspect2: 0.1626, Sent1: 0.4267, Sent2: 0.3379 | Avg: 0.2972
Epoch 4/50 | LR: 0.004000 | Train Loss: 5.0680 | Val Loss: 5.6371
F1 Scores — Aspect1: 0.2437, Aspect2: 0.2116, Sent1: 0.4434, Sent2: 0.3249 | Avg: 0.3059
Epoch 5/50 | LR: 0.003000 | Train Loss: 5.0212 | Val Loss: 5.2212
F1 Scores — Aspect1: 0.3134, Aspect2: 0.1454, Sent1: 0.2552, Sent2: 0.1517 | Avg: 0.2164
Epoch 6/50 | LR: 0.003000 | Train Loss: 4.8628 | Val Loss: 62.7437
F1 Scores — Aspect1: 0.3773, Aspect2: 0.1866, Sent1: 0.4238, Sent2: 0

In [23]:
from sklearn.metrics import classification_report

def evaluate_on_test(model, test_data, batch_size=8, device='cuda'):
    model.eval()
    texts, y_a, y_s = test_data

    y_true_a1, y_pred_a1 = [], []
    y_true_a2, y_pred_a2 = [], []
    y_true_s1, y_pred_s1 = [], []
    y_true_s2, y_pred_s2 = [], []

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_y_a = torch.tensor(y_a[i:i+batch_size], dtype=torch.long).to(device)
            batch_y_s = torch.tensor(y_s[i:i+batch_size], dtype=torch.long).to(device)

            # Gọi forward với batch text (model tự tokenize & nhúng)
            out_a1, out_a2, out_s1, out_s2 = model(batch_texts)

            y_true_a1 += batch_y_a[:, 0].cpu().tolist()
            y_pred_a1 += out_a1.argmax(dim=1).cpu().tolist()
            y_true_a2 += batch_y_a[:, 1].cpu().tolist()
            y_pred_a2 += out_a2.argmax(dim=1).cpu().tolist()
            y_true_s1 += batch_y_s[:, 0].cpu().tolist()
            y_pred_s1 += out_s1.argmax(dim=1).cpu().tolist()
            y_true_s2 += batch_y_s[:, 1].cpu().tolist()
            y_pred_s2 += out_s2.argmax(dim=1).cpu().tolist()

    print("\n--- Test Evaluation ---")
    print("Aspect 1:")
    print(classification_report(y_true_a1, y_pred_a1, digits=4))
    print("Aspect 2:")
    print(classification_report(y_true_a2, y_pred_a2, digits=4))
    print("Sentiment 1:")
    print(classification_report(y_true_s1, y_pred_s1, digits=4))
    print("Sentiment 2:")
    print(classification_report(y_true_s2, y_pred_s2, digits=4))


In [24]:
evaluate_on_test(model, (texts_test, y_a_test, y_s_test), batch_size=8, device=device)



--- Test Evaluation ---
Aspect 1:
              precision    recall  f1-score   support

           0     0.7021    0.9429    0.8049        35
           1     0.6429    0.5294    0.5806        34
           2     0.3913    0.7200    0.5070        50
           3     0.6720    0.6462    0.6588       130
           4     0.7118    0.6541    0.6817       185
           5     0.6912    0.4393    0.5371       107
           6     0.8235    0.9825    0.8960        57

    accuracy                         0.6605       598
   macro avg     0.6621    0.7020    0.6666       598
weighted avg     0.6788    0.6605    0.6581       598

Aspect 2:
              precision    recall  f1-score   support

           0     0.2000    0.4444    0.2759         9
           1     0.2273    0.7692    0.3509        13
           2     0.3594    0.6216    0.4554        37
           3     0.4098    0.4464    0.4274        56
           4     0.5200    0.2063    0.2955        63
           5     0.5645    0.3302

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
test_text = """Khoảnh khắc Quang Hùng MasterD "flex" sự học tiếng Hàn cấp tốc để giao lưu với anh Long. 
Fan boy này đi đu idol thành công quá rồi 🤣🤣 
Video: Phạm Thanh Hoa 
Gọi ộp pa thì đội tóc giả nữa a kkk"""


In [26]:
def predict(text):
    model.eval()
    with torch.no_grad():
        out_a1, out_a2, out_s1, out_s2 = model([text])  # Gói trong list vì batch size = 1

        pred_a1 = torch.argmax(out_a1, dim=1).item()
        pred_a2 = torch.argmax(out_a2, dim=1).item()
        pred_s1 = torch.argmax(out_s1, dim=1).item()
        pred_s2 = torch.argmax(out_s2, dim=1).item()

        aspect_1_label = aspect_encoder.inverse_transform([pred_a1])[0]
        aspect_2_label = aspect_encoder.inverse_transform([pred_a2])[0]
        sentiment_1_label = sentiment_encoder.inverse_transform([pred_s1])[0]
        sentiment_2_label = sentiment_encoder.inverse_transform([pred_s2])[0]

        print("=== PREDICTION RESULT ===")
        print(f"Aspect 1:    {aspect_1_label}")
        print(f"Sentiment 1: {sentiment_1_label}")
        print(f"Aspect 2:    {aspect_2_label}")
        print(f"Sentiment 2: {sentiment_2_label}")


In [27]:
predict(test_text)

=== PREDICTION RESULT ===
Aspect 1:    Art
Sentiment 1: positive
Aspect 2:    null
Sentiment 2: null


In [28]:
import torch
import joblib
import os

os.makedirs("saved_model", exist_ok=True)

torch.save({
    'model_state_dict': model.state_dict(),
    'model_name': "jinaai/jina-embeddings-v3",
    'lstm_hidden_dim': 256,
    'lstm_layers': 2,
    'hidden_dims': [1024, 512, 256, 128],
    'task_specific_dims': [128, 64, 32],
    'num_aspects': num_aspects,
    'num_sentiments': num_sentiments
}, "saved_model/model.pt")

# Lưu encoder
joblib.dump(aspect_encoder, "saved_model/aspect_encoder.pkl")
joblib.dump(sentiment_encoder, "saved_model/sentiment_encoder.pkl")


['saved_model/sentiment_encoder.pkl']