In [1]:
# Show the GPU visible to this kernel.
!nvidia-smi
# Print the preinstalled scikit-learn version, then install the pinned 1.5.1 release.
# NOTE(review): presumably 1.5.1 matches the version used to pickle the joblib
# artifacts loaded later in this notebook -- confirm.
!pip show scikit-learn
!pip install scikit-learn==1.5.1

Thu Sep 18 02:20:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P0             28W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [2]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PretrainedConfig, PreTrainedModel, BertTokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import json

import warnings

# Silence UserWarning-category warnings raised from sklearn modules. The intent is to
# hide the InconsistentVersionWarning emitted when estimators pickled with a different
# scikit-learn version are loaded. Note the filter is limited to module="sklearn";
# UserWarnings from other packages are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

import joblib

# Fetch the NLTK Punkt tokenizer data needed by sent_tokenize / word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

2025-09-18 02:21:23.818910: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758162084.022413      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758162084.080446      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
class SLACBERTModelConfig(PretrainedConfig):
    """Configuration object for ``SLACBERTModel``.

    Args:
        num_classes: Output width of the classification head (default ``1``).
        pos_weight: Optional value handed to ``BCEWithLogitsLoss(pos_weight=...)``
            by the model; ``None`` means the loss is left unweighted.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "bert_model"

    def __init__(self, num_classes=1, pos_weight=None, **kwargs):
        # The base class consumes its own options first; the two
        # model-specific fields are attached afterwards.
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.pos_weight = pos_weight

class SLACBERTModel(PreTrainedModel):
    """BERT encoder with a dropout + linear head trained with ``BCEWithLogitsLoss``.

    The head emits ``config.num_classes`` raw logits per input sequence; callers
    apply the sigmoid themselves.
    """

    config_class = SLACBERTModelConfig

    def __init__(self, config):
        """Build the encoder, the classification head and the loss.

        Args:
            config: A ``SLACBERTModelConfig``; ``num_classes`` sizes the head and
                ``pos_weight`` (if not ``None``) weights the positive class in the loss.
        """
        super().__init__(config)
        # NOTE(review): the bert-base-uncased weights are loaded on every construction,
        # including when this class is instantiated through ``from_pretrained`` --
        # confirm that the extra download/load is intended.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        # 768 is the hidden size of bert-base-uncased.
        self.fc = nn.Linear(768, config.num_classes)

        pos_weight = None
        if config.pos_weight is not None:
            pos_weight = torch.tensor(config.pos_weight, dtype=torch.float32)
        # pos_weight=None is the loss's default, i.e. an unweighted loss.
        self.criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Classify a batch of token-id sequences.

        Args:
            input_ids: Token ids passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.
            labels: Optional targets. They are cast to the logits' dtype and reshaped
                to the logits' shape before the loss is computed, so integer labels
                or a flat ``(batch,)`` vector for a single-logit head are accepted.

        Returns:
            ``SequenceClassifierOutput`` with ``logits`` (raw scores) and ``loss``
            (``None`` when no labels were given).
        """
        # With return_dict=False the encoder returns a tuple whose second entry is
        # the pooled output.
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        logits = self.fc(self.dropout(pooled_output))

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss needs a floating-point target with the same shape as
            # the input; normalise here instead of failing on int or (batch,) labels.
            targets = labels.to(dtype=logits.dtype).reshape(logits.shape)
            loss = self.criterion(logits, targets)

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [4]:
class RLCCBERTConfig(PretrainedConfig):
    """Configuration object for ``RLCCBERTModel``.

    Args:
        absa_method: Stored as given; the model enables its ABSA branch when this
            value is truthy.
        num_classes: Output width of each classification head (default ``2``).
        class_weight: Optional per-class weights handed to ``CrossEntropyLoss`` by
            the model; ``None`` means the loss is left unweighted.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "bert_with_absa"

    def __init__(self, absa_method=None, num_classes=2, class_weight=None, **kwargs):
        # Base-class options are handled first, then the model-specific fields.
        super().__init__(**kwargs)
        self.absa_method = absa_method
        self.num_classes = num_classes
        self.class_weight = class_weight

class RLCCInnerBert(nn.Module):
    """BERT encoder fed with pre-computed embeddings, followed by dropout + linear head."""

    def __init__(self, num_classes):
        """Create the encoder and a ``num_classes``-wide linear head."""
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        # 768 is the hidden size of bert-base-uncased.
        self.fc = nn.Linear(768, num_classes)

    def forward(self, inputs_embeds=None, attention_mask=None, token_type_ids=None):
        """Return raw class logits for a batch given as ``inputs_embeds``."""
        encoder_outputs = self.bert(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        # Tuple output: index 1 is the pooled representation.
        pooled_output = encoder_outputs[1]
        return self.fc(self.dropout(pooled_output))

class RLCCBERTModel(PreTrainedModel):
    """Review-pair classifier with an optional ABSA score branch.

    ``bert_sent`` scores the tokenised review pair. When ``config.absa_method`` is
    truthy, a second encoder (``bert_absa``) scores a two-position sequence built from
    the two ABSA scalars, and its logits are added to the sentence logits.
    """

    config_class = RLCCBERTConfig

    def __init__(self, config):
        """Build the encoder(s) and the cross-entropy loss from ``config``."""
        super().__init__(config)
        self.num_classes = config.num_classes
        self.absa_method = config.absa_method

        self.bert_sent = RLCCInnerBert(self.num_classes)

        if self.absa_method:
            # Projects each scalar ABSA score to a 768-wide "token embedding".
            self.absa_fc = nn.Linear(1, 768)
            self.bert_absa = RLCCInnerBert(self.num_classes)

        class_weight = None
        if config.class_weight is not None:
            class_weight = torch.tensor(config.class_weight, dtype=torch.float32)
        # The loss keeps ``weight`` as a module buffer, so it follows the model through
        # ``.to(...)``; no need to pin it to a notebook-global device here.
        self.criterion = nn.CrossEntropyLoss(weight=class_weight)

        self.init_weights()

    def forward(self, input_ids=None, absa_1=None, absa_2=None, attention_mask=None, token_type_ids=None, labels=None):
        """Score a batch of review pairs.

        Args:
            input_ids: Token ids of the paired reviews.
            absa_1, absa_2: ABSA scores of the first / second review. Only read when
                the ABSA branch is enabled; they are concatenated along dim 1 after
                projection, so each is expected to carry a length-1 dim 1 and a
                trailing feature dim of size 1 (the caller in this notebook passes
                ``(1, 1, 1)`` tensors).
            attention_mask: Attention mask for ``input_ids``.
            token_type_ids: Segment ids for ``input_ids``.
            labels: Optional class indices for the cross-entropy loss.

        Returns:
            ``SequenceClassifierOutput`` with ``logits`` and ``loss`` (``None``
            without labels).
        """
        # NOTE(review): the embedding layer is applied here and the result is then
        # passed to the same BertModel as ``inputs_embeds`` -- confirm this matches how
        # the checkpoints were trained before changing it.
        inputs_embeds = self.bert_sent.bert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
        logits = self.bert_sent(inputs_embeds, attention_mask, token_type_ids)

        if self.absa_method:
            absa_pair = torch.cat((self.absa_fc(absa_1), self.absa_fc(absa_2)), dim=1)
            # Segment 0 for the first review's score, segment 1 for the second's.
            # Created on the inputs' own device rather than a global one.
            token_type_ids_absa = torch.tensor([[0, 1]], device=absa_pair.device).repeat(absa_pair.shape[0], 1)
            # Out-of-place add: leaves the sentence logits tensor itself untouched.
            logits = logits + self.bert_absa(absa_pair, None, token_type_ids_absa)

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [5]:
class XGBoostModel:
    """Sentence-level ABSA scorer: adjective lookup first, XGBoost pipeline as fallback."""

    def __init__(self, aspect, artifact_dir="/kaggle/input/absa/pytorch/default/4"):
        """Load the per-aspect pipeline and the adjective dictionary.

        Args:
            aspect: Aspect name used to pick ``{aspect}_xgb_pipeline.joblib``.
            artifact_dir: Directory holding ``checkpoints/`` and ``adjective.json``.
                Defaults to the Kaggle input path this notebook was written for.
        """
        artifact_filename = f"{artifact_dir}/checkpoints/{aspect}_xgb_pipeline.joblib"
        # SECURITY: joblib.load unpickles arbitrary objects -- only point this at
        # artifacts from a trusted source.
        artifacts = joblib.load(artifact_filename)

        self.model = artifacts["model"]
        self.vectorizer = artifacts["vectorizer"]
        self.imputer = artifacts["imputer"]

        # Rule-based dictionary: keys are parsed with int(), values are the word
        # collections tested with ``in``.
        with open(f"{artifact_dir}/adjective.json", "r", encoding="utf-8") as f:
            self.adj_list = json.load(f)

    def predict(self, sentences):
        """Score each sentence.

        A sentence whose whitespace-separated words hit the adjective dictionary gets
        the mean of the matched integer keys; otherwise it is scored by the
        vectorizer -> imputer -> model pipeline.

        Args:
            sentences: Iterable of sentence strings.

        Returns:
            List with one score per sentence, in input order.
        """
        scores = []

        for sent in sentences:
            # Rule-based pass first: one hit per (word, key) pair that matches.
            rule_hits = [
                int(key)
                for word in sent.split()
                for key, words in self.adj_list.items()
                if word in words
            ]

            if rule_hits:
                # At least one rule matched -> use the rules only.
                scores.append(sum(rule_hits) / len(rule_hits))
                continue

            # No rule matched -> fall back to the XGBoost pipeline.
            features = self.vectorizer.transform([sent])
            features = self.imputer.transform(features.toarray())
            scores.append(self.model.predict(features)[0])

        return scores


In [6]:
class OverallMode(nn.Module):
    """End-to-end comparison of two reviews for a single aspect.

    Pipeline of ``forward``: split both reviews into cleaned sentences, keep the
    sentences the SLAC classifier assigns to the aspect, optionally score them with the
    ABSA model, then let the RLCC model classify the pair.
    """

    # Length (in word-piece ids, special tokens included) every input is padded/truncated to.
    MAX_SEQ_LEN = 128
    # Value returned by ``forward`` when no comparison can be made.
    NO_COMPARISON = 2

    def __init__(self, aspect, slac_id, rlcc_id, absa_method, device):
        """Load the SLAC and RLCC checkpoints and, if needed, the ABSA model.

        Args:
            aspect: One of ``'appearance'``, ``'aroma'``, ``'palate'``, ``'taste'``
                (anything else raises ``KeyError``).
            slac_id: Checkpoint identifier passed to ``SLACBERTModel.from_pretrained``.
            rlcc_id: Checkpoint identifier passed to ``RLCCBERTModel.from_pretrained``.
            absa_method: ``'min'``, ``'max'`` or ``'avg'`` to aggregate per-sentence
                ABSA scores. Any other value (``None``, ``'None'``, ``'none'``, ...)
                disables the ABSA step, and no ABSA model is loaded.
            device: Device the models and input tensors are placed on.
        """
        super().__init__()
        self.aspect = aspect
        self.device = device

        self.aspect_index = {'appearance': 0, 'aroma': 1, 'palate': 2, 'taste': 3}
        self.aspect_idx = self.aspect_index[aspect]

        self.slac_model = SLACBERTModel.from_pretrained(slac_id).to(device)
        self.rlcc_model = RLCCBERTModel.from_pretrained(rlcc_id).to(device)

        self.slac_model.eval()
        self.rlcc_model.eval()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.absa_method = absa_method
        # Same predicate as ``forward`` uses, so the ABSA model is loaded exactly when
        # it will be queried.
        if self._resolve_absa_reducer(absa_method) is not None:
            self.absa_model = XGBoostModel(aspect)

    @staticmethod
    def _mean(scores):
        """Arithmetic mean of a non-empty sequence of scores."""
        return sum(scores) / len(scores)

    @staticmethod
    def _resolve_absa_reducer(absa_method):
        """Map an ``absa_method`` value to its aggregation callable.

        Returns ``min``, ``max`` or the mean for ``'min'`` / ``'max'`` / ``'avg'``,
        and ``None`` (ABSA disabled) for every other value.
        """
        if not isinstance(absa_method, str):
            return None
        reducers = {'min': min, 'max': max, 'avg': OverallMode._mean}
        return reducers.get(absa_method)

    @staticmethod
    def _truncate_seq(tokens, max_length):
        """Trim ``tokens`` in place to at most ``max_length`` items and return it."""
        del tokens[max(max_length, 0):]
        return tokens

    @staticmethod
    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """Trim two token lists in place until their combined length fits.

        One token at a time is dropped from the end of the longer list (from
        ``tokens_b`` on ties).
        """
        while len(tokens_a) + len(tokens_b) > max_length:
            longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
            longer.pop()

    @staticmethod
    def _split_clean_sentences(text):
        """Lower-case ``text``, split it into sentences and strip non-word characters.

        Sentences with one token or fewer (per ``word_tokenize``) are dropped before
        cleaning.
        """
        sentences = sent_tokenize(text.lower())
        return [re.sub(r'\W+', ' ', s).strip() for s in sentences if len(word_tokenize(s)) > 1]

    def _get_aspect_sentences(self, review_sentences):
        """Return the sentences the SLAC model assigns to this aspect (probability > 0.5).

        Args:
            review_sentences: List of cleaned sentence strings.

        Returns:
            Sub-list of ``review_sentences`` in the original order; empty for empty input.
        """
        if not review_sentences:
            return []

        max_len = self.MAX_SEQ_LEN
        batch_ids = []
        batch_mask = []
        for sent in review_sentences:
            tokens = self.tokenizer.tokenize(sent)
            self._truncate_seq(tokens, max_len - 2)  # leave room for [CLS] and [SEP]
            tokens = ["[CLS]"] + tokens + ["[SEP]"]

            ids = self.tokenizer.convert_tokens_to_ids(tokens)
            pad = max_len - len(ids)
            batch_mask.append([1] * len(ids) + [0] * pad)
            batch_ids.append(ids + [0] * pad)

        # One tensor per batch instead of one device tensor per sentence.
        input_ids = torch.tensor(batch_ids, dtype=torch.long, device=self.device)
        attention_mask = torch.tensor(batch_mask, dtype=torch.long, device=self.device)

        with torch.no_grad():
            logits = self.slac_model(input_ids, attention_mask).logits

        # Expects exactly one logit per sentence; the reshape fails loudly otherwise.
        # A single .tolist() replaces one device sync per sentence.
        keep = (torch.sigmoid(logits) > 0.5).reshape(len(review_sentences)).tolist()
        return [sent for sent, flag in zip(review_sentences, keep) if flag]

    def _get_review_comparative(self, review_1, review_2, absa_1, absa_2):
        """Run the RLCC model on one review pair and return class probabilities.

        Args:
            review_1, review_2: Text of the aspect sentences of each review.
            absa_1, absa_2: Scalar ABSA scores, or ``None`` to skip the ABSA inputs
                (both must be given for either to be used).

        Returns:
            Softmax over the RLCC logits for the single-pair batch.
        """
        max_len = self.MAX_SEQ_LEN
        tokens_a = self.tokenizer.tokenize(review_1)
        tokens_b = self.tokenizer.tokenize(review_2)

        self._truncate_seq_pair(tokens_a, tokens_b, max_len - 3)  # [CLS] + 2x [SEP]

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        padding = [0] * (max_len - len(input_ids))

        input_ids = torch.tensor([input_ids + padding], dtype=torch.long, device=self.device)
        attention_mask = torch.tensor([input_mask + padding], dtype=torch.long, device=self.device)
        segment_ids = torch.tensor([segment_ids + padding], dtype=torch.long, device=self.device)

        if absa_1 is not None and absa_2 is not None:
            # Shape (1, 1, 1): batch of one, one position, one feature.
            absa_1 = torch.tensor([absa_1], dtype=torch.float32).view(1, 1, 1).to(self.device)
            absa_2 = torch.tensor([absa_2], dtype=torch.float32).view(1, 1, 1).to(self.device)

        with torch.no_grad():
            logits = self.rlcc_model(input_ids, absa_1, absa_2, attention_mask, segment_ids).logits
            return torch.softmax(logits, dim=-1)

    def forward(self, review_1, review_2):
        """Compare two raw reviews on this model's aspect.

        Returns:
            ``2`` when either review has no usable sentence or no aspect sentence;
            otherwise the arg-max class index of the RLCC model minus one.
        """
        review_1_sent = self._split_clean_sentences(review_1)
        review_2_sent = self._split_clean_sentences(review_2)
        if not review_1_sent or not review_2_sent:
            return self.NO_COMPARISON

        review_1_aspect_sentences = self._get_aspect_sentences(review_1_sent)
        review_2_aspect_sentences = self._get_aspect_sentences(review_2_sent)
        if not review_1_aspect_sentences or not review_2_aspect_sentences:
            return self.NO_COMPARISON

        absa_1 = None
        absa_2 = None
        reduce_scores = self._resolve_absa_reducer(self.absa_method)
        if reduce_scores is not None:
            # Only query the ABSA model when its scores are actually aggregated.
            absa_1 = reduce_scores(self.absa_model.predict(review_1_aspect_sentences))
            absa_2 = reduce_scores(self.absa_model.predict(review_2_aspect_sentences))

        review_1_tuple = ", ".join(review_1_aspect_sentences)
        review_2_tuple = ", ".join(review_2_aspect_sentences)

        result = self._get_review_comparative(review_1_tuple, review_2_tuple, absa_1, absa_2)
        # Shift class indices 0, 1, 2, ... to labels -1, 0, 1, ...
        return int(torch.argmax(result, dim=1)[0]) - 1

In [7]:
# Label scheme shared by the evaluation helpers
labels = [-1, 0, 1, 2]
positive_labels = [-1, 0, 1]  # Comparison outcomes: A<B, A=B, A>B
negative_label = 2            # "No comparison" outcome

from collections import defaultdict
def compute_metrics(true_labels, pred_labels):
    """Count per-class TP / FP / FN over paired gold and predicted labels.

    Only the positive (comparison) labels are ever counted:
      * exact match on a positive label          -> TP for that label
      * positive gold, "no comparison" predicted -> FN for the gold label
      * "no comparison" gold, positive predicted -> FP for the predicted label
      * two different positive labels            -> FN for gold and FP for predicted
    Every other pair contributes nothing.

    Returns:
        Tuple ``(tp, fp, fn)`` of ``defaultdict(int)`` keyed by label.
    """
    tp, fp, fn = defaultdict(int), defaultdict(int), defaultdict(int)

    for actual, predicted in zip(true_labels, pred_labels):
        actual_is_positive = actual in positive_labels

        if actual == predicted:
            # Agreement only counts when it is on a comparison label.
            if actual_is_positive:
                tp[actual] += 1
            continue

        predicted_is_positive = predicted in positive_labels
        if actual_is_positive and predicted == negative_label:
            # A real comparison was missed.
            fn[actual] += 1
        elif actual == negative_label and predicted_is_positive:
            # A comparison was predicted where there is none.
            fp[predicted] += 1
        elif actual_is_positive and predicted_is_positive:
            # Wrong comparison label: a miss for gold, an excess for the prediction.
            fn[actual] += 1
            fp[predicted] += 1

    return tp, fp, fn

In [8]:
from tqdm import tqdm

# Evaluate the four aspect models and score their pooled predictions
def overall_evaluate_all_aspects(sampling_method, absa_method, eval_dataset):
    """Run every aspect model over ``eval_dataset`` and score all predictions together.

    Args:
        sampling_method: Sampling strategy name used in the checkpoint identifiers.
        absa_method: ABSA aggregation name; used in the RLCC checkpoint identifier and
            passed to ``OverallMode``.
        eval_dataset: Indexable collection of samples with ``reviewText_1``,
            ``reviewText_2`` and one gold label per aspect name.

    Returns:
        Dict with the method tag and micro/macro precision, recall and F1 computed
        over the positive labels.
    """
    aspects = ['appearance', 'aroma', 'palate', 'taste']

    # RLCC checkpoints spell this one sampling method with a hyphen.
    rlcc_sampling_method = "class-weight" if sampling_method == "class_weight" else sampling_method

    def ratio(numerator, denominator):
        # An empty denominator yields 0 rather than a division error.
        return numerator / denominator if denominator > 0 else 0

    def f_score(precision, recall):
        return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    # Predictions of each aspect model, keyed by aspect name
    aspect_predictions = {}
    for aspect in aspects:
        model = OverallMode(
            aspect,
            f"trungpq/slac-new-{aspect}-{sampling_method}",
            f"trungpq/rlcc-new-{aspect}-{rlcc_sampling_method}-absa-{absa_method}",
            absa_method,
            device
        )
        aspect_predictions[aspect] = [
            model(sample['reviewText_1'], sample['reviewText_2'])
            for sample in tqdm(eval_dataset)
        ]
        del model

    # Flatten sample-major, aspect-minor so predictions and gold labels line up
    all_predictions_flat = []
    all_true_labels_flat = []
    for i in range(len(eval_dataset)):
        row = eval_dataset[i]
        for aspect in aspects:
            all_predictions_flat.append(aspect_predictions[aspect][i])
            all_true_labels_flat.append(row[aspect])

    tp, fp, fn = compute_metrics(all_true_labels_flat, all_predictions_flat)

    # Per-class precision / recall / F1 (0 when the class has no relevant counts)
    class_metrics = {}
    for label in positive_labels:
        precision = ratio(tp[label], tp[label] + fp[label])
        recall = ratio(tp[label], tp[label] + fn[label])
        class_metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f_score(precision, recall),
            'tp': tp[label],
            'fp': fp[label],
            'fn': fn[label]
        }

    # Macro: plain average of the per-class scores
    n_classes = len(positive_labels)
    macro_precision = sum(m['precision'] for m in class_metrics.values()) / n_classes
    macro_recall = sum(m['recall'] for m in class_metrics.values()) / n_classes
    macro_f1 = sum(m['f1'] for m in class_metrics.values()) / n_classes

    # Micro: scores of the summed counts
    total_tp = sum(tp[label] for label in positive_labels)
    total_fp = sum(fp[label] for label in positive_labels)
    total_fn = sum(fn[label] for label in positive_labels)

    micro_precision = ratio(total_tp, total_tp + total_fp)
    micro_recall = ratio(total_tp, total_tp + total_fn)
    micro_f1 = f_score(micro_precision, micro_recall)

    return {
        'method': f"{sampling_method}_absa-{absa_method}",
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'micro_precision': micro_precision,
        'macro_precision': macro_precision,
        'micro_recall': micro_recall,
        'macro_recall': macro_recall
    }

In [9]:
# Evaluate each aspect model on its own gold labels
def aspect_wise_evaluate(sampling_method, absa_method, eval_dataset):
    """Score every aspect model separately on ``eval_dataset``.

    Args:
        sampling_method: Sampling strategy name used in the checkpoint identifiers.
        absa_method: ABSA aggregation name; used in the RLCC checkpoint identifier and
            passed to ``OverallMode``.
        eval_dataset: Iterable of samples with ``reviewText_1``, ``reviewText_2`` and
            one gold label per aspect name.

    Returns:
        Dict mapping aspect name to its micro/macro precision, recall, F1 and the
        per-class metrics (including raw tp/fp/fn counts).
    """
    aspects = ['appearance', 'aroma', 'palate', 'taste']

    # RLCC checkpoints spell this one sampling method with a hyphen.
    rlcc_sampling_method = "class-weight" if sampling_method == "class_weight" else sampling_method

    def ratio(numerator, denominator):
        # An empty denominator yields 0 rather than a division error.
        return numerator / denominator if denominator > 0 else 0

    def f_score(precision, recall):
        return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    aspect_results = {}
    for aspect in aspects:
        print(f"Evaluating {aspect} model for aspect-wise metrics...")
        model = OverallMode(
            aspect,
            f"trungpq/slac-new-{aspect}-{sampling_method}",
            f"trungpq/rlcc-new-{aspect}-{rlcc_sampling_method}-absa-{absa_method}",
            absa_method,
            device
        )

        predictions = []
        true_labels = []
        for sample in tqdm(eval_dataset):
            predictions.append(model(sample['reviewText_1'], sample['reviewText_2']))
            true_labels.append(sample[aspect])

        tp, fp, fn = compute_metrics(true_labels, predictions)

        # Per-class precision / recall / F1 (0 when the class has no relevant counts)
        class_metrics = {}
        for label in positive_labels:
            precision = ratio(tp[label], tp[label] + fp[label])
            recall = ratio(tp[label], tp[label] + fn[label])
            class_metrics[label] = {
                'precision': precision,
                'recall': recall,
                'f1': f_score(precision, recall),
                'tp': tp[label],
                'fp': fp[label],
                'fn': fn[label]
            }

        # Macro: plain average of the per-class scores
        n_classes = len(positive_labels)
        macro_precision = sum(m['precision'] for m in class_metrics.values()) / n_classes
        macro_recall = sum(m['recall'] for m in class_metrics.values()) / n_classes
        macro_f1 = sum(m['f1'] for m in class_metrics.values()) / n_classes

        # Micro: scores of the summed counts
        total_tp = sum(tp[label] for label in positive_labels)
        total_fp = sum(fp[label] for label in positive_labels)
        total_fn = sum(fn[label] for label in positive_labels)

        micro_precision = ratio(total_tp, total_tp + total_fp)
        micro_recall = ratio(total_tp, total_tp + total_fn)
        micro_f1 = f_score(micro_precision, micro_recall)

        aspect_results[aspect] = {
            'micro_precision': micro_precision,
            'micro_recall': micro_recall,
            'micro_f1': micro_f1,
            'macro_precision': macro_precision,
            'macro_recall': macro_recall,
            'macro_f1': macro_f1,
            'class_metrics': class_metrics
        }

        print(f"{aspect} - Micro P/R/F1: {micro_precision:.4f}/{micro_recall:.4f}/{micro_f1:.4f}")
        print(f"{aspect} - Macro P/R/F1: {macro_precision:.4f}/{macro_recall:.4f}/{macro_f1:.4f}")

        del model

    return aspect_results

In [10]:
import datasets

# Test split of the paired beer-review dataset used for every evaluation below.
eval_dataset = datasets.load_dataset("lengocquangLAB/beer-com-reviews", split="test")

# Grid over sampling strategy x ABSA aggregation; each combination is scored with the
# predictions of all four aspect models pooled together.
# NOTE: 'None' is the string, not Python's None -- overall_evaluate_all_aspects
# interpolates it into the RLCC checkpoint name ("...-absa-None").
for sampling_method in ['upsample_replacement', "class_weight"]:
    for absa_method in ['None', 'min', 'max', 'avg']: 
        print("-"*50)
        print(f"Evaluating with sampling method: {sampling_method}, ABSA method: {absa_method}")
        results = overall_evaluate_all_aspects(sampling_method, absa_method, eval_dataset)
        print(results)
        print()

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/977k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/133k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3320 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/420 [00:00<?, ? examples/s]

--------------------------------------------------
Evaluating with sampling method: upsample_replacement, ABSA method: None


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

100%|██████████| 410/410 [00:33<00:00, 12.30it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 12.82it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:30<00:00, 13.51it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:34<00:00, 11.95it/s]


{'method': 'upsample_replacement_absa-None', 'micro_f1': 0.571150097465887, 'macro_f1': 0.5753464645672945, 'micro_precision': 0.591919191919192, 'macro_precision': 0.6079934736930371, 'micro_recall': 0.551789077212806, 'macro_recall': 0.5493967089019631}

--------------------------------------------------
Evaluating with sampling method: upsample_replacement, ABSA method: min


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:34<00:00, 11.80it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:33<00:00, 12.12it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 12.87it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:37<00:00, 10.97it/s]


{'method': 'upsample_replacement_absa-min', 'micro_f1': 0.5838206627680312, 'macro_f1': 0.5855834209573026, 'micro_precision': 0.6050505050505051, 'macro_precision': 0.6192353361713692, 'micro_recall': 0.564030131826742, 'macro_recall': 0.5611361760389567}

--------------------------------------------------
Evaluating with sampling method: upsample_replacement, ABSA method: max


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:34<00:00, 11.82it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:33<00:00, 12.19it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:32<00:00, 12.77it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:37<00:00, 10.90it/s]


{'method': 'upsample_replacement_absa-max', 'micro_f1': 0.5526315789473685, 'macro_f1': 0.5573314236991226, 'micro_precision': 0.5727272727272728, 'macro_precision': 0.5790152379783432, 'micro_recall': 0.5338983050847458, 'macro_recall': 0.5378131024364339}

--------------------------------------------------
Evaluating with sampling method: upsample_replacement, ABSA method: avg


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:34<00:00, 11.72it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:33<00:00, 12.21it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 13.02it/s]


config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:37<00:00, 10.91it/s]


{'method': 'upsample_replacement_absa-avg', 'micro_f1': 0.5760233918128655, 'macro_f1': 0.580269559728597, 'micro_precision': 0.5969696969696969, 'macro_precision': 0.6061523792267659, 'micro_recall': 0.556497175141243, 'macro_recall': 0.5576734620540682}

--------------------------------------------------
Evaluating with sampling method: class_weight, ABSA method: None


config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:32<00:00, 12.72it/s]


config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 13.07it/s]


config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:30<00:00, 13.45it/s]


config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:33<00:00, 12.06it/s]


{'method': 'class_weight_absa-None', 'micro_f1': 0.5494830132939439, 'macro_f1': 0.5535093426700325, 'micro_precision': 0.5758513931888545, 'macro_precision': 0.5888399220672541, 'micro_recall': 0.5254237288135594, 'macro_recall': 0.5262417166508143}

--------------------------------------------------
Evaluating with sampling method: class_weight, ABSA method: min


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:35<00:00, 11.69it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:33<00:00, 12.28it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 12.96it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:36<00:00, 11.26it/s]


{'method': 'class_weight_absa-min', 'micro_f1': 0.5386509108813392, 'macro_f1': 0.5439472320757628, 'micro_precision': 0.564499484004128, 'macro_precision': 0.5684702021764918, 'micro_recall': 0.5150659133709982, 'macro_recall': 0.5245138691869594}

--------------------------------------------------
Evaluating with sampling method: class_weight, ABSA method: max


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:34<00:00, 11.92it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:33<00:00, 12.39it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 13.11it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:36<00:00, 11.22it/s]


{'method': 'class_weight_absa-max', 'micro_f1': 0.5603151157065485, 'macro_f1': 0.5651977080558905, 'micro_precision': 0.587203302373581, 'macro_precision': 0.5915269411743166, 'micro_recall': 0.5357815442561206, 'macro_recall': 0.5422757187185533}

--------------------------------------------------
Evaluating with sampling method: class_weight, ABSA method: avg


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:34<00:00, 11.93it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:33<00:00, 12.34it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 13.03it/s]


config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:36<00:00, 11.20it/s]


{'method': 'class_weight_absa-avg', 'micro_f1': 0.5534219596258001, 'macro_f1': 0.5588099352775623, 'micro_precision': 0.5799793601651186, 'macro_precision': 0.585762936121359, 'micro_recall': 0.5291902071563088, 'macro_recall': 0.5379517601507944}



In [11]:
# Aspect-wise evaluation: call aspect_wise_evaluate for every
# (sampling method, ABSA method) pair and print the F1 summary per aspect.
banner = "=" * 70
print(banner)
print("ASPECT-WISE EVALUATION")
print(banner)

# Sampling method is the outer axis, ABSA method the inner one, so the
# runs happen in the same order as a nested loop would produce.
# Note that 'None' is passed as a string, not as the None object.
run_grid = [
    (sampling, absa)
    for sampling in ("upsample_replacement", "class_weight")
    for absa in ("None", "min", "max", "avg")
]

for sampling_method, absa_method in run_grid:
    print("-" * 50)
    print(f"Aspect-wise evaluation with sampling method: {sampling_method}, ABSA method: {absa_method}")
    aspect_results = aspect_wise_evaluate(sampling_method, absa_method, eval_dataset)

    # aspect_results is iterated as a mapping of aspect name -> metrics;
    # only the micro/macro F1 entries are summarised here.
    for aspect, metrics in aspect_results.items():
        print(f"\n{aspect.upper()} Results:")
        print(f"  Micro F1: {metrics['micro_f1']:.4f}")
        print(f"  Macro F1: {metrics['macro_f1']:.4f}")
    # Blank line between one configuration's summary and the next.
    print()

ASPECT-WISE EVALUATION
--------------------------------------------------
Aspect-wise evaluation with sampling method: upsample_replacement, ABSA method: None
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.79it/s]


appearance - Micro P/R/F1: 0.5977/0.5911/0.5944
appearance - Macro P/R/F1: 0.6359/0.5833/0.5991
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:31<00:00, 13.08it/s]


aroma - Micro P/R/F1: 0.6419/0.5823/0.6106
aroma - Macro P/R/F1: 0.6479/0.5840/0.6122
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:29<00:00, 13.74it/s]


palate - Micro P/R/F1: 0.5139/0.4088/0.4554
palate - Macro P/R/F1: 0.5535/0.3918/0.4433
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:33<00:00, 12.37it/s]


taste - Micro P/R/F1: 0.5890/0.5733/0.5811
taste - Macro P/R/F1: 0.5926/0.5805/0.5864

APPEARANCE Results:
  Micro F1: 0.5944
  Macro F1: 0.5991

AROMA Results:
  Micro F1: 0.6106
  Macro F1: 0.6122

PALATE Results:
  Micro F1: 0.4554
  Macro F1: 0.4433

TASTE Results:
  Micro F1: 0.5811
  Macro F1: 0.5864

--------------------------------------------------
Aspect-wise evaluation with sampling method: upsample_replacement, ABSA method: min
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:33<00:00, 12.09it/s]


appearance - Micro P/R/F1: 0.6128/0.6059/0.6093
appearance - Macro P/R/F1: 0.6358/0.5998/0.6118
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.49it/s]


aroma - Micro P/R/F1: 0.6093/0.5527/0.5796
aroma - Macro P/R/F1: 0.6143/0.5357/0.5672
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.23it/s]


palate - Micro P/R/F1: 0.5625/0.4475/0.4985
palate - Macro P/R/F1: 0.6456/0.4279/0.4823
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:36<00:00, 11.14it/s]


taste - Micro P/R/F1: 0.6137/0.5973/0.6054
taste - Macro P/R/F1: 0.6181/0.6071/0.6098

APPEARANCE Results:
  Micro F1: 0.6093
  Macro F1: 0.6118

AROMA Results:
  Micro F1: 0.5796
  Macro F1: 0.5672

PALATE Results:
  Micro F1: 0.4985
  Macro F1: 0.4823

TASTE Results:
  Micro F1: 0.6054
  Macro F1: 0.6098

--------------------------------------------------
Aspect-wise evaluation with sampling method: upsample_replacement, ABSA method: max
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:33<00:00, 12.12it/s]


appearance - Micro P/R/F1: 0.5752/0.5688/0.5720
appearance - Macro P/R/F1: 0.6114/0.5643/0.5786
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.49it/s]


aroma - Micro P/R/F1: 0.6047/0.5485/0.5752
aroma - Macro P/R/F1: 0.6145/0.5459/0.5706
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.25it/s]


palate - Micro P/R/F1: 0.5625/0.4475/0.4985
palate - Macro P/R/F1: 0.6035/0.4318/0.4859
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:36<00:00, 11.34it/s]


taste - Micro P/R/F1: 0.5562/0.5413/0.5486
taste - Macro P/R/F1: 0.5542/0.5586/0.5505

APPEARANCE Results:
  Micro F1: 0.5720
  Macro F1: 0.5786

AROMA Results:
  Micro F1: 0.5752
  Macro F1: 0.5706

PALATE Results:
  Micro F1: 0.4985
  Macro F1: 0.4859

TASTE Results:
  Micro F1: 0.5486
  Macro F1: 0.5505

--------------------------------------------------
Aspect-wise evaluation with sampling method: upsample_replacement, ABSA method: avg
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:33<00:00, 12.21it/s]


appearance - Micro P/R/F1: 0.6090/0.6022/0.6056
appearance - Macro P/R/F1: 0.6421/0.5968/0.6119
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.66it/s]


aroma - Micro P/R/F1: 0.6047/0.5485/0.5752
aroma - Macro P/R/F1: 0.6172/0.5477/0.5783
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.30it/s]


palate - Micro P/R/F1: 0.5764/0.4586/0.5108
palate - Macro P/R/F1: 0.6123/0.4402/0.4927
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:35<00:00, 11.40it/s]


taste - Micro P/R/F1: 0.5918/0.5760/0.5838
taste - Macro P/R/F1: 0.5913/0.5892/0.5869

APPEARANCE Results:
  Micro F1: 0.6056
  Macro F1: 0.6119

AROMA Results:
  Micro F1: 0.5752
  Macro F1: 0.5783

PALATE Results:
  Micro F1: 0.5108
  Macro F1: 0.4927

TASTE Results:
  Micro F1: 0.5838
  Macro F1: 0.5869

--------------------------------------------------
Aspect-wise evaluation with sampling method: class_weight, ABSA method: None
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:31<00:00, 13.10it/s]


appearance - Micro P/R/F1: 0.5871/0.5762/0.5816
appearance - Macro P/R/F1: 0.6166/0.5745/0.5899
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.30it/s]


aroma - Micro P/R/F1: 0.6402/0.5781/0.6075
aroma - Macro P/R/F1: 0.6414/0.5800/0.6081
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:29<00:00, 13.92it/s]


palate - Micro P/R/F1: 0.5435/0.4144/0.4702
palate - Macro P/R/F1: 0.5636/0.3937/0.4486
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.67it/s]


taste - Micro P/R/F1: 0.5411/0.5093/0.5247
taste - Macro P/R/F1: 0.5521/0.5194/0.5280

APPEARANCE Results:
  Micro F1: 0.5816
  Macro F1: 0.5899

AROMA Results:
  Micro F1: 0.6075
  Macro F1: 0.6081

PALATE Results:
  Micro F1: 0.4702
  Macro F1: 0.4486

TASTE Results:
  Micro F1: 0.5247
  Macro F1: 0.5280

--------------------------------------------------
Aspect-wise evaluation with sampling method: class_weight, ABSA method: min
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:33<00:00, 12.42it/s]


appearance - Micro P/R/F1: 0.5795/0.5688/0.5741
appearance - Macro P/R/F1: 0.6146/0.5716/0.5837
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.81it/s]


aroma - Micro P/R/F1: 0.6168/0.5570/0.5854
aroma - Macro P/R/F1: 0.6145/0.5649/0.5863
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.38it/s]


palate - Micro P/R/F1: 0.5072/0.3867/0.4389
palate - Macro P/R/F1: 0.5334/0.3782/0.4354
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:35<00:00, 11.50it/s]


taste - Micro P/R/F1: 0.5439/0.5120/0.5275
taste - Macro P/R/F1: 0.5485/0.5309/0.5265

APPEARANCE Results:
  Micro F1: 0.5741
  Macro F1: 0.5837

AROMA Results:
  Micro F1: 0.5854
  Macro F1: 0.5863

PALATE Results:
  Micro F1: 0.4389
  Macro F1: 0.4354

TASTE Results:
  Micro F1: 0.5275
  Macro F1: 0.5265

--------------------------------------------------
Aspect-wise evaluation with sampling method: class_weight, ABSA method: max
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:33<00:00, 12.32it/s]


appearance - Micro P/R/F1: 0.5947/0.5836/0.5891
appearance - Macro P/R/F1: 0.6065/0.5963/0.5964
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.75it/s]


aroma - Micro P/R/F1: 0.6262/0.5654/0.5942
aroma - Macro P/R/F1: 0.6263/0.5640/0.5935
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.38it/s]


palate - Micro P/R/F1: 0.5072/0.3867/0.4389
palate - Macro P/R/F1: 0.5195/0.3796/0.4320
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:35<00:00, 11.70it/s]


taste - Micro P/R/F1: 0.5892/0.5547/0.5714
taste - Macro P/R/F1: 0.5917/0.5625/0.5744

APPEARANCE Results:
  Micro F1: 0.5891
  Macro F1: 0.5964

AROMA Results:
  Micro F1: 0.5942
  Macro F1: 0.5935

PALATE Results:
  Micro F1: 0.4389
  Macro F1: 0.4320

TASTE Results:
  Micro F1: 0.5714
  Macro F1: 0.5744

--------------------------------------------------
Aspect-wise evaluation with sampling method: class_weight, ABSA method: avg
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:33<00:00, 12.39it/s]


appearance - Micro P/R/F1: 0.6061/0.5948/0.6004
appearance - Macro P/R/F1: 0.6337/0.5943/0.6073
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.78it/s]


aroma - Micro P/R/F1: 0.6168/0.5570/0.5854
aroma - Macro P/R/F1: 0.6173/0.5833/0.5893
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.48it/s]


palate - Micro P/R/F1: 0.5145/0.3923/0.4451
palate - Macro P/R/F1: 0.5310/0.3880/0.4441
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:35<00:00, 11.57it/s]

taste - Micro P/R/F1: 0.5637/0.5307/0.5467
taste - Macro P/R/F1: 0.5739/0.5434/0.5495

APPEARANCE Results:
  Micro F1: 0.6004
  Macro F1: 0.6073

AROMA Results:
  Micro F1: 0.5854
  Macro F1: 0.5893

PALATE Results:
  Micro F1: 0.4451
  Macro F1: 0.4441

TASTE Results:
  Micro F1: 0.5467
  Macro F1: 0.5495




