In [1]:
# Report the GPU visible to this session.
!nvidia-smi
# Use the %pip magic (not !pip) so the commands act on the environment of the
# running kernel rather than whichever `pip` the subshell finds first.
%pip show scikit-learn
# Pin scikit-learn; presumably this matches the version the joblib artifacts
# loaded by XGBoostModel were saved with -- TODO confirm.
%pip install -q scikit-learn==1.5.1

Thu Nov  6 15:51:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             26W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [2]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PretrainedConfig, PreTrainedModel, BertTokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import json

import warnings

# Ignore UserWarnings raised from sklearn modules (this covers the old InconsistentVersionWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

import joblib

nltk.download('punkt')
nltk.download('punkt_tab')

device = "cuda" if torch.cuda.is_available() else "cpu"

2025-11-06 15:51:53.721189: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762444313.912635      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762444313.974345      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
class SLACBERTModelConfig(PretrainedConfig):
    """Configuration object for `SLACBERTModel`.

    Holds the two settings `SLACBERTModel.__init__` reads: the output size of
    its linear head and the optional positive-class weight of its loss.
    """
    model_type = "bert_model"

    def __init__(self, num_classes=1, pos_weight=None, **kwargs):
        """Store the head size and loss weight; forward the rest to the base config.

        Args:
            num_classes: Output dimension of the classifier head (default 1).
            pos_weight: Value handed to `torch.tensor(...)` and then to
                `nn.BCEWithLogitsLoss(pos_weight=...)` by the model, or None
                for an unweighted loss.
            **kwargs: Passed unchanged to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.pos_weight = pos_weight 

class SLACBERTModel(PreTrainedModel):
    """BERT encoder followed by dropout and a linear head, trained with BCE-with-logits.

    The head has `config.num_classes` outputs. When `config.pos_weight` is set
    it is converted to a float32 tensor and used as the loss `pos_weight`.
    """
    config_class = SLACBERTModelConfig

    def __init__(self, config):
        """Build the encoder, the head and the loss from `config`."""
        super().__init__(config)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, config.num_classes)

        # Only pass pos_weight when the config provides one; otherwise the
        # loss is constructed with its defaults.
        loss_kwargs = {}
        if config.pos_weight is not None:
            loss_kwargs["pos_weight"] = torch.tensor(config.pos_weight, dtype=torch.float32)
        self.criterion = nn.BCEWithLogitsLoss(**loss_kwargs)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return logits (and the loss when `labels` is given).

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
            labels: Optional targets; when provided the BCE-with-logits loss
                between the logits and `labels` is computed.

        Returns:
            SequenceClassifierOutput with `loss` (None without labels) and `logits`.
        """
        _, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        logits = self.fc(self.dropout(pooled))
        loss = self.criterion(logits, labels) if labels is not None else None
        return SequenceClassifierOutput(loss=loss, logits=logits)

In [4]:
class RLCCBERTConfig(PretrainedConfig):
    """Configuration object for `RLCCBERTModel`.

    Carries the switch for the ABSA branch, the number of output classes and
    the optional per-class loss weights read by `RLCCBERTModel.__init__`.
    """
    model_type = "bert_with_absa"

    def __init__(self, absa_method=None, num_classes=2, class_weight=None, **kwargs):
        """Store the model settings; forward the rest to the base config.

        Args:
            absa_method: When truthy, the model builds its extra ABSA branch
                (`absa_fc` and `bert_absa`).
            num_classes: Output dimension of each classifier head (default 2).
            class_weight: Per-class weights converted to a float32 tensor and
                passed to `nn.CrossEntropyLoss(weight=...)`, or None.
            **kwargs: Passed unchanged to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)
        self.absa_method = absa_method
        self.num_classes = num_classes
        self.class_weight = class_weight

class RLCCInnerBert(nn.Module):
    """BERT encoder fed with pre-computed embeddings, plus dropout and a linear head."""

    def __init__(self, num_classes):
        """Create the encoder and a `num_classes`-way head on its pooled output."""
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(768, num_classes)

    def forward(self, inputs_embeds=None, attention_mask=None, token_type_ids=None):
        """Encode `inputs_embeds` and return the classification logits.

        Args:
            inputs_embeds: Embedding tensor passed to the encoder in place of token ids.
            attention_mask: Optional attention mask for the encoder.
            token_type_ids: Optional segment ids for the encoder.

        Returns:
            The output of the linear head applied to the (dropped-out) pooled output.
        """
        encoder_outputs = self.bert(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        _, pooled = encoder_outputs
        return self.fc(self.dropout(pooled))

class RLCCBERTModel(PreTrainedModel):
    """Review-pair classifier that sums a text branch and an optional ABSA branch.

    `bert_sent` classifies the tokenized review pair. When
    `config.absa_method` is truthy, two scalar ABSA scores are projected to
    768-d vectors by `absa_fc`, fed as a two-position sequence to `bert_absa`,
    and the resulting logits are added to the text-branch logits.
    """
    config_class = RLCCBERTConfig

    def __init__(self, config):
        """Build both branches and the (optionally class-weighted) loss from `config`."""
        super().__init__(config)
        self.num_classes = config.num_classes
        self.absa_method = config.absa_method

        self.bert_sent = RLCCInnerBert(self.num_classes)

        if self.absa_method:
            self.absa_fc = nn.Linear(1, 768)
            self.bert_absa = RLCCInnerBert(self.num_classes)

        if config.class_weight is not None:
            class_weight = torch.tensor(config.class_weight, dtype=torch.float32)
            # Do not pin the weight to a notebook-global device: the loss
            # module keeps `weight` as a buffer, so it moves together with the
            # model when the caller does `.to(device)`.
            self.criterion = nn.CrossEntropyLoss(weight=class_weight)
        else:
            self.criterion = nn.CrossEntropyLoss()

        self.init_weights()

    def forward(self, input_ids=None, absa_1=None, absa_2=None, attention_mask=None, token_type_ids=None, labels=None):
        """Return class logits for a review pair (and the loss when `labels` is given).

        Args:
            input_ids: Token ids of the packed review pair.
            absa_1: ABSA score of the first review; required when the ABSA
                branch is enabled. Assumed shape (batch, 1, 1) -- TODO confirm
                against the training code.
            absa_2: ABSA score of the second review, same layout as `absa_1`.
            attention_mask: Attention mask for `input_ids`.
            token_type_ids: Segment ids for `input_ids`.
            labels: Optional class indices for the cross-entropy loss.

        Returns:
            SequenceClassifierOutput with `loss` (None without labels) and `logits`.

        Raises:
            ValueError: If the ABSA branch is enabled but a score is missing.
        """
        # NOTE(review): the embedding layer is applied here and its output is
        # then handed to the encoder through `inputs_embeds`; confirm this is
        # how the published checkpoints were trained before changing it.
        inputs_embeds = self.bert_sent.bert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
        logits = self.bert_sent(inputs_embeds, attention_mask, token_type_ids)

        if self.absa_method:
            if absa_1 is None or absa_2 is None:
                raise ValueError("absa_1 and absa_2 are required when absa_method is set")

            absa_concat = torch.cat((self.absa_fc(absa_1), self.absa_fc(absa_2)), dim=1)
            # Segment ids 0/1 mark the first/second review's score. Created on
            # the same device as the features instead of a global device.
            token_type_ids_absa = (
                torch.tensor([0, 1], device=absa_concat.device)
                .unsqueeze(0)
                .repeat(absa_concat.shape[0], 1)
            )
            logits_absa = self.bert_absa(absa_concat, None, token_type_ids_absa)

            # Out-of-place add: avoids mutating the text branch's output tensor.
            logits = logits + logits_absa

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [5]:
class RLCCBERTConfigNoSum(PretrainedConfig):
    """Configuration object for `RLCCBERTModelNoSum`."""
    model_type = "bert_with_absa_no_sum_dis"

    def __init__(self, absa_method=None, num_classes=2, class_weight=None, **kwargs):
        """Store the model settings; forward the rest to the base config.

        Args:
            absa_method: When truthy, the model adds the `absa_fc` projection
                and widens its classifier input to include both ABSA vectors.
            num_classes: Output dimension of the classifier (default 2).
            class_weight: Stored on the config; `RLCCBERTModelNoSum` as defined
                in this notebook builds an unweighted loss and does not read it.
            **kwargs: Passed unchanged to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)
        self.absa_method = absa_method
        self.num_classes = num_classes
        self.class_weight = class_weight

class RLCCBERTModelNoSum(PreTrainedModel):
    """Review-pair classifier that concatenates ABSA features instead of summing logits.

    A single BERT encoder produces the pooled pair representation. When
    `config.absa_method` is truthy, each scalar ABSA score is projected to
    768-d by `absa_fc` and both vectors are concatenated to the pooled output
    before the linear classifier.
    """
    config_class = RLCCBERTConfigNoSum

    def __init__(self, config):
        """Build the encoder, the optional score projection, the classifier and the loss."""
        super().__init__(config)
        self.num_classes = config.num_classes
        self.absa_method = config.absa_method

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)

        hidden_size = 768
        if self.absa_method:
            self.absa_fc = nn.Linear(1, hidden_size)
            # pooled output + one projected vector per review
            classifier_in = hidden_size * 3
        else:
            classifier_in = hidden_size
        self.classifier = nn.Linear(classifier_in, self.num_classes)

        self.criterion = nn.CrossEntropyLoss()

        self.init_weights()

    def forward(self, input_ids=None, absa_1=None, absa_2=None, attention_mask=None, token_type_ids=None, labels=None):
        """Return class logits for a review pair (and the loss when `labels` is given).

        Args:
            input_ids: Token ids of the packed review pair.
            absa_1: ABSA score of the first review (used only with the ABSA branch).
            absa_2: ABSA score of the second review (used only with the ABSA branch).
            attention_mask: Attention mask for `input_ids`.
            token_type_ids: Segment ids for `input_ids`.
            labels: Optional class indices for the cross-entropy loss.

        Returns:
            SequenceClassifierOutput with `loss` (None without labels) and `logits`.
        """
        embedded = self.bert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
        _, pooled = self.bert(
            inputs_embeds=embedded,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        features = self.dropout(pooled)

        if self.absa_method:
            # Project each score and drop dimension 1 so it lines up with the pooled output.
            score_1 = self.absa_fc(absa_1).squeeze(1)
            score_2 = self.absa_fc(absa_2).squeeze(1)
            features = torch.cat((features, score_1, score_2), dim=1)

        logits = self.classifier(features)
        loss = None if labels is None else self.criterion(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [6]:
class RLCCBERTConfigNoSe(PretrainedConfig):
    """Configuration object for `RLCCBERTModelNoSe`."""
    model_type = "bert_with_absa_no_se_cl"

    def __init__(self, absa_method=None, num_classes=2, class_weight=None, **kwargs):
        """Store the model settings; forward the rest to the base config.

        Args:
            absa_method: Read by `RLCCBERTModelNoSe.__init__` to decide whether
                the `absa_fc` projection layer is created.
            num_classes: Output dimension of the classifier (default 2).
            class_weight: Stored on the config; `RLCCBERTModelNoSe` as defined
                in this notebook builds an unweighted loss and does not read it.
            **kwargs: Passed unchanged to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)
        self.absa_method = absa_method
        self.num_classes = num_classes
        self.class_weight = class_weight
        
class RLCCBERTModelNoSe(PreTrainedModel):
    """Review-pair classifier that sees only the two ABSA scores, not the review text.

    Each scalar score is projected to a 768-d vector; the two vectors form a
    two-position sequence that is encoded by BERT and classified from the
    pooled output.
    """
    config_class = RLCCBERTConfigNoSe

    def __init__(self, config):
        """Build the encoder, the score projection, the classifier and the loss."""
        super().__init__(config)
        self.num_classes = config.num_classes
        self.absa_method = config.absa_method

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)

        # `forward` always goes through `absa_fc`, so the layer is created
        # unconditionally; guarding it on `absa_method` left the model
        # constructible but unusable (AttributeError) when that was falsy.
        self.absa_fc = nn.Linear(1, 768)

        # Only 768 inputs: the classifier sees just BERT's 768-dim pooled output.
        self.classifier = nn.Linear(768, self.num_classes)
        self.criterion = nn.CrossEntropyLoss()
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        absa_1=None,
        absa_2=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
    ):
        """Return class logits computed from the two ABSA scores.

        Args:
            input_ids: Accepted for signature compatibility with the other
                RLCC models; not used.
            absa_1: ABSA score of the first review. Assumed shape
                (batch, 1, 1) -- TODO confirm against the training code.
            absa_2: ABSA score of the second review, same layout as `absa_1`.
            attention_mask: Ignored; an all-ones mask over the two score
                positions is built internally.
            token_type_ids: Ignored; segment ids [0, 1] are built internally.
            labels: Optional class indices for the cross-entropy loss.

        Returns:
            SequenceClassifierOutput with `loss` (None without labels) and `logits`.

        Raises:
            ValueError: If either ABSA score is missing.
        """
        if absa_1 is None or absa_2 is None:
            raise ValueError("RLCCBERTModelNoSe requires both absa_1 and absa_2")

        # 1. Convert ABSA values -> embeddings
        absa_1 = self.absa_fc(absa_1).squeeze(1)  # [B, 768]
        absa_2 = self.absa_fc(absa_2).squeeze(1)  # [B, 768]

        # 2. Stack to make a sequence of 2 "tokens"
        absa_seq = torch.stack([absa_1, absa_2], dim=1)  # [B, 2, 768]

        # 3. Generate masks directly on the features' device
        batch_size = absa_seq.size(0)
        attention_mask = torch.ones((batch_size, 2), dtype=torch.long, device=absa_seq.device)
        token_type_ids = torch.tensor([0, 1], device=absa_seq.device).unsqueeze(0).repeat(batch_size, 1)

        # 4. Forward through BERT
        _, pooled_output = self.bert(
            inputs_embeds=absa_seq,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )

        # 5. Classify
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # 6. Loss
        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [7]:
class XGBoostModel:
    """Per-sentence sentiment scorer: adjective lookup first, XGBoost pipeline as fallback."""

    def __init__(self, aspect):
        """Load the pipeline artifacts for `aspect` and the adjective dictionary.

        Args:
            aspect: Aspect name interpolated into the artifact file name.
        """
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only point this at trusted artifacts.
        pipeline = joblib.load(
            f"/kaggle/input/absa/pytorch/default/4/checkpoints/{aspect}_xgb_pipeline.joblib"
        )
        self.model, self.vectorizer, self.imputer = (
            pipeline["model"],
            pipeline["vectorizer"],
            pipeline["imputer"],
        )

        # Rule-based dictionary: keys are parsed with int() and used as scores.
        with open("/kaggle/input/absa/pytorch/default/4/adjective.json", "r") as handle:
            self.adj_list = json.load(handle)

    def predict(self, sentences):
        """Score every sentence and return the scores in input order.

        A sentence whose whitespace-separated words match entries of the
        adjective dictionary gets the mean of the matched keys; any other
        sentence is scored by vectorizer -> imputer -> model.

        Args:
            sentences: Iterable of sentence strings.

        Returns:
            List with one score per sentence.
        """
        scores = []
        for sentence in sentences:
            # Rule-based pass: one hit per (word, key) pair whose entry contains the word.
            rule_hits = [
                int(score_key)
                for token in sentence.split()
                for score_key, entries in self.adj_list.items()
                if token in entries
            ]

            if rule_hits:
                scores.append(sum(rule_hits) / len(rule_hits))
                continue

            # No rule matched -> fall back to the learned pipeline.
            features = self.vectorizer.transform([sentence])
            features = self.imputer.transform(features.toarray())
            scores.append(self.model.predict(features)[0])

        return scores


In [8]:
class OverallMode(nn.Module):
    """End-to-end comparer for one aspect of a pair of reviews.

    Pipeline (see `forward`): split both reviews into cleaned sentences, keep
    the sentences the SLAC classifier assigns to the aspect, optionally score
    them with `XGBoostModel`, then classify the pair with the RLCC model.
    """

    # Fixed padded sequence length fed to both BERT models.
    MAX_SEQ_LEN = 128
    # Label returned when no comparison can be made for the aspect.
    NO_COMPARISON = 2
    # How the per-sentence ABSA scores of one review are reduced to one value.
    _ABSA_AGGREGATORS = {
        'min': min,
        'max': max,
        'avg': lambda scores: sum(scores) / len(scores),
    }

    def __init__(self, aspect, sampling_method, absa_method, ablation, device):
        """Load the SLAC/RLCC checkpoints (and the ABSA scorer if enabled).

        Args:
            aspect: One of 'appearance', 'aroma', 'palate', 'taste'.
            sampling_method: Interpolated into the checkpoint ids.
            absa_method: 'min', 'max' or 'avg' to enable ABSA scoring; None or
                the string "none" disables it. The value is interpolated into
                the RLCC checkpoint id as-is.
            ablation: One of 'sum_distribution', 'imbalance_handling',
                'score_prediction', 'semantic_based'.
            device: Device the models and input tensors are placed on.

        Raises:
            KeyError: If `aspect` is not a known aspect.
            ValueError: If `ablation` is not a known ablation.
        """
        super().__init__()
        self.aspect = aspect
        self.device = device

        self.aspect_index = {'appearance': 0, 'aroma': 1, 'palate': 2, 'taste': 3}
        self.aspect_idx = self.aspect_index[aspect]

        # ablation -> (RLCC model class, does the SLAC checkpoint id carry the ablation suffix?)
        ablation_specs = {
            "sum_distribution": (RLCCBERTModelNoSum, False),
            "imbalance_handling": (RLCCBERTModel, True),
            "score_prediction": (RLCCBERTModel, False),
            "semantic_based": (RLCCBERTModelNoSe, False),
        }
        if ablation not in ablation_specs:
            # Previously an unknown ablation fell through every branch and
            # surfaced later as an AttributeError on `self.slac_model`.
            raise ValueError(
                f"Unknown ablation {ablation!r}; expected one of {sorted(ablation_specs)}"
            )
        rlcc_cls, slac_has_ablation_suffix = ablation_specs[ablation]

        slac_id = f"trungpq/slac-new-{aspect}-{sampling_method}"
        if slac_has_ablation_suffix:
            slac_id = f"{slac_id}-{ablation}"
        rlcc_id = f"trungpq/rlcc-new-{aspect}-{sampling_method}-absa-{absa_method}-{ablation}"

        self.slac_model = SLACBERTModel.from_pretrained(slac_id).to(device)
        self.rlcc_model = rlcc_cls.from_pretrained(rlcc_id).to(device)

        self.slac_model.eval()
        self.rlcc_model.eval()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.absa_method = absa_method
        # One shared notion of "ABSA disabled" for __init__ and forward. The
        # two used to disagree (`!= "none"` vs `!= None`), so None loaded an
        # unused XGBoost pipeline and "none" crashed on a missing attribute.
        self.absa_model = XGBoostModel(aspect) if self._uses_absa(absa_method) else None

    @staticmethod
    def _uses_absa(absa_method):
        """Return True unless `absa_method` is None or the string "none"."""
        return absa_method is not None and absa_method != "none"

    @staticmethod
    def _truncate_seq(tokens, max_length):
        """Pop tokens from the end, in place, until at most `max_length` remain; return the list."""
        while len(tokens) > max_length:
            tokens.pop()
        return tokens

    @staticmethod
    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """Trim two token lists in place until their combined length fits.

        One token at a time is popped from the longer list (from `tokens_b`
        on ties), so the longer sequence absorbs most of the truncation.
        """
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    @staticmethod
    def _split_clean_sentences(text):
        """Lower-case `text`, split it into sentences and strip non-word characters.

        Sentences with a single token (per `word_tokenize`) are dropped.
        """
        sentences = sent_tokenize(text.lower())
        return [
            re.sub(r'\W+', ' ', s).strip()
            for s in sentences
            if len(word_tokenize(s)) > 1
        ]

    def _get_aspect_sentences(self, review_sentences):
        """Return the sentences the SLAC model assigns to this aspect (sigmoid > 0.5)."""
        if not review_sentences:
            return []

        batch_ids = []
        batch_mask = []
        for sent in review_sentences:
            tokens = self.tokenizer.tokenize(sent)
            self._truncate_seq(tokens, self.MAX_SEQ_LEN - 2)  # account for [CLS] [SEP]
            tokens = ["[CLS]"] + tokens + ["[SEP]"]

            ids = self.tokenizer.convert_tokens_to_ids(tokens)
            pad_len = self.MAX_SEQ_LEN - len(ids)
            batch_mask.append([1] * len(ids) + [0] * pad_len)
            batch_ids.append(ids + [0] * pad_len)

        # Build each batch tensor once, directly on the target device.
        input_ids = torch.tensor(batch_ids, dtype=torch.long, device=self.device)
        attention_masks = torch.tensor(batch_mask, dtype=torch.long, device=self.device)

        with torch.no_grad():
            aspect_logits = self.slac_model(input_ids, attention_masks).logits
            probs = torch.sigmoid(aspect_logits)

        return [sent for prob, sent in zip(probs, review_sentences) if prob > 0.5]

    def _get_review_comparative(self, review_1, review_2, absa_1, absa_2):
        """Run the RLCC model on the packed review pair and return class probabilities.

        Args:
            review_1: Text of the first review's aspect sentences.
            review_2: Text of the second review's aspect sentences.
            absa_1: Aggregated ABSA score of the first review, or None.
            absa_2: Aggregated ABSA score of the second review, or None.

        Returns:
            Softmax over the RLCC logits for the single pair.
        """
        tokens_a = self.tokenizer.tokenize(review_1)
        tokens_b = self.tokenizer.tokenize(review_2)
        self._truncate_seq_pair(tokens_a, tokens_b, self.MAX_SEQ_LEN - 3)  # [CLS] [SEP] [SEP]

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        padding = [0] * (self.MAX_SEQ_LEN - len(input_ids))

        input_ids = torch.tensor([input_ids + padding], dtype=torch.long, device=self.device)
        attention_mask = torch.tensor([input_mask + padding], dtype=torch.long, device=self.device)
        segment_ids = torch.tensor([segment_ids + padding], dtype=torch.long, device=self.device)

        if absa_1 is not None and absa_2 is not None:
            # Scalar score -> tensor of shape (1, 1, 1) for the model's Linear(1, 768).
            absa_1 = torch.tensor([absa_1], dtype=torch.float32).unsqueeze(1).unsqueeze(1).to(self.device)
            absa_2 = torch.tensor([absa_2], dtype=torch.float32).unsqueeze(1).unsqueeze(1).to(self.device)

        with torch.no_grad():
            logits = self.rlcc_model(input_ids, absa_1, absa_2, attention_mask, segment_ids).logits
            return torch.softmax(logits, dim=-1)

    def forward(self, review_1, review_2):
        """Compare two reviews on this aspect.

        Args:
            review_1: Raw text of the first review.
            review_2: Raw text of the second review.

        Returns:
            int: The RLCC argmax shifted by -1, or 2 (`NO_COMPARISON`) when
            either review has no usable sentence for the aspect.
        """
        review_1_sent = self._split_clean_sentences(review_1)
        review_2_sent = self._split_clean_sentences(review_2)
        if not review_1_sent or not review_2_sent:
            return self.NO_COMPARISON

        review_1_aspect_sentences = self._get_aspect_sentences(review_1_sent)
        review_2_aspect_sentences = self._get_aspect_sentences(review_2_sent)
        if not review_1_aspect_sentences or not review_2_aspect_sentences:
            return self.NO_COMPARISON

        absa_1 = None
        absa_2 = None
        if self.absa_model is not None:
            absa_1_list = self.absa_model.predict(review_1_aspect_sentences)
            absa_2_list = self.absa_model.predict(review_2_aspect_sentences)

            # An unrecognised method leaves both scores as None, as before.
            aggregate = self._ABSA_AGGREGATORS.get(self.absa_method)
            if aggregate is not None:
                absa_1 = aggregate(absa_1_list)
                absa_2 = aggregate(absa_2_list)

        review_1_tuple = ", ".join(review_1_aspect_sentences)
        review_2_tuple = ", ".join(review_2_aspect_sentences)

        result = self._get_review_comparative(review_1_tuple, review_2_tuple, absa_1, absa_2)
        # Shift the class index by -1 so the returned value lines up with the
        # notebook's label set (-1, 0, 1).
        return int(torch.argmax(result, dim=1)[0]) - 1

In [9]:
from collections import defaultdict

# Label set used by the evaluation code below.
labels = [-1, 0, 1, 2]
positive_labels = [-1, 0, 1]  # Positive labels: A<B, A=B, A>B
negative_label = 2            # Negative label: No comparison


def compute_metrics(true_labels, pred_labels):
    """Count per-label true positives, false positives and false negatives.

    Only the positive labels are counted; a correct "no comparison"
    prediction contributes to none of the three counters.

    Args:
        true_labels: Iterable of gold labels.
        pred_labels: Iterable of predicted labels, aligned with `true_labels`.

    Returns:
        Tuple `(tp, fp, fn)` of `defaultdict(int)` keyed by label.
    """
    tp, fp, fn = defaultdict(int), defaultdict(int), defaultdict(int)

    for gold, guess in zip(true_labels, pred_labels):
        gold_is_positive = gold in positive_labels
        guess_is_positive = guess in positive_labels

        if gold == guess:
            # Exact hit: only positive labels count as true positives.
            if gold_is_positive:
                tp[gold] += 1
            continue

        # The gold positive label was missed: predicted "no comparison" or
        # confused with another positive label.
        if gold_is_positive and (guess == negative_label or guess_is_positive):
            fn[gold] += 1

        # A positive label was predicted wrongly: gold was "no comparison" or
        # a different positive label.
        if guess_is_positive and (gold == negative_label or gold_is_positive):
            fp[guess] += 1

    return tp, fp, fn

In [10]:
from tqdm import tqdm

# Define a function to run evaluation for all aspects
def overall_evaluate_all_aspects(sampling_method, absa_method, eval_dataset, ablation):
    """Evaluate all four aspect models and pool their predictions into one score set.

    For every aspect a model is built and run over `eval_dataset`; the
    predictions of all (sample, aspect) pairs are then scored together with
    `compute_metrics`.

    Args:
        sampling_method: Sampling-method tag used in the checkpoint ids.
        absa_method: ABSA aggregation method passed to the model.
        eval_dataset: Indexable collection of samples with 'reviewText_1',
            'reviewText_2' and one gold label per aspect name.
        ablation: Ablation tag selecting the model variant.

    Returns:
        dict with 'method' plus micro/macro precision, recall and F1.
    """
    aspects = ['appearance', 'aroma', 'palate', 'taste']

    def safe_ratio(numerator, denominator):
        # Counts and rates are non-negative, so "> 0" is the only non-zero case.
        return numerator / denominator if denominator > 0 else 0

    def f1_from(precision, recall):
        return safe_ratio(2 * precision * recall, precision + recall)

    # Run each aspect model and collect its predictions.
    aspect_predictions = {}
    for aspect in aspects:
        if ablation == "aspect_classifier":
            # NOTE(review): RLCCBERTModelNoAs is not defined in the cells
            # visible here, and a bare RLCC model would be called with two raw
            # strings below -- confirm this branch is still meant to be reachable.
            model = RLCCBERTModelNoAs.from_pretrained(f"trungpq/rlcc-new-{aspect}-{sampling_method}-absa-{absa_method}-{ablation}").to(device)
        else:
            model = OverallMode(aspect, sampling_method, absa_method, ablation, device)

        aspect_predictions[aspect] = [
            model(sample['reviewText_1'], sample['reviewText_2'])
            for sample in tqdm(eval_dataset)
        ]
        # Drop the reference before the next aspect's checkpoints are loaded.
        del model

    # Flatten to one (prediction, gold) pair per sample and aspect.
    all_predictions_flat = []
    all_true_labels_flat = []
    for i in range(len(eval_dataset)):
        sample = eval_dataset[i]
        for aspect in aspects:
            all_predictions_flat.append(aspect_predictions[aspect][i])
            all_true_labels_flat.append(sample[aspect])

    tp, fp, fn = compute_metrics(all_true_labels_flat, all_predictions_flat)

    # Per-class precision / recall / F1 (0 when a denominator is empty).
    class_metrics = {}
    for label in positive_labels:
        precision = safe_ratio(tp[label], tp[label] + fp[label])
        recall = safe_ratio(tp[label], tp[label] + fn[label])
        class_metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f1_from(precision, recall),
            'tp': tp[label],
            'fp': fp[label],
            'fn': fn[label]
        }

    # Macro metrics: unweighted mean over the positive classes.
    n_classes = len(positive_labels)
    macro_precision = sum(m['precision'] for m in class_metrics.values()) / n_classes
    macro_recall = sum(m['recall'] for m in class_metrics.values()) / n_classes
    macro_f1 = sum(m['f1'] for m in class_metrics.values()) / n_classes

    # Micro metrics: computed from the summed counts.
    total_tp = sum(tp[label] for label in positive_labels)
    total_fp = sum(fp[label] for label in positive_labels)
    total_fn = sum(fn[label] for label in positive_labels)

    micro_precision = safe_ratio(total_tp, total_tp + total_fp)
    micro_recall = safe_ratio(total_tp, total_tp + total_fn)
    micro_f1 = f1_from(micro_precision, micro_recall)

    return {
        'method': f"{sampling_method}_absa-{absa_method}",
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'micro_precision': micro_precision,
        'macro_precision': macro_precision,
        'micro_recall': micro_recall,
        'macro_recall': macro_recall
    }

In [11]:
# Define a function to evaluate each aspect separately
def aspect_wise_evaluate(sampling_method, absa_method, eval_dataset, ablation):
    """Evaluate each aspect model separately and report its own metrics.

    Args:
        sampling_method: Sampling-method tag used in the checkpoint ids.
        absa_method: ABSA aggregation method passed to `OverallMode`.
        eval_dataset: Iterable of samples with 'reviewText_1', 'reviewText_2'
            and one gold label per aspect name.
        ablation: Ablation tag selecting the model variant.

    Returns:
        dict mapping aspect name to its micro/macro precision, recall, F1 and
        the per-class metrics (`'class_metrics'`).
    """
    aspects = ['appearance', 'aroma', 'palate', 'taste']
    aspect_results = {}

    def safe_ratio(numerator, denominator):
        # Counts and rates are non-negative, so "> 0" is the only non-zero case.
        return numerator / denominator if denominator > 0 else 0

    def f1_from(precision, recall):
        return safe_ratio(2 * precision * recall, precision + recall)

    for aspect in aspects:
        print(f"Evaluating {aspect} model for aspect-wise metrics...")
        model = OverallMode(aspect, sampling_method, absa_method, ablation, device)

        predictions = []
        true_labels = []
        for sample in tqdm(eval_dataset):
            predictions.append(model(sample['reviewText_1'], sample['reviewText_2']))
            true_labels.append(sample[aspect])

        tp, fp, fn = compute_metrics(true_labels, predictions)

        # Per-class precision / recall / F1 (0 when a denominator is empty).
        class_metrics = {}
        for label in positive_labels:
            precision = safe_ratio(tp[label], tp[label] + fp[label])
            recall = safe_ratio(tp[label], tp[label] + fn[label])
            class_metrics[label] = {
                'precision': precision,
                'recall': recall,
                'f1': f1_from(precision, recall),
                'tp': tp[label],
                'fp': fp[label],
                'fn': fn[label]
            }

        # Macro metrics: unweighted mean over the positive classes.
        n_classes = len(positive_labels)
        macro_precision = sum(m['precision'] for m in class_metrics.values()) / n_classes
        macro_recall = sum(m['recall'] for m in class_metrics.values()) / n_classes
        macro_f1 = sum(m['f1'] for m in class_metrics.values()) / n_classes

        # Micro metrics: computed from the summed counts.
        total_tp = sum(tp[label] for label in positive_labels)
        total_fp = sum(fp[label] for label in positive_labels)
        total_fn = sum(fn[label] for label in positive_labels)

        micro_precision = safe_ratio(total_tp, total_tp + total_fp)
        micro_recall = safe_ratio(total_tp, total_tp + total_fn)
        micro_f1 = f1_from(micro_precision, micro_recall)

        aspect_results[aspect] = {
            'micro_precision': micro_precision,
            'micro_recall': micro_recall,
            'micro_f1': micro_f1,
            'macro_precision': macro_precision,
            'macro_recall': macro_recall,
            'macro_f1': macro_f1,
            'class_metrics': class_metrics
        }

        print(f"{aspect} - Micro P/R/F1: {micro_precision:.4f}/{micro_recall:.4f}/{micro_f1:.4f}")
        print(f"{aspect} - Macro P/R/F1: {macro_precision:.4f}/{macro_recall:.4f}/{macro_f1:.4f}")

        # Drop the reference before the next aspect's checkpoints are loaded.
        del model

    return aspect_results

In [12]:
import datasets

# Load the "test" split of the lengocquangLAB/beer-com-reviews dataset; this
# object is reused for every evaluation run below.
eval_dataset = datasets.load_dataset("lengocquangLAB/beer-com-reviews", split="test")

# One entry per ablation run, laid out as (sampling_method, absa_method, ablation).
# The last run passes None as the ABSA method.
overall_ablation_configs = [
    ('upsample_replacement', 'min', 'semantic_based'),
    ('none', 'min', 'imbalance_handling'),
    ('upsample_replacement', 'min', 'sum_distribution'),
    ('upsample_replacement', None, 'score_prediction'),
]

for sampling_method, absa_method, ablation in overall_ablation_configs:
    print("-"*50)
    print(f"Evaluating with sampling method: {sampling_method}, ABSA method: {absa_method}, ablation: {ablation}")
    # Run the overall evaluation for this configuration and dump the returned
    # metrics as-is.
    results = overall_evaluate_all_aspects(sampling_method, absa_method, eval_dataset, ablation)
    print(results)
    print()

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/977k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/133k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3320 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/420 [00:00<?, ? examples/s]

--------------------------------------------------
Evaluating with sampling method: upsample_replacement, ABSA method: min, ablation: semantic_based


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

100%|██████████| 410/410 [00:30<00:00, 13.26it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:29<00:00, 13.80it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:28<00:00, 14.29it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 12.97it/s]


{'method': 'upsample_replacement_absa-min', 'micro_f1': 0.3021442495126706, 'macro_f1': 0.2802159604281516, 'micro_precision': 0.31313131313131315, 'macro_precision': 0.3327973248374605, 'micro_recall': 0.2919020715630885, 'macro_recall': 0.3198705309835284}

--------------------------------------------------
Evaluating with sampling method: none, ABSA method: min, ablation: imbalance_handling


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:32<00:00, 12.62it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 13.05it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:30<00:00, 13.27it/s]


config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/876M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:34<00:00, 11.76it/s]


{'method': 'none_absa-min', 'micro_f1': 0.5307506053268765, 'macro_f1': 0.5347395075225636, 'micro_precision': 0.5463609172482552, 'macro_precision': 0.5464318223754074, 'micro_recall': 0.5160075329566854, 'macro_recall': 0.5303119551026008}

--------------------------------------------------
Evaluating with sampling method: upsample_replacement, ABSA method: min, ablation: sum_distribution


config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:30<00:00, 13.41it/s]


config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:29<00:00, 13.68it/s]


config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:29<00:00, 14.11it/s]


config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:32<00:00, 12.76it/s]


{'method': 'upsample_replacement_absa-min', 'micro_f1': 0.4230019493177388, 'macro_f1': 0.4163488176664925, 'micro_precision': 0.4383838383838384, 'macro_precision': 0.4560597869996834, 'micro_recall': 0.4086629001883239, 'macro_recall': 0.4265116934045743}

--------------------------------------------------
Evaluating with sampling method: upsample_replacement, ABSA method: None, ablation: score_prediction


config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:29<00:00, 13.69it/s]


config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:29<00:00, 13.89it/s]


config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:28<00:00, 14.21it/s]


config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 410/410 [00:31<00:00, 13.19it/s]


{'method': 'upsample_replacement_absa-None', 'micro_f1': 0.5, 'macro_f1': 0.5037144073033343, 'micro_precision': 0.5181818181818182, 'macro_precision': 0.5174857340022356, 'micro_recall': 0.4830508474576271, 'macro_recall': 0.49810479189153506}



In [13]:
# Aspect-wise evaluation: same ablation configurations, but metrics are
# reported per aspect instead of as one overall result.
banner_rule = "="*70
print(banner_rule)
print("ASPECT-WISE EVALUATION")
print(banner_rule)

# One entry per ablation run, laid out as (sampling_method, absa_method, ablation).
aspect_ablation_configs = [
    ('upsample_replacement', 'min', 'semantic_based'),
    ('none', 'min', 'imbalance_handling'),
    ('upsample_replacement', 'min', 'sum_distribution'),
    ('upsample_replacement', None, 'score_prediction'),
]

for sampling_method, absa_method, ablation in aspect_ablation_configs:
    print("-"*50)
    print(f"Aspect-wise evaluation with sampling method: {sampling_method}, ABSA method: {absa_method}, ablation: {ablation}")
    aspect_results = aspect_wise_evaluate(sampling_method, absa_method, eval_dataset, ablation)

    # Condensed recap: only the micro/macro F1 of each aspect in the returned
    # mapping.
    for aspect, metrics in aspect_results.items():
        print(f"\n{aspect.upper()} Results:")
        print(f"  Micro F1: {metrics['micro_f1']:.4f}")
        print(f"  Macro F1: {metrics['macro_f1']:.4f}")
    print()

ASPECT-WISE EVALUATION
--------------------------------------------------
Aspect-wise evaluation with sampling method: upsample_replacement, ABSA method: min, ablation: semantic_based
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.61it/s]


appearance - Micro P/R/F1: 0.2970/0.2937/0.2953
appearance - Macro P/R/F1: 0.0990/0.3292/0.1522
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:29<00:00, 13.83it/s]


aroma - Micro P/R/F1: 0.2977/0.2700/0.2832
aroma - Macro P/R/F1: 0.0992/0.2963/0.1487
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:28<00:00, 14.28it/s]


palate - Micro P/R/F1: 0.4028/0.3204/0.3569
palate - Macro P/R/F1: 0.1343/0.2544/0.1758
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:31<00:00, 13.04it/s]


taste - Micro P/R/F1: 0.2986/0.2907/0.2946
taste - Macro P/R/F1: 0.0995/0.3303/0.1530

APPEARANCE Results:
  Micro F1: 0.2953
  Macro F1: 0.1522

AROMA Results:
  Micro F1: 0.2832
  Macro F1: 0.1487

PALATE Results:
  Micro F1: 0.3569
  Macro F1: 0.1758

TASTE Results:
  Micro F1: 0.2946
  Macro F1: 0.1530

--------------------------------------------------
Aspect-wise evaluation with sampling method: none, ABSA method: min, ablation: imbalance_handling
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:32<00:00, 12.59it/s]


appearance - Micro P/R/F1: 0.5795/0.5688/0.5741
appearance - Macro P/R/F1: 0.5761/0.5879/0.5761
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:31<00:00, 13.03it/s]


aroma - Micro P/R/F1: 0.5775/0.5190/0.5467
aroma - Macro P/R/F1: 0.5754/0.5230/0.5450
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.31it/s]


palate - Micro P/R/F1: 0.4817/0.4365/0.4580
palate - Macro P/R/F1: 0.4866/0.4370/0.4547
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:34<00:00, 11.88it/s]


taste - Micro P/R/F1: 0.5331/0.5147/0.5237
taste - Macro P/R/F1: 0.5315/0.5309/0.5256

APPEARANCE Results:
  Micro F1: 0.5741
  Macro F1: 0.5761

AROMA Results:
  Micro F1: 0.5467
  Macro F1: 0.5450

PALATE Results:
  Micro F1: 0.4580
  Macro F1: 0.4547

TASTE Results:
  Micro F1: 0.5237
  Macro F1: 0.5256

--------------------------------------------------
Aspect-wise evaluation with sampling method: upsample_replacement, ABSA method: min, ablation: sum_distribution
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:30<00:00, 13.34it/s]


appearance - Micro P/R/F1: 0.4436/0.4387/0.4411
appearance - Macro P/R/F1: 0.4460/0.4473/0.4377
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:29<00:00, 13.68it/s]


aroma - Micro P/R/F1: 0.5488/0.4979/0.5221
aroma - Macro P/R/F1: 0.5804/0.5270/0.5160
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:28<00:00, 14.18it/s]


palate - Micro P/R/F1: 0.3958/0.3149/0.3508
palate - Macro P/R/F1: 0.4091/0.3327/0.3532
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:31<00:00, 12.86it/s]


taste - Micro P/R/F1: 0.3863/0.3760/0.3811
taste - Macro P/R/F1: 0.4211/0.3946/0.3650

APPEARANCE Results:
  Micro F1: 0.4411
  Macro F1: 0.4377

AROMA Results:
  Micro F1: 0.5221
  Macro F1: 0.5160

PALATE Results:
  Micro F1: 0.3508
  Macro F1: 0.3532

TASTE Results:
  Micro F1: 0.3811
  Macro F1: 0.3650

--------------------------------------------------
Aspect-wise evaluation with sampling method: upsample_replacement, ABSA method: None, ablation: score_prediction
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:29<00:00, 13.80it/s]


appearance - Micro P/R/F1: 0.5602/0.5539/0.5570
appearance - Macro P/R/F1: 0.5645/0.5626/0.5631
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:29<00:00, 14.00it/s]


aroma - Micro P/R/F1: 0.5721/0.5190/0.5442
aroma - Macro P/R/F1: 0.5693/0.5291/0.5471
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:28<00:00, 14.27it/s]


palate - Micro P/R/F1: 0.3958/0.3149/0.3508
palate - Macro P/R/F1: 0.3924/0.3291/0.3515
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:31<00:00, 13.19it/s]

taste - Micro P/R/F1: 0.5041/0.4907/0.4973
taste - Macro P/R/F1: 0.4999/0.5107/0.4952

APPEARANCE Results:
  Micro F1: 0.5570
  Macro F1: 0.5631

AROMA Results:
  Micro F1: 0.5442
  Macro F1: 0.5471

PALATE Results:
  Micro F1: 0.3508
  Macro F1: 0.3515

TASTE Results:
  Micro F1: 0.4973
  Macro F1: 0.4952




