In [1]:
import torch
import torch.nn as nn
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import json
import fasttext
import numpy as np
import xgboost as xgb

import warnings

# Suppress UserWarning messages raised from sklearn modules (this covers the legacy InconsistentVersionWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

import joblib
import os


nltk.download('punkt')
nltk.download('punkt_tab')
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip cc.en.300.bin.gz   # extracts cc.en.300.bin

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load FastText model and embedding function (same as SVM file)
print("Loading fastText model...")
fasttext_model = fasttext.load_model("cc.en.300.bin")

--2025-11-13 15:13:33--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.51, 3.163.189.14, 3.163.189.108, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.51|:443... connected.


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2025-11-13 15:13:46 (357 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]

Loading fastText model...


In [8]:
def get_embedding(text):
    """Return the fastText sentence vector for ``text``.

    ``text`` is coerced with ``str()``. Newline characters are replaced by
    single spaces before the lookup because fastText's ``get_sentence_vector``
    raises ``ValueError`` for input that contains a newline; text without
    newlines is passed through unchanged.

    Uses the module-level ``fasttext_model`` loaded earlier in the notebook.
    """
    single_line = str(text).replace("\n", " ")
    return fasttext_model.get_sentence_vector(single_line)

class SentenceAspectXGB:
    """Sentence-level XGBoost filter for a single aspect.

    Loading is best-effort: if the checkpoint cannot be loaded, ``self.model``
    is ``None`` and ``filter_positive_sentences`` returns its input untouched.
    """

    def __init__(self, aspect, model_path=None):
        """Load the booster for ``aspect``.

        Args:
            aspect: Aspect name; used to build the default checkpoint path.
            model_path: Optional explicit checkpoint path. When ``None`` the
                Kaggle phase-1 checkpoint for ``aspect`` is used.
        """
        self.aspect = aspect
        if model_path is None:
            model_path = f"/kaggle/input/sudo-xgboost-checkpoint-phase1/{aspect}_best_model.json"
        booster = xgb.Booster()
        try:
            booster.load_model(model_path)
        except Exception as e:
            # Report the failure and continue without a model.
            print(f"Sentence XGB model not found or failed to load ({model_path}): {e}")
            booster = None
        self.model = booster

    def filter_positive_sentences(self, sentences, threshold=0.5):
        """Keep the sentences whose predicted score is strictly above ``threshold``.

        Args:
            sentences: Sentences to score; each is embedded with ``get_embedding``.
            threshold: Score cut-off (exclusive).

        Returns:
            ``[]`` for empty input. The original ``sentences`` object when no
            model is loaded or when scoring raises. Otherwise a new list with
            the sentences that passed the threshold, in input order.
        """
        if not sentences:
            return []
        if self.model is None:
            # No classifier available: pass everything through so downstream logic still runs.
            return sentences
        try:
            features = np.vstack([get_embedding(s) for s in sentences])
            scores = self.model.predict(xgb.DMatrix(features))
            kept = []
            for sentence, score in zip(sentences, scores):
                if float(score) > threshold:
                    kept.append(sentence)
            return kept
        except Exception as e:
            print(f"Sentence XGB predict failed: {e}")
            return sentences

class XGBoostComparativeModel:
    """XGBoost classifier that compares two texts for one aspect.

    Both texts are embedded with the shared ``get_embedding`` helper (no
    separate fastText model is loaded here) and the two vectors are
    concatenated into a single feature row.
    """

    def __init__(self, aspect):
        """Load the phase-2 checkpoint for ``aspect``.

        Unlike ``SentenceAspectXGB`` there is no fallback: a failure in
        ``load_model`` propagates to the caller.
        """
        # Use global fastText embedding; do not load another fastText model
        self.aspect = aspect
        self.xgb_model = xgb.Booster()
        model_path = f"/kaggle/input/sudo-xgboost-checkpoint-phase2/{aspect}_best_model.json"
        self.xgb_model.load_model(model_path)
        # Comparison labels (-1, 0, 1) <-> class ids (0, 1, 2).
        self.label2id = {-1: 0, 0: 1, 1: 2}
        self.id2label = {v: k for k, v in self.label2id.items()}

    def predict(self, text_1, text_2):
        """Return the comparison label (-1, 0 or 1) for the pair of texts.

        The booster output is accepted in two shapes: a flat array of class
        ids, or one row of per-class scores per sample, in which case the
        arg-max of the row is taken as the class id. Which of the two the
        checkpoint emits depends on its training objective, which is not
        visible here.

        Raises:
            KeyError: if the resulting class id is not in ``id2label``.
        """
        emb_1 = get_embedding(text_1)
        emb_2 = get_embedding(text_2)
        features = np.concatenate([emb_1, emb_2]).reshape(1, -1)
        raw = np.asarray(self.xgb_model.predict(xgb.DMatrix(features)))
        if raw.ndim > 1 and raw.shape[-1] > 1:
            # One score per class: pick the most likely class.
            pred_id = int(np.argmax(raw[0]))
        else:
            # Already a class id (possibly wrapped in a singleton dimension).
            pred_id = int(raw.reshape(-1)[0])
        return self.id2label[pred_id]


In [9]:
class OverallMode(nn.Module):
    """Two-stage comparison of two reviews for a single aspect.

    Stage 1 keeps only the sentences that ``SentenceAspectXGB`` accepts for the
    aspect; stage 2 joins the kept sentences of each review and hands both
    texts to ``XGBoostComparativeModel``.
    """

    # Label returned by ``forward`` whenever no comparison can be produced.
    NO_COMPARISON = 2

    def __init__(self, aspect, device):
        """
        Args:
            aspect: One of 'appearance', 'aroma', 'palate', 'taste'; any other
                value raises ``KeyError``.
            device: Stored on the instance; the methods of this class do not use it.
        """
        super().__init__()
        self.aspect = aspect
        self.device = device

        self.aspect_index = {'appearance': 0, 'aroma': 1, 'palate': 2, 'taste': 3}
        self.aspect_idx = self.aspect_index[aspect]

        # Sentence-level XGBoost filter (used in place of SLAC-BERT)
        self.sentence_xgb = SentenceAspectXGB(aspect)

        # XGBoost model for comparative classification
        self.xgb_model = XGBoostComparativeModel(aspect)

    @staticmethod
    def _truncate_seq(tokens, max_length):
        """Drop items from the end of ``tokens`` until at most ``max_length`` remain.

        The list is shortened in place and the same list object is returned.
        """
        while len(tokens) > max_length:
            tokens.pop()
        return tokens

    @staticmethod
    def _split_clean_sentences(text):
        """Split ``text`` into lower-cased sentences with punctuation removed.

        Sentences with a single token are skipped. Sentences that become empty
        once non-word characters are stripped (punctuation-only sentences) are
        skipped as well, so no empty string reaches the classifiers. A
        non-string ``text`` (for example a missing review) yields ``[]``.
        """
        if not isinstance(text, str):
            return []
        cleaned = []
        for sentence in sent_tokenize(text.lower()):
            if len(word_tokenize(sentence)) <= 1:
                continue
            normalized = re.sub(r'\W+', ' ', sentence).strip()
            if normalized:
                cleaned.append(normalized)
        return cleaned

    def _get_aspect_sentences(self, review_sentences):
        """Return the sentences the sentence-level classifier keeps for this aspect."""
        if len(review_sentences) == 0:
            return []
        return self.sentence_xgb.filter_positive_sentences(review_sentences)

    def forward(self, review_1, review_2):
        """Compare two reviews on this aspect.

        Returns:
            The value returned by ``self.xgb_model.predict`` for the joined
            aspect sentences of both reviews, or ``NO_COMPARISON`` (2) when
            either review has no usable sentences, when either review has no
            aspect sentences, or when the comparison model raises.
        """
        review_1_sent = self._split_clean_sentences(review_1)
        review_2_sent = self._split_clean_sentences(review_2)
        if not review_1_sent or not review_2_sent:
            return self.NO_COMPARISON

        review_1_aspect_sentences = self._get_aspect_sentences(review_1_sent)
        review_2_aspect_sentences = self._get_aspect_sentences(review_2_sent)
        if len(review_1_aspect_sentences) == 0 or len(review_2_aspect_sentences) == 0:
            return self.NO_COMPARISON

        # Combine aspect sentences into single texts
        review_1_text = " ".join(review_1_aspect_sentences)
        review_2_text = " ".join(review_2_aspect_sentences)

        # Best-effort comparison: a failing model is reported and treated as "no comparison".
        try:
            return self.xgb_model.predict(review_1_text, review_2_text)
        except Exception as e:
            print(f"XGBoost prediction failed: {e}")
            return self.NO_COMPARISON


In [10]:
# Label vocabulary of the comparison task
labels = [-1, 0, 1, 2]
positive_labels = [-1, 0, 1]  # comparative outcomes: A<B, A=B, A>B
negative_label = 2            # no comparison

from collections import defaultdict
def compute_metrics(true_labels, pred_labels):
    """Count per-label TP / FP / FN over paired gold and predicted labels.

    Only labels in ``positive_labels`` are counted. A correct prediction of
    ``negative_label`` contributes nothing, and a pair involving a label
    outside the vocabulary is ignored unless it is an exact positive match.

    Returns:
        ``(tp, fp, fn)``: three ``defaultdict(int)`` keyed by label.
    """
    tp, fp, fn = defaultdict(int), defaultdict(int), defaultdict(int)

    for gold, guess in zip(true_labels, pred_labels):
        if gold == guess:
            # Exact match: a hit only when the label is a positive one.
            if gold in positive_labels:
                tp[gold] += 1
            continue

        gold_is_positive = gold in positive_labels
        guess_is_positive = guess in positive_labels

        if gold_is_positive and guess == negative_label:
            # A comparison existed but none was predicted.
            fn[gold] += 1
        elif gold == negative_label and guess_is_positive:
            # A comparison was predicted where none existed.
            fp[guess] += 1
        elif gold_is_positive and guess_is_positive:
            # Wrong positive label: missed the gold one, over-predicted the other.
            fn[gold] += 1
            fp[guess] += 1

    return tp, fp, fn


In [11]:
from tqdm import tqdm

# Define a function to run evaluation for all aspects
def overall_evaluate_all_aspects(eval_dataset):
    """Evaluate all four aspect models and pool their predictions into one score.

    One ``OverallMode`` is built per aspect and run over every sample. The
    (gold, predicted) pairs of all aspects are then pooled and scored with
    ``compute_metrics``.

    Args:
        eval_dataset: Iterable of samples with ``'reviewText_1'``,
            ``'reviewText_2'`` and one gold-label entry per aspect name. It is
            iterated once per aspect and once more to collect the gold labels.

    Returns:
        dict with ``micro_*`` and ``macro_*`` precision, recall and F1. Macro
        values average over ``positive_labels``; a class or total with a zero
        denominator contributes 0.
    """
    aspects = ['appearance', 'aroma', 'palate', 'taste']

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0 rather than raising.
        return numerator / denominator if denominator > 0 else 0

    def _f1(precision, recall):
        if precision + recall == 0:
            return 0
        return 2 * precision * recall / (precision + recall)

    # Run each aspect model over the whole dataset.
    aspect_predictions = {}
    for aspect in aspects:
        model = OverallMode(
            aspect,
            device
        )
        aspect_predictions[aspect] = [
            model(sample['reviewText_1'], sample['reviewText_2'])
            for sample in tqdm(eval_dataset)
        ]
        del model

    # Pool predictions and gold labels. Each row is read once instead of being
    # looked up by index again for every aspect.
    all_predictions_flat = []
    all_true_labels_flat = []
    for i, sample in enumerate(eval_dataset):
        for aspect in aspects:
            all_predictions_flat.append(aspect_predictions[aspect][i])
            all_true_labels_flat.append(sample[aspect])

    tp, fp, fn = compute_metrics(all_true_labels_flat, all_predictions_flat)

    # Per-class precision / recall / F1
    class_metrics = {}
    for label in positive_labels:
        precision = _ratio(tp[label], tp[label] + fp[label])
        recall = _ratio(tp[label], tp[label] + fn[label])
        class_metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': _f1(precision, recall),
        }

    # Macro metrics: unweighted mean over the positive classes
    n_classes = len(positive_labels)
    macro_precision = sum(m['precision'] for m in class_metrics.values()) / n_classes
    macro_recall = sum(m['recall'] for m in class_metrics.values()) / n_classes
    macro_f1 = sum(m['f1'] for m in class_metrics.values()) / n_classes

    # Micro metrics: computed from the summed counts
    total_tp = sum(tp[label] for label in positive_labels)
    total_fp = sum(fp[label] for label in positive_labels)
    total_fn = sum(fn[label] for label in positive_labels)

    micro_precision = _ratio(total_tp, total_tp + total_fp)
    micro_recall = _ratio(total_tp, total_tp + total_fn)
    micro_f1 = _f1(micro_precision, micro_recall)

    return {
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'micro_precision': micro_precision,
        'macro_precision': macro_precision,
        'micro_recall': micro_recall,
        'macro_recall': macro_recall
    }


In [12]:
# Define a function to evaluate each aspect separately
def aspect_wise_evaluate(eval_dataset):
    """Score every aspect model on its own and print a short report per aspect.

    Args:
        eval_dataset: Iterable of samples with ``'reviewText_1'``,
            ``'reviewText_2'`` and one gold-label entry per aspect name.

    Returns:
        dict mapping aspect name to its micro/macro precision, recall and F1
        plus ``'class_metrics'`` (per-label precision, recall, F1 and the raw
        tp/fp/fn counts).
    """
    def _safe_div(numerator, denominator):
        # Counts are never negative, so a zero denominator is the only undefined case.
        if denominator == 0:
            return 0
        return numerator / denominator

    def _harmonic(precision, recall):
        if precision + recall == 0:
            return 0
        return 2 * precision * recall / (precision + recall)

    aspect_results = {}

    for aspect in ('appearance', 'aroma', 'palate', 'taste'):
        print(f"Evaluating {aspect} model for aspect-wise metrics...")
        model = OverallMode(aspect, device)

        predictions, true_labels = [], []
        for sample in tqdm(eval_dataset):
            predictions.append(model(sample['reviewText_1'], sample['reviewText_2']))
            true_labels.append(sample[aspect])

        tp, fp, fn = compute_metrics(true_labels, predictions)

        # Per-class scores, keyed by label in the order of positive_labels
        class_metrics = {}
        for label in positive_labels:
            hits, extra, missed = tp[label], fp[label], fn[label]
            precision = _safe_div(hits, hits + extra)
            recall = _safe_div(hits, hits + missed)
            class_metrics[label] = {
                'precision': precision,
                'recall': recall,
                'f1': _harmonic(precision, recall),
                'tp': hits,
                'fp': extra,
                'fn': missed
            }

        # Macro: unweighted mean over the positive classes
        per_class = list(class_metrics.values())
        n_classes = len(positive_labels)
        macro_precision = sum(m['precision'] for m in per_class) / n_classes
        macro_recall = sum(m['recall'] for m in per_class) / n_classes
        macro_f1 = sum(m['f1'] for m in per_class) / n_classes

        # Micro: computed from the summed counts
        total_tp = sum(tp[label] for label in positive_labels)
        total_fp = sum(fp[label] for label in positive_labels)
        total_fn = sum(fn[label] for label in positive_labels)
        micro_precision = _safe_div(total_tp, total_tp + total_fp)
        micro_recall = _safe_div(total_tp, total_tp + total_fn)
        micro_f1 = _harmonic(micro_precision, micro_recall)

        aspect_results[aspect] = {
            'micro_precision': micro_precision,
            'micro_recall': micro_recall,
            'micro_f1': micro_f1,
            'macro_precision': macro_precision,
            'macro_recall': macro_recall,
            'macro_f1': macro_f1,
            'class_metrics': class_metrics
        }

        print(f"{aspect} - Micro P/R/F1: {micro_precision:.4f}/{micro_recall:.4f}/{micro_f1:.4f}")
        print(f"{aspect} - Macro P/R/F1: {macro_precision:.4f}/{macro_recall:.4f}/{macro_f1:.4f}")

        del model

    return aspect_results


In [13]:
import datasets

# Pooled evaluation over all aspects on the test split
eval_dataset = datasets.load_dataset("lengocquangLAB/beer-com-reviews", split="test")
results = overall_evaluate_all_aspects(eval_dataset)
print(results)


# Aspect-wise evaluation, introduced by a three-line banner
print("="*70, "ASPECT-WISE EVALUATION", "="*70, sep="\n")

aspect_results = aspect_wise_evaluate(eval_dataset)

# One short F1 summary per aspect, followed by a closing blank line
for aspect, metrics in aspect_results.items():
    print(f"\n{aspect.upper()} Results:")
    print(f"  Micro F1: {metrics['micro_f1']:.4f}")
    print(f"  Macro F1: {metrics['macro_f1']:.4f}")
print()

100%|██████████| 410/410 [00:01<00:00, 265.74it/s]
100%|██████████| 410/410 [00:01<00:00, 265.39it/s]
100%|██████████| 410/410 [00:01<00:00, 266.99it/s]
100%|██████████| 410/410 [00:01<00:00, 246.77it/s]


{'micro_f1': 0.46832814122533745, 'macro_f1': 0.464278736718615, 'micro_precision': 0.5219907407407407, 'macro_precision': 0.5415491783372973, 'micro_recall': 0.4246704331450094, 'macro_recall': 0.41536076167800334}
ASPECT-WISE EVALUATION
Evaluating appearance model for aspect-wise metrics...


100%|██████████| 410/410 [00:01<00:00, 260.15it/s]


appearance - Micro P/R/F1: 0.5514/0.4981/0.5234
appearance - Macro P/R/F1: 0.6287/0.4803/0.5152
Evaluating aroma model for aspect-wise metrics...


100%|██████████| 410/410 [00:01<00:00, 261.61it/s]


aroma - Micro P/R/F1: 0.5722/0.4346/0.4940
aroma - Macro P/R/F1: 0.5834/0.3991/0.4599
Evaluating palate model for aspect-wise metrics...


100%|██████████| 410/410 [00:01<00:00, 266.97it/s]


palate - Micro P/R/F1: 0.6204/0.3702/0.4637
palate - Macro P/R/F1: 0.6506/0.3629/0.4606
Evaluating taste model for aspect-wise metrics...


100%|██████████| 410/410 [00:01<00:00, 249.86it/s]

taste - Micro P/R/F1: 0.4414/0.3920/0.4153
taste - Macro P/R/F1: 0.4458/0.3980/0.4198

APPEARANCE Results:
  Micro F1: 0.5234
  Macro F1: 0.5152

AROMA Results:
  Micro F1: 0.4940
  Macro F1: 0.4599

PALATE Results:
  Micro F1: 0.4637
  Macro F1: 0.4606

TASTE Results:
  Micro F1: 0.4153
  Macro F1: 0.4198




