In [None]:
# Cell 1: Robust Library Installation
# Step 1: Clean up any potentially conflicting versions
print("--> Uninstalling existing versions...")
!pip uninstall -y transformers accelerate datasets torch torchvision

# Step 2: Reinstall a stable, compatible set of the core libraries
print("\n--> Reinstalling core libraries...")
!pip install transformers accelerate datasets torch torchvision

# Step 3: Install the remaining libraries
print("\n--> Installing other required libraries...")
!pip install scikit-learn pandas imbalanced-learn -q

print("\n✅ All libraries have been reinstalled.")

--> Uninstalling existing versions...
Found existing installation: transformers 4.55.1
Uninstalling transformers-4.55.1:
  Successfully uninstalled transformers-4.55.1
Found existing installation: accelerate 1.10.0
Uninstalling accelerate-1.10.0:
  Successfully uninstalled accelerate-1.10.0
Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124

--> Reinstalling core libraries...
Collecting transformers
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl

In [None]:
# Cell 2: Check Library Versions
# This output should be noted for the 'reproducibility' section of the paper
!pip freeze | grep -E "transformers|torch|scikit-learn|datasets"

datasets==4.0.0
scikit-learn==1.6.1
sentence-transformers==5.1.0
tensorflow-datasets==4.9.9
torch==2.8.0
torchao==0.10.0
torchaudio @ https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl
torchdata==0.11.0
torchsummary==1.5.1
torchtune==0.6.1
torchvision==0.23.0
transformers==4.55.2
vega-datasets==0.9.0


In [None]:
# Cell 3: Connect to Drive, Setup Paths & Initialize
import json
import os
import pickle
import random

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from torch import nn
from transformers import (
    AutoTokenizer,
    AutoConfig, # <-- Import AutoConfig
    Trainer,
    TrainingArguments,
    BertPreTrainedModel,
    BertModel
)
from transformers.modeling_outputs import SequenceClassifierOutput

# Mount Google Drive
drive.mount('/content/drive')

# --- KEY SETTINGS ---
SEED = 42
GDRIVE_PATH = '/content/drive/MyDrive/eecsi_revise/'
MODEL_NAME = "indobenchmark/indobert-base-p1"

# --- NEW: Define a dedicated path for MTL results ---
MTL_RESULTS_PATH = os.path.join(GDRIVE_PATH, 'mtl_results/')
os.makedirs(MTL_RESULTS_PATH, exist_ok=True)

# Set seed for reproducibility across all libraries:
# Python's own RNG, NumPy, and PyTorch (CPU and every CUDA device).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

print(f"✅ Setup complete. MTL checkpoints and results will be saved to: {MTL_RESULTS_PATH}")

Mounted at /content/drive
✅ Setup complete. MTL checkpoints and results will be saved to: /content/drive/MyDrive/eecsi_revise/mtl_results/


In [None]:
# Cell 4: Load and Prepare Data for MTL
file_path_csv = os.path.join(GDRIVE_PATH, 'final_golden_dataset_eecsi.csv')
df = pd.read_csv(file_path_csv)

# Fail fast with a clear message if the CSV lacks a column the MTL pipeline reads
# (text input plus one label column per task).
required_columns = ['cleaned_text', 'aspect', 'sentiment']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"'{file_path_csv}' is missing required column(s): {missing_columns}")

# --- CRITICAL: Use relevant data only ---
relevant_df = df[df['aspect'] != 'Irrelevant'].copy()

# A missing (NaN) aspect compares as "not equal" to 'Irrelevant' and therefore
# survives the filter above; missing text or labels would otherwise only surface
# later, during tokenization or label mapping, with a much less helpful error.
null_counts = relevant_df[required_columns].isna().sum()
if null_counts.any():
    raise ValueError(
        f"Relevant rows contain missing values: {null_counts[null_counts > 0].to_dict()}"
    )

print(f"Using {len(relevant_df)} relevant rows for Multi-Task Learning.")

Using 2037 relevant rows for Multi-Task Learning.


In [None]:
# Cell 5: Define the Multi-Task Learning Model Architecture
from torch import nn
from transformers import BertPreTrainedModel, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoConfig # Make sure AutoConfig is imported

class IndoBERT_MTL(BertPreTrainedModel):
    """BERT encoder shared by two linear classification heads (aspect, sentiment).

    The config passed in must carry two extra attributes, `num_aspect_labels`
    and `num_sentiment_labels`, which size the two heads. Both heads read the
    same dropout-regularised pooled encoder output.
    """

    def __init__(self, config):
        super().__init__(config)
        # Store the number of labels for each task from the model's config
        self.num_aspect_labels = config.num_aspect_labels
        self.num_sentiment_labels = config.num_sentiment_labels

        # Shared Body: The core IndoBERT model
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Head 1: The Aspect Classifier
        self.aspect_classifier = nn.Linear(config.hidden_size, self.num_aspect_labels)

        # Head 2: The Sentiment Classifier
        self.sentiment_classifier = nn.Linear(config.hidden_size, self.num_sentiment_labels)

        # Initialize the weights of the new layers
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        aspect_labels=None,    # Custom label name for aspects
        sentiment_labels=None, # Custom label name for sentiments
        return_dict=None,
        **kwargs,
    ):
        """Run the shared encoder and both heads.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the encoder.
            aspect_labels: optional aspect class ids; adds the aspect
                cross-entropy term to the loss when given.
            sentiment_labels: optional sentiment class ids; adds the sentiment
                cross-entropy term to the loss when given.
            return_dict: when None, falls back to `config.use_return_dict`.
            **kwargs: accepted and ignored.

        Returns:
            With `return_dict` true, a `SequenceClassifierOutput` whose `logits`
            is the tuple `(aspect_logits, sentiment_logits)` and whose `loss` is
            the unweighted sum of the available task losses, or None when no
            labels were passed. With `return_dict` false, the equivalent plain
            tuple: `(loss, logits, ...)` if a loss exists, else `(logits, ...)`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Pass inputs through the shared BERT body
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=return_dict,
        )

        # Use the pooled output for classification (index 1 works for both the
        # tuple and the ModelOutput form of the encoder result)
        pooled_output = self.dropout(outputs[1])

        # Get logits (raw predictions) from both heads
        aspect_logits = self.aspect_classifier(pooled_output)
        sentiment_logits = self.sentiment_classifier(pooled_output)
        logits = (aspect_logits, sentiment_logits)

        # --- Calculate the Combined Loss ---
        # The loss stays None when no labels are supplied, instead of the
        # integer 0, so inference outputs do not carry a bogus "loss" entry.
        loss = None
        if aspect_labels is not None or sentiment_labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            task_losses = []
            # 1. Aspect Loss (if labels are provided)
            if aspect_labels is not None:
                task_losses.append(
                    loss_fct(aspect_logits.view(-1, self.num_aspect_labels), aspect_labels.view(-1))
                )
            # 2. Sentiment Loss (if labels are provided)
            if sentiment_labels is not None:
                task_losses.append(
                    loss_fct(sentiment_logits.view(-1, self.num_sentiment_labels), sentiment_labels.view(-1))
                )
            loss = sum(task_losses)

        if not return_dict:
            # The encoder returned a plain tuple here, so attribute access such
            # as `.hidden_states` is not available; pass the extras through.
            output = (logits,) + tuple(outputs[2:])
            return ((loss,) + output) if loss is not None else output

        # The Trainer API expects a specific output format
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits, # Return both logits as a tuple
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

print("✅ Custom MTL model class 'IndoBERT_MTL' defined.")

✅ Custom MTL model class 'IndoBERT_MTL' defined.


In [None]:
# Cell 6: Prepare Helper Functions & Classes for MTL

# 1. Custom PyTorch Dataset for MTL
class MTLDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenizer fields plus one label per task.

    `encodings` is a mapping from field name to an indexable of per-example
    values; `aspect_labels` and `sentiment_labels` are indexables of per-example
    labels. The dataset length is the number of aspect labels.
    """

    def __init__(self, encodings, aspect_labels, sentiment_labels):
        self.encodings = encodings
        self.aspect_labels = aspect_labels
        self.sentiment_labels = sentiment_labels

    def __len__(self):
        return len(self.aspect_labels)

    def __getitem__(self, idx):
        # Tensorise every encoding field for this example first ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... then attach the two task labels under the names the model expects
        sample['aspect_labels'] = torch.tensor(self.aspect_labels[idx])
        sample['sentiment_labels'] = torch.tensor(self.sentiment_labels[idx])
        return sample

# 2. Custom compute_metrics function for MTL
def compute_metrics_mtl(p):
    """Score both task heads from a single evaluation pass.

    `p.predictions` and `p.label_ids` are each indexed as (aspect, sentiment):
    position 0 holds the aspect arrays and position 1 the sentiment arrays.
    Predicted classes are the argmax over axis 1 of each prediction array.

    Returns a dict with the macro-averaged F1 of each task under the keys
    "aspect_macro_f1" and "sentiment_macro_f1".
    """
    def _macro_f1(y_true, scores):
        # Turn raw per-class scores into predicted class ids, then read the
        # macro-averaged F1 out of sklearn's report dict.
        y_pred = np.argmax(scores, axis=1)
        report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
        return report["macro avg"]["f1-score"]

    aspect_scores, sentiment_scores = p.predictions[0], p.predictions[1]
    aspect_true, sentiment_true = p.label_ids[0], p.label_ids[1]

    # Each task is scored separately
    return {
        "aspect_macro_f1": _macro_f1(aspect_true, aspect_scores),
        "sentiment_macro_f1": _macro_f1(sentiment_true, sentiment_scores),
    }

print("✅ MTL-specific helper functions and classes are ready.")

✅ MTL-specific helper functions and classes are ready.


In [None]:
# Cell 7: Run the 5-Fold Cross-Validation for MTL
from transformers import AutoConfig # Make sure AutoConfig is imported

fold_results = []
X = relevant_df['cleaned_text']
y_aspect = relevant_df['aspect']
y_sentiment = relevant_df['sentiment']

# --- Create Label Mappings for BOTH tasks ---
aspect_labels_list = np.array(sorted(y_aspect.unique()))
sentiment_labels_list = np.array(sorted(y_sentiment.unique()))

aspect_label2id = {label: i for i, label in enumerate(aspect_labels_list)}
sentiment_label2id = {label: i for i, label in enumerate(sentiment_labels_list)}

# Number of cross-validation folds; drives the splitter and the progress messages
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# The tokenizer does not depend on the fold, so it is loaded once up front
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Placeholders so the previous fold's objects can be released at the top of each iteration
model = trainer = None

# NOTE(review): each fold's held-out split is passed as `eval_dataset`, so it drives
# per-epoch checkpoint selection (`load_best_model_at_end`) AND is the split the
# reported score is computed on. The reported metrics are therefore selected on the
# test fold and are likely optimistic. Consider carving a validation split out of
# the training fold for checkpoint selection — confirm the intended protocol.

# We stratify by aspect as it is the more complex task with more classes
for i, (train_index, test_index) in enumerate(skf.split(X, y_aspect)):
    print(f"--- Running Fold {i+1}/{N_SPLITS} ---")

    # Drop the previous fold's model/trainer (and with them the optimizer state)
    # before building new ones, so two models never sit on the GPU at once.
    # The last fold's objects remain available after the loop.
    model = trainer = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    train_df = relevant_df.iloc[train_index]
    test_df = relevant_df.iloc[test_index]

    train_encodings = tokenizer(list(train_df['cleaned_text']), truncation=True, padding=True, max_length=128)
    test_encodings = tokenizer(list(test_df['cleaned_text']), truncation=True, padding=True, max_length=128)

    # Create two sets of labels for training and testing
    train_aspect_labels = [aspect_label2id[label] for label in train_df['aspect']]
    train_sentiment_labels = [sentiment_label2id[label] for label in train_df['sentiment']]
    test_aspect_labels = [aspect_label2id[label] for label in test_df['aspect']]
    test_sentiment_labels = [sentiment_label2id[label] for label in test_df['sentiment']]

    train_dataset = MTLDataset(train_encodings, train_aspect_labels, train_sentiment_labels)
    test_dataset = MTLDataset(test_encodings, test_aspect_labels, test_sentiment_labels)

    # Get model config and add our custom parameters for the two heads
    config = AutoConfig.from_pretrained(MODEL_NAME)
    config.num_aspect_labels = len(aspect_labels_list)
    config.num_sentiment_labels = len(sentiment_labels_list)

    # Initialize our custom MTL model
    model = IndoBERT_MTL.from_pretrained(MODEL_NAME, config=config)

    if i == 0:
        print(f"Model Parameters: {model.num_parameters()/1e6:.2f}M")

    training_args = TrainingArguments(
        output_dir=os.path.join(MTL_RESULTS_PATH, f'fold_{i+1}'),
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        # Checkpoints go to Google Drive; without a limit every epoch of every fold
        # is kept. With load_best_model_at_end the best checkpoint is always retained,
        # so this does not change which model is evaluated.
        save_total_limit=1,
        # We need to pick one primary metric to decide the "best" model checkpoint
        load_best_model_at_end=True,
        metric_for_best_model="eval_aspect_macro_f1",
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        seed=SEED,
        # This is required because our dataset has custom column names ('aspect_labels', etc.)
        remove_unused_columns=False,
    )

    # Use the standard Trainer, not the weighted one
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_mtl,
    )

    trainer.train()

    eval_results = trainer.evaluate()
    fold_results.append(eval_results)
    print(f"Fold {i+1} complete. Evaluation results: {eval_results}")

print(f"\n✅ {N_SPLITS}-fold cross-validation process for MTL finished.")

--- Running Fold 1/5 ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of IndoBERT_MTL were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['aspect_classifier.bias', 'aspect_classifier.weight', 'sentiment_classifier.bias', 'sentiment_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Parameters: 124.45M


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mitaeyeong2532[0m ([33mitaeyeong2532-telkom-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Aspect Macro F1,Sentiment Macro F1
1,1.3206,1.305656,0.683404,0.722975
2,0.9646,1.126179,0.744188,0.777297
3,0.5428,1.111874,0.769867,0.804804
4,0.3084,1.144168,0.760813,0.794253
5,0.2251,1.160787,0.779988,0.794279


Fold 1 complete. Evaluation results: {'eval_loss': 1.1607868671417236, 'eval_aspect_macro_f1': 0.779987544905294, 'eval_sentiment_macro_f1': 0.7942787486411648, 'eval_runtime': 1.8574, 'eval_samples_per_second': 219.664, 'eval_steps_per_second': 13.998, 'epoch': 5.0}
--- Running Fold 2/5 ---


Some weights of IndoBERT_MTL were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['aspect_classifier.bias', 'aspect_classifier.weight', 'sentiment_classifier.bias', 'sentiment_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Aspect Macro F1,Sentiment Macro F1
1,1.4143,1.267312,0.736778,0.770059
2,0.874,0.996004,0.801869,0.816882
3,0.528,0.944974,0.791061,0.841673
4,0.3139,0.965674,0.802172,0.837858
5,0.1891,0.955206,0.800426,0.839799


Fold 2 complete. Evaluation results: {'eval_loss': 0.9656739830970764, 'eval_aspect_macro_f1': 0.8021717458058394, 'eval_sentiment_macro_f1': 0.8378578619340097, 'eval_runtime': 1.7298, 'eval_samples_per_second': 235.863, 'eval_steps_per_second': 15.031, 'epoch': 5.0}
--- Running Fold 3/5 ---


Some weights of IndoBERT_MTL were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['aspect_classifier.bias', 'aspect_classifier.weight', 'sentiment_classifier.bias', 'sentiment_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Aspect Macro F1,Sentiment Macro F1
1,1.4845,1.367788,0.730675,0.730397
2,0.8878,1.155798,0.810074,0.759527
3,0.5456,1.041584,0.801549,0.76478
4,0.3523,1.090245,0.820407,0.762553
5,0.2333,1.081214,0.818748,0.750631


Fold 3 complete. Evaluation results: {'eval_loss': 1.0902454853057861, 'eval_aspect_macro_f1': 0.8204069355828484, 'eval_sentiment_macro_f1': 0.7625531378929796, 'eval_runtime': 1.5022, 'eval_samples_per_second': 270.936, 'eval_steps_per_second': 17.308, 'epoch': 5.0}
--- Running Fold 4/5 ---


Some weights of IndoBERT_MTL were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['aspect_classifier.bias', 'aspect_classifier.weight', 'sentiment_classifier.bias', 'sentiment_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Aspect Macro F1,Sentiment Macro F1
1,1.4649,1.46995,0.68003,0.68863
2,0.8617,1.119455,0.757273,0.756471
3,0.4521,1.11,0.76309,0.804787
4,0.3288,1.079245,0.792005,0.810959
5,0.1789,1.10889,0.791425,0.808455


Epoch,Training Loss,Validation Loss,Aspect Macro F1,Sentiment Macro F1
1,1.4649,1.46995,0.68003,0.68863
2,0.8617,1.119455,0.757273,0.756471
3,0.4521,1.11,0.76309,0.804787
4,0.3288,1.079245,0.792005,0.810959
5,0.1789,1.10889,0.791425,0.808455


Fold 4 complete. Evaluation results: {'eval_loss': 1.0792453289031982, 'eval_aspect_macro_f1': 0.7920050461767786, 'eval_sentiment_macro_f1': 0.8109594559564317, 'eval_runtime': 1.7252, 'eval_samples_per_second': 235.91, 'eval_steps_per_second': 15.07, 'epoch': 5.0}
--- Running Fold 5/5 ---


Some weights of IndoBERT_MTL were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['aspect_classifier.bias', 'aspect_classifier.weight', 'sentiment_classifier.bias', 'sentiment_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Aspect Macro F1,Sentiment Macro F1
1,1.4166,1.347684,0.7051,0.717638
2,0.808,1.121005,0.771886,0.759877
3,0.4757,1.12164,0.770763,0.773453
4,0.2932,1.131756,0.782169,0.795301
5,0.1669,1.14046,0.779546,0.789071


Fold 5 complete. Evaluation results: {'eval_loss': 1.1317561864852905, 'eval_aspect_macro_f1': 0.78216862309502, 'eval_sentiment_macro_f1': 0.7953013589557779, 'eval_runtime': 1.6139, 'eval_samples_per_second': 252.187, 'eval_steps_per_second': 16.11, 'epoch': 5.0}

✅ 5-fold cross-validation process for MTL finished.


In [None]:
# Cell 8: Aggregate and Display Final Results

# Collect each task's per-fold macro-F1 from the evaluation dicts in a single pass
aspect_f1_scores = []
sentiment_f1_scores = []
for fold_metrics in fold_results:
    aspect_f1_scores.append(fold_metrics['eval_aspect_macro_f1'])
    sentiment_f1_scores.append(fold_metrics['eval_sentiment_macro_f1'])

# Aspect Detection: mean and standard deviation across folds
# (np.std with its default settings, i.e. the population standard deviation)
mean_aspect_f1, std_aspect_f1 = np.mean(aspect_f1_scores), np.std(aspect_f1_scores)

# Sentiment Classification: same two statistics
mean_sentiment_f1, std_sentiment_f1 = np.mean(sentiment_f1_scores), np.std(sentiment_f1_scores)

print("--- Final Aggregated Results (5-Fold CV) for MTL Model ---")
print(f"Aspect Detection Macro F1-Score = {mean_aspect_f1:.4f} ± {std_aspect_f1:.4f}")
print(f"Sentiment Classification Macro F1-Score = {mean_sentiment_f1:.4f} ± {std_sentiment_f1:.4f}")

--- Final Aggregated Results (5-Fold CV) for MTL Model ---
Aspect Detection Macro F1-Score = 0.7953 ± 0.0148
Sentiment Classification Macro F1-Score = 0.8002 ± 0.0245


In [None]:
# Cell 9: Save Results to Google Drive
# Destination file inside the dedicated MTL results folder
results_file_path = os.path.join(MTL_RESULTS_PATH, 'results_mtl.json')

# Assemble the summary record field by field (insertion order is the JSON key order):
# the model tag, then mean/std per task, then the raw per-fold evaluation dicts.
final_results = {'model': 'IndoBERT (MTL)'}
final_results['mean_aspect_macro_f1'] = mean_aspect_f1
final_results['std_dev_aspect_macro_f1'] = std_aspect_f1
final_results['mean_sentiment_macro_f1'] = mean_sentiment_f1
final_results['std_dev_sentiment_macro_f1'] = std_sentiment_f1
final_results['results_per_fold'] = fold_results

with open(results_file_path, 'w') as out_file:
    json.dump(final_results, out_file, indent=4)

print(f"\n✅ Final results for MTL have been saved to: '{results_file_path}'")


✅ Final results for MTL have been saved to: '/content/drive/MyDrive/eecsi_revise/mtl_results/results_mtl.json'
