In [1]:
# Cell 1: Setup and Initialization
import os
import json
import pickle
import pandas as pd
import numpy as np
import torch
from torch import nn
from google.colab import drive
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    BertPreTrainedModel,
    BertModel
)
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.model_selection import StratifiedKFold

# Mount Google Drive so the project folder below is reachable from this Colab session.
drive.mount('/content/drive')

# --- KEY SETTINGS ---
# SEED is the random_state handed to StratifiedKFold when the ASC folds are rebuilt.
SEED = 42
# Root folder that holds the dataset, the pickled splits/predictions and the
# per-fold training results read by the following cells.
GDRIVE_PATH = '/content/drive/MyDrive/eecsi_revise/'

print(f"✅ Setup complete. Working inside folder: {GDRIVE_PATH}")

Mounted at /content/drive
✅ Setup complete. Working inside folder: /content/drive/MyDrive/eecsi_revise/


In [2]:
# Cell 2: Load Data and Initial ACD Predictions
#
# Reads the three artefacts every later cell depends on:
#   df              - the labelled dataset (CSV)
#   kfold_splits    - the pickled 5-fold split definitions
#   all_predictions - pickled predictions from the ACD stage; later cells read
#                     the 'y_true' and 'indobertweet_pipeline_acd' entries

print("--- Loading base data and pre-generated ACD predictions ---")

# Resolve every input path first so the try-block only contains the actual reads.
dataset_csv_path = os.path.join(GDRIVE_PATH, 'final_golden_dataset_eecsi.csv')
kfold_splits_path = os.path.join(GDRIVE_PATH, 'kfold_splits.pkl')
predictions_path = os.path.join(GDRIVE_PATH, 'all_model_predictions.pkl')

try:
    # Main dataframe
    df = pd.read_csv(dataset_csv_path)

    # NOTE: pickle.load executes arbitrary code from the file; only load
    # pickles that were produced by this project.
    with open(kfold_splits_path, 'rb') as f:
        kfold_splits = pickle.load(f)

    with open(predictions_path, 'rb') as f:
        all_predictions = pickle.load(f)

    print("✅ All necessary files loaded successfully.")
    print(f"Dataset loaded with {len(df)} rows.")

except FileNotFoundError as e:
    # Report which file is missing, then re-raise so the notebook stops here
    # instead of failing later with a NameError.
    print(f"❌ ERROR: A required file was not found. Details: {e}")
    raise

--- Loading base data and pre-generated ACD predictions ---
✅ All necessary files loaded successfully.
Dataset loaded with 3030 rows.


In [4]:
# Cell 3: Generate ASC Predictions for the Best Pipeline Model (REVISED)

class PredictionDataset(torch.utils.data.Dataset):
    """Label-free dataset that serves tokenizer output one example at a time.

    `encodings` must provide `.items()` (field name -> per-example values) and
    an `input_ids` attribute, whose length defines the dataset size.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one tensor per encoded field for the requested example.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

print("\n--- Generating predictions for the PIPELINE's ASC stage... ---")
ASC_MODEL_NAME = "indolem/indobertweet-base-uncased"
ASC_RESULTS_PATH = os.path.join(GDRIVE_PATH, 'indobertweet_asc_results/')
TEXT_COLUMN = 'full_text' if 'full_text' in df.columns else 'cleaned_text'


def _list_asc_checkpoints(fold_dir):
    """Return the names of the `checkpoint-<step>` directories in `fold_dir`, ordered by step."""
    names = [
        d for d in os.listdir(fold_dir)
        if d.startswith('checkpoint-')
        and d.split('-')[-1].isdigit()
        and os.path.isdir(os.path.join(fold_dir, d))
    ]
    return sorted(names, key=lambda name: int(name.split('-')[-1]))


def _read_best_asc_checkpoint(state_path, fold_dir):
    """Return the best checkpoint recorded in a trainer_state.json as an existing directory.

    The recorded path is first rebased onto `fold_dir` (it was written on the
    training run and may not be valid here); the recorded path itself is used
    only if the rebased one does not exist.

    Raises:
        FileNotFoundError: the state file or the checkpoint directory is missing.
        KeyError: 'best_model_checkpoint' is absent or empty/null.
    """
    with open(state_path, 'r') as f:
        state = json.load(f)
    saved_best_path = state['best_model_checkpoint']
    if not saved_best_path:
        # A null entry would otherwise surface as a TypeError in os.path.basename.
        raise KeyError(f"'best_model_checkpoint' is empty in {state_path}")
    rebased_path = os.path.join(fold_dir, os.path.basename(os.path.normpath(saved_best_path)))
    for candidate in (rebased_path, saved_best_path):
        if os.path.isdir(candidate):
            return candidate
    raise FileNotFoundError(f"Best checkpoint '{saved_best_path}' not found under {fold_dir}")


def _resolve_asc_checkpoint(fold_dir):
    """Pick the checkpoint directory to load for one fold.

    Order of preference:
      1. best checkpoint named by `<fold_dir>/trainer_state.json`
      2. best checkpoint named by the trainer_state.json inside the latest
         `checkpoint-<step>` directory
      3. the latest `checkpoint-<step>` directory itself (a warning is printed)

    Raises:
        FileNotFoundError: `fold_dir` contains no checkpoint directory at all.
    """
    try:
        best = _read_best_asc_checkpoint(os.path.join(fold_dir, 'trainer_state.json'), fold_dir)
        print(f"    ✅ Found best model via trainer_state.json in root: {os.path.basename(os.path.normpath(best))}")
        return best
    except (FileNotFoundError, KeyError):
        pass  # no usable state file at the fold root; try the nested layout next

    checkpoints = _list_asc_checkpoints(fold_dir)
    if not checkpoints:
        raise FileNotFoundError(f"No 'checkpoint-<step>' directories found in {fold_dir}")
    latest_checkpoint_dir = checkpoints[-1]
    try:
        best = _read_best_asc_checkpoint(
            os.path.join(fold_dir, latest_checkpoint_dir, 'trainer_state.json'), fold_dir
        )
        print(f"    ✅ Found best model via nested trainer_state.json: {os.path.basename(os.path.normpath(best))}")
        return best
    except Exception as e:
        # Deliberate best-effort: any problem with the state file degrades to
        # "use the most recent checkpoint" instead of aborting the run.
        print(f"    ⚠️ Warning: Could not determine best model from any trainer_state.json (Reason: {e}). Using latest checkpoint as final fallback.")
        return os.path.join(fold_dir, latest_checkpoint_dir)


# ASC is evaluated only on rows whose gold aspect is relevant.
relevant_df = df[df['aspect'] != 'Irrelevant'].copy()
# NOTE(review): the id<->label mapping is rebuilt from the sorted label names and
# is assumed to match the mapping used when the ASC checkpoints were trained — confirm.
sentiment_labels_list = sorted(relevant_df['sentiment'].unique())
s_label2id = {l: i for i, l in enumerate(sentiment_labels_list)}
s_id2label = {i: l for i, l in enumerate(sentiment_labels_list)}

tokenizer_asc = AutoTokenizer.from_pretrained(ASC_MODEL_NAME)
# One slot per relevant row, filled fold by fold with the predicted class id.
asc_predictions = pd.Series([None] * len(relevant_df), index=relevant_df.index)

# NOTE(review): the folds are regenerated here and are assumed to be identical to
# the folds used for training (same rows, same order, same SEED) — confirm;
# otherwise a fold's test rows may have been seen by that fold's checkpoint.
skf_asc = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for i, (train_index, test_index) in enumerate(skf_asc.split(relevant_df[TEXT_COLUMN], relevant_df['sentiment'])):
    fold_num = i + 1
    print(f"  Processing Pipeline ASC Fold {fold_num}/5...")
    fold_dir = os.path.join(ASC_RESULTS_PATH, f'fold_{fold_num}')
    best_checkpoint_path = _resolve_asc_checkpoint(fold_dir)

    model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint_path)
    if model.config.num_labels != len(s_id2label):
        # A size mismatch would make .map(s_id2label) below emit wrong or NaN labels.
        raise ValueError(
            f"Checkpoint {best_checkpoint_path} has {model.config.num_labels} labels, "
            f"but the data has {len(s_id2label)} sentiment classes."
        )
    trainer = Trainer(model=model)
    test_data = relevant_df.iloc[test_index]
    test_encodings = tokenizer_asc(list(test_data[TEXT_COLUMN]), truncation=True, padding=True, max_length=128)
    prediction_dataset = PredictionDataset(test_encodings)
    predictions = trainer.predict(prediction_dataset)
    predicted_labels_int = np.argmax(predictions.predictions, axis=1)
    # test_index is positional within relevant_df, hence .iloc.
    asc_predictions.iloc[test_index] = predicted_labels_int

# The test folds partition relevant_df, so every slot must be filled by now.
assert asc_predictions.notna().all(), "Some relevant rows did not receive an ASC prediction."

final_asc_preds = asc_predictions.map(s_id2label)
print("\n✅ Pipeline ASC prediction generation complete.")


--- Generating predictions for the PIPELINE's ASC stage... ---
  Processing Pipeline ASC Fold 1/5...
    ✅ Found best model via nested trainer_state.json: checkpoint-411


  Processing Pipeline ASC Fold 2/5...
    ✅ Found best model via nested trainer_state.json: checkpoint-685


  Processing Pipeline ASC Fold 3/5...
    ✅ Found best model via nested trainer_state.json: checkpoint-274


  Processing Pipeline ASC Fold 4/5...
    ✅ Found best model via nested trainer_state.json: checkpoint-685


  Processing Pipeline ASC Fold 5/5...
    ✅ Found best model via nested trainer_state.json: checkpoint-685



✅ Pipeline ASC prediction generation complete.


In [5]:
# Cell 4: Create Initial Analysis DataFrame
#
# Builds `df_analysis`, one row per example, holding the gold labels next to
# the pipeline's aspect (ACD) and sentiment (ASC) predictions.

print("--- Combining all data into a single analysis DataFrame ---")
df_analysis = pd.DataFrame({
    'true_aspect': all_predictions['y_true'],
    'pred_aspect_pipeline': all_predictions['indobertweet_pipeline_acd']
})

# The columns added below are joined to the pickled predictions purely by index
# label. Verify that the pickled gold aspects agree with df['aspect'] on that
# index; a disagreement means the rows are not in the same order and every
# comparison made in later cells would be between unrelated examples.
reference_aspect = df['aspect'].reindex(df_analysis.index).astype(str).str.strip()
stored_aspect = df_analysis['true_aspect'].astype(str).str.strip()
n_misaligned = int((reference_aspect.to_numpy() != stored_aspect.to_numpy()).sum())
if n_misaligned:
    print(f"⚠️ Warning: {n_misaligned} of {len(df_analysis)} rows have a 'y_true' aspect that differs "
          f"from df['aspect'] at the same index. Check row order before trusting the results.")

# Add the true sentiment and the newly generated pipeline ASC predictions.
# Both are aligned on the index; final_asc_preds only covers relevant rows, so
# rows with an 'Irrelevant' gold aspect stay NaN in pred_sentiment_pipeline.
df_analysis['true_sentiment'] = df['sentiment']
df_analysis['pred_sentiment_pipeline'] = final_asc_preds

print("✅ Initial df_analysis created with ground truth and pipeline predictions.")
display(df_analysis.head())

--- Combining all data into a single analysis DataFrame ---
✅ Initial df_analysis created with ground truth and pipeline predictions.


Unnamed: 0,true_aspect,pred_aspect_pipeline,true_sentiment,pred_sentiment_pipeline
0,Irrelevant,Irrelevant,,
1,Irrelevant,Smart Economy,,
2,Irrelevant,Irrelevant,,
3,Irrelevant,Irrelevant,,
4,Irrelevant,Irrelevant,,


In [13]:
# Cell 5: Generate End-to-End Predictions for the Best Joint Model (MTL) - FINAL CORRECTED VERSION

import torch
from torch import nn
from transformers import BertPreTrainedModel, BertModel, AutoTokenizer, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
import pandas as pd
import numpy as np
import os
import json

print("\n--- Generating End-to-End predictions for the Joint (MTL) model ---")

# --- Full class definitions must be included ---
# The custom architecture has to be defined in the notebook before
# `IndoBERT_MTL.from_pretrained(...)` can rebuild it from a saved checkpoint.
class IndoBERT_MTL(BertPreTrainedModel):
    """BERT encoder with two linear heads: aspect and sentiment classification.

    Both heads read the same dropout-regularised pooled encoder output. The
    head sizes come from `config.num_aspect_labels` and
    `config.num_sentiment_labels`.
    """

    def __init__(self, config):
        super().__init__(config)
        # Attribute names are part of the checkpoint's state-dict keys; do not rename.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.aspect_classifier = nn.Linear(config.hidden_size, config.num_aspect_labels)
        self.sentiment_classifier = nn.Linear(config.hidden_size, config.num_sentiment_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, aspect_labels=None, sentiment_labels=None, return_dict=None, **kwargs):
        """Run the encoder and both heads.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the BERT encoder.
            aspect_labels: optional gold aspect ids; when given, a loss is returned.
            sentiment_labels: optional gold sentiment ids; added to the loss when
                given together with `aspect_labels`.
            return_dict: accepted for signature compatibility. The result is
                always a `SequenceClassifierOutput`.
            **kwargs: ignored.

        Returns:
            SequenceClassifierOutput whose `logits` is the tuple
            `(aspect_logits, sentiment_logits)` and whose `loss` is the sum of
            the cross-entropy losses, or None when `aspect_labels` is None.
        """
        # Always ask the encoder for a ModelOutput: the code below reads
        # `.hidden_states` / `.attentions`, which a plain tuple
        # (return_dict=False) does not have.
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=True)
        pooled_output = self.dropout(outputs.pooler_output)
        aspect_logits = self.aspect_classifier(pooled_output)
        sentiment_logits = self.sentiment_classifier(pooled_output)

        total_loss = None
        if aspect_labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            total_loss = loss_fct(aspect_logits.view(-1, self.config.num_aspect_labels), aspect_labels.view(-1))
            if sentiment_labels is not None:
                # Guarded so that passing aspect labels alone no longer fails on None.view(...).
                total_loss = total_loss + loss_fct(sentiment_logits.view(-1, self.config.num_sentiment_labels), sentiment_labels.view(-1))

        return SequenceClassifierOutput(
            loss=total_loss,
            logits=(aspect_logits, sentiment_logits),
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

class PredictionDataset(torch.utils.data.Dataset):
    """Label-free dataset that serves tokenizer output one example at a time.

    `encodings` must provide `.items()` (field name -> per-example values) and
    an `input_ids` attribute, whose length defines the dataset size.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one tensor per encoded field for the requested example.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

# --- Setup for MTL Prediction ---
MTL_MODEL_NAME = "indobenchmark/indobert-base-p1"
MTL_RESULTS_PATH = os.path.join(GDRIVE_PATH, 'mtl_indobert_results/')
TEXT_COLUMN = 'full_text' if 'full_text' in df.columns else 'cleaned_text'

# --- [!!! ACTION REQUIRED !!!] ---
# The aspect id->label dictionary below MUST be the one used by the MTL training
# script. Do NOT rebuild it dynamically from sorted(df['aspect'].unique()): that
# dynamic mapping is what caused the earlier error.
# The values below are only an example; replace them with the mapping from
# your own training script.
a_id2label_correct = {
    0: 'Smart Economy', 1: 'Smart Environment', 2: 'Smart Governance',
    3: 'Smart Living', 4: 'Smart Mobility', 5: 'Smart People'
}

print("Correct Aspect Label Mapping has been created:")
print(a_id2label_correct)
# --- [END OF THE SECTION THAT NEEDS TO BE EDITED] ---


# Sentiment mapping can remain dynamic as it was working correctly
sentiment_labels_list_all = sorted(df[df['aspect'] != 'Irrelevant']['sentiment'].unique())
s_id2label_all = {i: l for i, l in enumerate(sentiment_labels_list_all)}


def _resolve_mtl_checkpoint(fold_dir):
    """Pick the checkpoint directory to load for one MTL fold.

    Reads the trainer_state.json stored in the latest `checkpoint-<step>`
    directory and returns the best checkpoint it names, rebased onto
    `fold_dir`. If that cannot be determined, the latest checkpoint is
    returned and a warning is printed.

    Raises:
        FileNotFoundError: `fold_dir` contains no checkpoint directory at all.
    """
    possible_checkpoints = sorted(
        (
            d for d in os.listdir(fold_dir)
            if d.startswith('checkpoint-')
            and d.split('-')[-1].isdigit()
            and os.path.isdir(os.path.join(fold_dir, d))
        ),
        key=lambda name: int(name.split('-')[-1]),
    )
    if not possible_checkpoints:
        raise FileNotFoundError(f"No 'checkpoint-<step>' directories found in {fold_dir}")
    latest_checkpoint_dir = possible_checkpoints[-1]
    try:
        state_path = os.path.join(fold_dir, latest_checkpoint_dir, 'trainer_state.json')
        with open(state_path, 'r') as f:
            state = json.load(f)
        saved_best_path = state['best_model_checkpoint']
        # The stored path was written on the training run; keep only its last
        # component and look for it inside this fold's directory.
        best_checkpoint_name = os.path.basename(saved_best_path)
        best_checkpoint_path = os.path.join(fold_dir, best_checkpoint_name)
        if not best_checkpoint_name or not os.path.isdir(best_checkpoint_path):
            raise FileNotFoundError(f"best checkpoint '{saved_best_path}' not found in {fold_dir}")
        print(f"    ✅ Found best model via trainer_state.json: {os.path.basename(best_checkpoint_path)}")
        return best_checkpoint_path
    except Exception as e:
        # Deliberate best-effort: degrade to the most recent checkpoint.
        print(f"    ⚠️ Warning: Could not determine best model from trainer_state.json (Reason: {e}). Using latest checkpoint as fallback.")
        return os.path.join(fold_dir, latest_checkpoint_dir)


tokenizer_mtl = AutoTokenizer.from_pretrained(MTL_MODEL_NAME)
# One slot per row of df, filled fold by fold with the predicted class ids.
mtl_aspect_preds = pd.Series([None] * len(df), index=df.index)
mtl_sentiment_preds = pd.Series([None] * len(df), index=df.index)

# NOTE(review): assumes kfold_splits holds positional row indices into df and is
# the same split used when the MTL checkpoints were trained — confirm; otherwise
# a fold's test rows may have been seen by that fold's checkpoint.
for i, fold_dict in enumerate(kfold_splits):
    fold_num = i + 1
    print(f"  Processing MTL Fold {fold_num}/{len(kfold_splits)}...")
    test_index = fold_dict['test']
    fold_dir = os.path.join(MTL_RESULTS_PATH, f'fold_{fold_num}')
    best_checkpoint_path = _resolve_mtl_checkpoint(fold_dir)

    model_mtl = IndoBERT_MTL.from_pretrained(best_checkpoint_path)
    # The id->label dictionaries are maintained by hand / rebuilt from the data.
    # Refuse to continue when their sizes disagree with the checkpoint's heads,
    # because .map() below would silently produce wrong or NaN labels.
    if model_mtl.config.num_aspect_labels != len(a_id2label_correct):
        raise ValueError(
            f"Checkpoint {best_checkpoint_path} has {model_mtl.config.num_aspect_labels} aspect labels, "
            f"but a_id2label_correct defines {len(a_id2label_correct)}."
        )
    if model_mtl.config.num_sentiment_labels != len(s_id2label_all):
        raise ValueError(
            f"Checkpoint {best_checkpoint_path} has {model_mtl.config.num_sentiment_labels} sentiment labels, "
            f"but the data has {len(s_id2label_all)} sentiment classes."
        )

    trainer_mtl = Trainer(model=model_mtl)
    test_data = df.iloc[test_index]
    test_encodings = tokenizer_mtl(list(test_data[TEXT_COLUMN]), truncation=True, padding=True, max_length=128)
    prediction_dataset = PredictionDataset(test_encodings)
    predictions = trainer_mtl.predict(prediction_dataset)
    # The model returns its logits as the pair (aspect, sentiment).
    aspect_logits, sentiment_logits = predictions.predictions
    predicted_aspects_int = np.argmax(aspect_logits, axis=1)
    predicted_sentiments_int = np.argmax(sentiment_logits, axis=1)
    mtl_aspect_preds.iloc[test_index] = predicted_aspects_int
    mtl_sentiment_preds.iloc[test_index] = predicted_sentiments_int

n_unpredicted = int(mtl_aspect_preds.isna().sum())
if n_unpredicted:
    print(f"⚠️ Warning: {n_unpredicted} rows of df are not part of any test fold and have no MTL prediction.")

# --- Make sure the mapping uses the correct dictionaries ---
final_mtl_aspect_preds = mtl_aspect_preds.map(a_id2label_correct)
final_mtl_sentiment_preds = mtl_sentiment_preds.map(s_id2label_all)

# Aligned with df_analysis on the index.
df_analysis['pred_aspect_mtl'] = final_mtl_aspect_preds
df_analysis['pred_sentiment_mtl'] = final_mtl_sentiment_preds

print("\n✅ MTL end-to-end prediction generation complete.")


--- Generating End-to-End predictions for the Joint (MTL) model ---
Correct Aspect Label Mapping has been created:
{0: 'Smart Economy', 1: 'Smart Environment', 2: 'Smart Governance', 3: 'Smart Living', 4: 'Smart Mobility', 5: 'Smart People'}
  Processing MTL Fold 1/5...
    ✅ Found best model via trainer_state.json: checkpoint-510


  Processing MTL Fold 2/5...
    ✅ Found best model via trainer_state.json: checkpoint-408


  Processing MTL Fold 3/5...
    ✅ Found best model via trainer_state.json: checkpoint-408


  Processing MTL Fold 4/5...
    ✅ Found best model via trainer_state.json: checkpoint-408


  Processing MTL Fold 5/5...
    ✅ Found best model via trainer_state.json: checkpoint-408



✅ MTL end-to-end prediction generation complete.


In [14]:
# Cell 6: Final Deep-Dive Investigation and Analysis
#
# Compares the joint (MTL) model with the two-stage pipeline on the rows whose
# gold aspect is not 'Irrelevant': per-task accuracy, end-to-end accuracy
# (aspect AND sentiment both right), and how often the pipeline's sentiment is
# right on rows where its aspect is wrong.

print("--- Starting Deep-Dive Investigation of the Anomaly ---")

# STEP 1: Isolate and clean data
print("\n[STEP 1] Isolating and Cleaning Data...")
has_relevant_aspect = df_analysis['true_aspect'] != 'Irrelevant'
df_investigation = df_analysis[has_relevant_aspect].copy()
# Strip surrounding whitespace so label comparisons are not defeated by padding.
for raw_column in (
    'true_aspect', 'true_sentiment',
    'pred_aspect_mtl', 'pred_sentiment_mtl',
    'pred_aspect_pipeline', 'pred_sentiment_pipeline',
):
    df_investigation[f'{raw_column}_cleaned'] = df_investigation[raw_column].str.strip()
print("✅ All ground truth and prediction columns have been cleaned.")

# STEP 2: Perform separate comparisons
print("\n[STEP 2] Performing Separate Task Comparisons...")
# A missing (NaN) prediction compares unequal and is therefore counted as wrong.
for system in ('mtl', 'pipeline'):
    for task in ('aspect', 'sentiment'):
        predicted = df_investigation[f'pred_{task}_{system}_cleaned']
        expected = df_investigation[f'true_{task}_cleaned']
        df_investigation[f'debug_{system}_{task}_match'] = (predicted == expected)
print("✅ Debug columns created.")

# STEP 3: Calculate per-task accuracy
print("\n--- [DIAGNOSIS] Per-Task Accuracy ---")
mtl_aspect_acc = df_investigation['debug_mtl_aspect_match'].mean()
mtl_sentiment_acc = df_investigation['debug_mtl_sentiment_match'].mean()
pipeline_aspect_acc = df_investigation['debug_pipeline_aspect_match'].mean()
pipeline_sentiment_acc = df_investigation['debug_pipeline_sentiment_match'].mean()
print(f"MTL Model -> Aspect Accuracy:      {mtl_aspect_acc:.2%}")
print(f"MTL Model -> Sentiment Accuracy:   {mtl_sentiment_acc:.2%}")
print("-" * 30)
print(f"Pipeline Model -> Aspect Accuracy:   {pipeline_aspect_acc:.2%}")
print(f"Pipeline Model -> Sentiment Accuracy:{pipeline_sentiment_acc:.2%}")

# STEP 4: Recalculate Final End-to-End Accuracy & 'Lucky Guesses'
print("\n--- [FINAL CALCULATION] End-to-End Accuracy ---")
# A row is end-to-end correct only when both tasks are right.
mtl_both_right = df_investigation['debug_mtl_aspect_match'] & df_investigation['debug_mtl_sentiment_match']
pipeline_both_right = df_investigation['debug_pipeline_aspect_match'] & df_investigation['debug_pipeline_sentiment_match']
df_investigation['e2e_correct_mtl'] = mtl_both_right
df_investigation['e2e_correct_pipeline'] = pipeline_both_right
e2e_accuracy_mtl = df_investigation['e2e_correct_mtl'].mean()
e2e_accuracy_pipeline = df_investigation['e2e_correct_pipeline'].mean()
performance_drop = e2e_accuracy_mtl - e2e_accuracy_pipeline
print(f"  - Joint Model (IndoBERT MTL):      {e2e_accuracy_mtl:.2%}")
print(f"  - Pipeline Model (IndoBERTweet):   {e2e_accuracy_pipeline:.2%}")
print(f"  --------------------------------------------------")
print(f"  ==> Performance Drop of Pipeline vs. Joint Model: {performance_drop:.2%}")

print("\n--- [FINAL CALCULATION] Pipeline's 'Lucky Guesses' ---")
# Per-stage correctness of the pipeline; same values as the debug_pipeline_* columns.
df_investigation['acd_correct'] = df_investigation['debug_pipeline_aspect_match']
df_investigation['asc_correct'] = df_investigation['debug_pipeline_sentiment_match']
# Rows where the pipeline's aspect stage was wrong.
acd_wrong_df = df_investigation[~df_investigation['acd_correct']]
lucky_guesses_count = acd_wrong_df['asc_correct'].sum()
total_acd_errors = len(acd_wrong_df)
if total_acd_errors == 0:
    print("No ACD errors were found to analyze.")
else:
    lucky_guess_percentage = (lucky_guesses_count / total_acd_errors)
    print(f"  ==> Percentage of 'lucky guesses' (correct ASC despite wrong ACD): {lucky_guess_percentage:.2%}")

--- Starting Deep-Dive Investigation of the Anomaly ---

[STEP 1] Isolating and Cleaning Data...
✅ All ground truth and prediction columns have been cleaned.

[STEP 2] Performing Separate Task Comparisons...
✅ Debug columns created.

--- [DIAGNOSIS] Per-Task Accuracy ---
MTL Model -> Aspect Accuracy:      95.48%
MTL Model -> Sentiment Accuracy:   94.85%
------------------------------
Pipeline Model -> Aspect Accuracy:   76.98%
Pipeline Model -> Sentiment Accuracy:83.75%

--- [FINAL CALCULATION] End-to-End Accuracy ---
  - Joint Model (IndoBERT MTL):      91.26%
  - Pipeline Model (IndoBERTweet):   64.56%
  --------------------------------------------------
  ==> Performance Drop of Pipeline vs. Joint Model: 26.71%

--- [FINAL CALCULATION] Pipeline's 'Lucky Guesses' ---
  ==> Percentage of 'lucky guesses' (correct ASC despite wrong ACD): 83.37%
