In [1]:
import json
import torch
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SAVED_MODEL_NER = "../model/NER/deberta_ner_saved"
SAVED_MODEL_ABSA = "../model/absa/deberta_absa_saved"
ABSA_CSV_PATH = "absa_flat.csv"
TEST_TEXT_PATH = "../data/test_text.json"
OUTPUT_JSON_PATH = "ner_absa_predictions.json"
MAX_LENGTH = 256
FIXED_ASPECTS = [
    "acting",
    "plot",
    "characters",
    "cinematography",
    "script",
    "music",
    "ending",
]

In [3]:
ner_pipeline = pipeline(
    "token-classification",
    model=SAVED_MODEL_NER,
    tokenizer=SAVED_MODEL_NER,
    aggregation_strategy="simple"
)

Device set to use cpu


In [4]:
absa_tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_ABSA)
absa_model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL_ABSA)
absa_model.eval()


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [5]:
def build_id2fixed_aspects(csv_path):
    df = pd.read_csv(csv_path)
    allowed = {a.lower() for a in FIXED_ASPECTS}
    id2aspects = {}

    grouped = df.groupby("id")
    for review_id, group in grouped:
        aspek_list = []
        seen = set()
        for asp in group["aspect"]:
            asp_lower = str(asp).lower()
            if asp_lower in allowed and asp_lower not in seen:
                aspek_list.append(asp)
                seen.add(asp_lower)
        id2aspects[review_id] = aspek_list

    return id2aspects

ID2FIXED_ASPECTS = build_id2fixed_aspects(ABSA_CSV_PATH)

def load_test_data(text_file):
    with open(text_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {item["id"]: item["text"] for item in data["reviews"]}

# ================== NER + ABSA ==================

def extract_actors(ner_results):
    actors = [
        ent["word"]
        for ent in ner_results
        if ent["entity_group"].upper() == "ACTOR"
    ]
    seen = set()
    uniq = []
    for a in actors:
        if a not in seen:
            uniq.append(a)
            seen.add(a)
    return uniq

def extract_fixed_aspects(review_id):
    return ID2FIXED_ASPECTS.get(review_id, [])

def predict_absa(text, aspect):
    encoded = absa_tokenizer(
        aspect,
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    with torch.no_grad():
        output = absa_model(**encoded)
        pred_id = torch.argmax(output.logits, dim=-1).item()

    return absa_model.config.id2label[pred_id]

def normalize_ner_results(ner_results):
    """Convert seluruh nilai hasil NER ke tipe Python native."""
    normalized = []
    for ent in ner_results:
        new_ent = {}
        for k, v in ent.items():
            # convert numpy types → python native
            if hasattr(v, "item"):  
                v = v.item()
            new_ent[k] = v
        normalized.append(new_ent)
    return normalized

def run_ner_absa(review_id, text):
    ner_results = ner_pipeline(text)
    ner_results = normalize_ner_results(ner_results)   # <--- ADD THIS

    actors = extract_actors(ner_results)
    fixed_aspects = extract_fixed_aspects(review_id)
    all_aspects = actors + fixed_aspects

    absa_outputs = [
        {
            "aspect": aspect,
            "sentiment": predict_absa(text, aspect)
        }
        for aspect in all_aspects
    ]

    return {
        "id": review_id,
        "text": text,
        "ner_results": ner_results,
        "actors": actors,
        "fixed_aspects": fixed_aspects,
        "absa_predictions": absa_outputs,
    }


In [None]:
if __name__ == "__main__":
    test_texts = load_test_data(TEST_TEXT_PATH)

    all_outputs_for_json = []  # list of dict tanpa text

    print("===== Combined NER + ABSA (Test Set) =====\n")

    for review_id, text in test_texts.items():
        result = run_ner_absa(review_id, text)

        # versi bersih tanpa text untuk print & simpan
        clean_result = {
            "id": result["id"],
            "ner_results": result["ner_results"],
            "actors": result["actors"],
            "fixed_aspects": result["fixed_aspects"],
            "absa_predictions": result["absa_predictions"],
        }

        # pretty print
        print(f"ID: {review_id}")
        print(json.dumps(clean_result, indent=2, ensure_ascii=False))
        print("-" * 60)

        all_outputs_for_json.append(clean_result)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


===== Combined NER + ABSA (Test Set) =====

ID: pos_601
{
  "id": "pos_601",
  "ner_results": [
    {
      "entity_group": "ACTOR",
      "score": 0.9850327372550964,
      "word": "Michael Pollard",
      "start": 279,
      "end": 295
    }
  ],
  "actors": [
    "Michael Pollard"
  ],
  "fixed_aspects": [
    "plot",
    "cinematography",
    "acting"
  ],
  "absa_predictions": [
    {
      "aspect": "Michael Pollard",
      "sentiment": "positive"
    },
    {
      "aspect": "plot",
      "sentiment": "negative"
    },
    {
      "aspect": "cinematography",
      "sentiment": "positive"
    },
    {
      "aspect": "acting",
      "sentiment": "positive"
    }
  ]
}
------------------------------------------------------------
ID: pos_602
{
  "id": "pos_602",
  "ner_results": [
    {
      "entity_group": "DIRECTOR",
      "score": 0.8917977809906006,
      "word": "Carlos Saura",
      "start": 237,
      "end": 250
    },
    {
      "entity_group": "MOVIE",
      "score": 0.7

NameError: name 'os' is not defined

In [None]:
with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(all_outputs_for_json, f, ensure_ascii=False, indent=2)

print(f"\nSaved JSON predictions to: {OUTPUT_JSON_PATH}")


Saved JSON predictions to: ner_absa_predictions.json


# Evaluate

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
def evaluate_absa_from_csv(csv_path):
    """
    Evaluasi model ABSA menggunakan file absa_flat.csv
    (kolom: id, text, aspect, sentimen).
    Output: classification_report + confusion_matrix.
    """
    df = pd.read_csv(csv_path)

    label2id = absa_model.config.label2id
    id2label = {v: k for k, v in label2id.items()}
    labels_order = sorted(label2id, key=lambda k: label2id[k])

    y_true = []
    y_pred = []

    for _, row in df.iterrows():
        text = row["text"]
        aspect = row["aspect"]
        gold_label_str = row["sentimen"]

        if gold_label_str not in label2id:
            continue

        gold_id = label2id[gold_label_str]
        pred_label_str = predict_absa(text, aspect)

        if pred_label_str not in label2id:
            continue

        pred_id = label2id[pred_label_str]

        y_true.append(gold_id)
        y_pred.append(pred_id)

    print("\n===== ABSA – Classification Report =====")
    print(classification_report(
        y_true,
        y_pred,
        target_names=labels_order,
        digits=4
    ))

    print("\n===== ABSA – Confusion Matrix (rows = true, cols = pred) =====")
    cm = confusion_matrix(y_true, y_pred)
    print("Label order:", labels_order)
    print(cm)


In [14]:
evaluate_absa_from_csv(ABSA_CSV_PATH)

KeyboardInterrupt: 