In [8]:
import json
import torch
import joblib
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
SAVED_MODEL_NER = "../model/NER/deberta_ner_saved"
SAVED_MODEL_ABSA = "../model/absa/deberta_absa_saved"
SAVED_MODEL_EMOTION = "../model/emotion/saved_classical_model"
ABSA_CSV_PATH = "absa_flat.csv"
TEST_TEXT_PATH = "../data/test_text.json"
OUTPUT_JSON_PATH = "predictions.json"
MAX_LENGTH = 256
FIXED_ASPECTS = [
    "acting",
    "plot",
    "characters",
    "cinematography",
    "script",
    "music",
    "ending",
]
EMOTION_LABELS = ["disgust", "anger", "sadness", "joy", "neutral", "fear", "surprise"]

In [10]:
ner_pipeline = pipeline(
    "token-classification",
    model=SAVED_MODEL_NER,
    tokenizer=SAVED_MODEL_NER,
    aggregation_strategy="simple"
)

Device set to use cuda:0


In [11]:
absa_tokenizer = AutoTokenizer.from_pretrained(SAVED_MODEL_ABSA)
absa_model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL_ABSA)
absa_model.eval()


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [12]:
emotion_model_path = f"{SAVED_MODEL_EMOTION}/classical_model.pkl"
emotion_model = joblib.load(emotion_model_path)
print("Emotion model (SVM) loaded successfully.")

Emotion model (SVM) loaded successfully.


In [13]:
def build_id2fixed_aspects(csv_path):
    df = pd.read_csv(csv_path)
    allowed = {a.lower() for a in FIXED_ASPECTS}
    id2aspects = {}

    grouped = df.groupby("id")
    for review_id, group in grouped:
        aspek_list = []
        seen = set()
        for asp in group["aspect"]:
            asp_lower = str(asp).lower()
            if asp_lower in allowed and asp_lower not in seen:
                aspek_list.append(asp)
                seen.add(asp_lower)
        id2aspects[review_id] = aspek_list

    return id2aspects

ID2FIXED_ASPECTS = build_id2fixed_aspects(ABSA_CSV_PATH)

def load_test_data(text_file):
    with open(text_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {item["id"]: item["text"] for item in data["reviews"]}

# ================== EMOTION CLASSIFICATION ==================

def predict_emotion(text):
    """Predicts emotion label using the SVM model."""
    pred_id = emotion_model.predict([text])[0]
    return EMOTION_LABELS[pred_id]

# ================== NER + ABSA ==================

def extract_actors(ner_results):
    actors = [
        ent["word"]
        for ent in ner_results
        if ent["entity_group"].upper() == "ACTOR"
    ]
    seen = set()
    uniq = []
    for a in actors:
        if a not in seen:
            uniq.append(a)
            seen.add(a)
    return uniq

def extract_fixed_aspects(review_id):
    return ID2FIXED_ASPECTS.get(review_id, [])

def predict_absa(text, aspect):
    encoded = absa_tokenizer(
        aspect,
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    with torch.no_grad():
        output = absa_model(**encoded)
        pred_id = torch.argmax(output.logits, dim=-1).item()

    return absa_model.config.id2label[pred_id]

def normalize_ner_results(ner_results):
    """Convert seluruh nilai hasil NER ke tipe Python native."""
    normalized = []
    for ent in ner_results:
        new_ent = {}
        for k, v in ent.items():
            # convert numpy types → python native
            if hasattr(v, "item"):
                v = v.item()
            new_ent[k] = v
        normalized.append(new_ent)
    return normalized

def run_ner_absa(review_id, text):
    ner_results = ner_pipeline(text)
    ner_results = normalize_ner_results(ner_results)

    actors = extract_actors(ner_results)
    fixed_aspects = extract_fixed_aspects(review_id)
    all_aspects = actors + fixed_aspects

    absa_outputs = [
        {
            "aspect": aspect,
            "sentiment": predict_absa(text, aspect)
        }
        for aspect in all_aspects
    ]

    emotion_label = predict_emotion(text)

    return {
        "id": review_id,
        "text": text,
        "emotion": emotion_label,
        "ner_results": ner_results,
        "actors": actors,
        "fixed_aspects": fixed_aspects,
        "absa_predictions": absa_outputs,
    }


In [14]:
if __name__ == "__main__":
    test_texts = load_test_data(TEST_TEXT_PATH)

    all_outputs_for_json = []

    print("===== Combined NER + ABSA + Emotion (Test Set) =====\n")

    for review_id, text in test_texts.items():
        result = run_ner_absa(review_id, text)
        clean_result = {
            "id": result["id"],
            "emotion": result["emotion"],
            "ner_results": result["ner_results"],
            "actors": result["actors"],
            "fixed_aspects": result["fixed_aspects"],
            "absa_predictions": result["absa_predictions"],
        }

        print(f"ID: {review_id}")
        print(json.dumps(clean_result, indent=2, ensure_ascii=False))
        print("-" * 60)

        all_outputs_for_json.append(clean_result)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


===== Combined NER + ABSA + Emotion (Test Set) =====



ID: pos_601
{
  "id": "pos_601",
  "emotion": "joy",
  "ner_results": [
    {
      "entity_group": "ACTOR",
      "score": 0.9911526441574097,
      "word": "Michael Pollard",
      "start": 279,
      "end": 295
    }
  ],
  "actors": [
    "Michael Pollard"
  ],
  "fixed_aspects": [
    "plot",
    "cinematography",
    "acting"
  ],
  "absa_predictions": [
    {
      "aspect": "Michael Pollard",
      "sentiment": "negative"
    },
    {
      "aspect": "plot",
      "sentiment": "negative"
    },
    {
      "aspect": "cinematography",
      "sentiment": "positive"
    },
    {
      "aspect": "acting",
      "sentiment": "positive"
    }
  ]
}
------------------------------------------------------------
ID: pos_602
{
  "id": "pos_602",
  "emotion": "joy",
  "ner_results": [
    {
      "entity_group": "DIRECTOR",
      "score": 0.9122870564460754,
      "word": "Carlos Saura",
      "start": 237,
      "end": 250
    },
    {
      "entity_group": "MOVIE",
      "score": 0.65275

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


ID: pos_610
{
  "id": "pos_610",
  "emotion": "joy",
  "ner_results": [
    {
      "entity_group": "MOVIE",
      "score": 0.9331985712051392,
      "word": "Stardust",
      "start": 8,
      "end": 17
    },
    {
      "entity_group": "GENRE",
      "score": 0.6543613076210022,
      "word": "fantasy",
      "start": 31,
      "end": 39
    },
    {
      "entity_group": "GENRE",
      "score": 0.5082765221595764,
      "word": "fairytale",
      "start": 229,
      "end": 239
    },
    {
      "entity_group": "CHARACTER",
      "score": 0.8940114378929138,
      "word": "Yvaine",
      "start": 684,
      "end": 691
    },
    {
      "entity_group": "CHARACTER",
      "score": 0.9717828035354614,
      "word": "Victoria",
      "start": 699,
      "end": 708
    },
    {
      "entity_group": "ACTOR",
      "score": 0.9935416579246521,
      "word": "Claire Danes",
      "start": 793,
      "end": 806
    },
    {
      "entity_group": "ACTOR",
      "score": 0.9171909093856812,

In [15]:
with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(all_outputs_for_json, f, ensure_ascii=False, indent=2)

print(f"\nSaved JSON predictions to: {OUTPUT_JSON_PATH}")


Saved JSON predictions to: ner_absa_predictions.json


Test With Custom Inpus

In [None]:
custom_text = """
A Man Called Otto was absolutely amazing! Tom Hanks delivered an outstanding performance,
and the cinematography was breathtaking. However, the ending felt rushed and disappointing.
The music score by Hans Zimmer elevated every scene, and the plot kept me engaged throughout.
"""

custom_aspects = ["plot", "cinematography", "ending", "music"]

print("===== Custom Text Prediction =====\n")
print(f"Text: {custom_text.strip()}\n")

ner_results = ner_pipeline(custom_text)
ner_results = normalize_ner_results(ner_results)

print("Detected Entities (from NER):")
for ent in ner_results:
    print(f"  - {ent['word']} ({ent['entity_group']}): score={ent['score']:.4f}")

actors = extract_actors(ner_results)
print(f"\nDetected Actors: {actors}")

all_aspects = actors + custom_aspects
print(f"All Aspects: {all_aspects}\n")

emotion_label = predict_emotion(custom_text)
print(f"Overall Emotion: {emotion_label}\n")

print("ABSA Results:")
absa_results = []
for aspect in all_aspects:
    sentiment = predict_absa(custom_text, aspect)
    absa_results.append({
        "aspect": aspect,
        "sentiment": sentiment
    })
    print(f"  - {aspect}: {sentiment}")

custom_result = {
    "text": custom_text.strip(),
    "emotion": emotion_label,
    "ner_results": ner_results,
    "actors": actors,
    "custom_aspects": custom_aspects,
    "all_aspects": all_aspects,
    "absa_predictions": absa_results
}

print("\n" + "=" * 60)
print("Complete Result:")
print(json.dumps(custom_result, indent=2, ensure_ascii=False))

===== Custom Text Prediction =====

Text: A Man Called Otto was absolutely amazing! Tom Hanks delivered an outstanding performance,
and the cinematography was breathtaking. However, the ending felt rushed and disappointing.
The music score by Hans Zimmer elevated every scene, and the plot kept me engaged throughout.

Detected Entities (from NER):
  - A Man Called Otto (MOVIE): score=0.9898
  - Tom Hanks (ACTOR): score=0.9909

Detected Actors: ['Tom Hanks']
All Aspects: ['Tom Hanks', 'plot', 'cinematography', 'ending', 'music']

Overall Emotion: joy

ABSA Results:


  - Tom Hanks: positive
  - plot: positive
  - cinematography: positive
  - ending: negative
  - music: positive

Complete Result:
{
  "text": "A Man Called Otto was absolutely amazing! Tom Hanks delivered an outstanding performance,\nand the cinematography was breathtaking. However, the ending felt rushed and disappointing.\nThe music score by Hans Zimmer elevated every scene, and the plot kept me engaged throughout.",
  "emotion": "joy",
  "ner_results": [
    {
      "entity_group": "MOVIE",
      "score": 0.989806056022644,
      "word": "A Man Called Otto",
      "start": 1,
      "end": 18
    },
    {
      "entity_group": "ACTOR",
      "score": 0.9908803701400757,
      "word": "Tom Hanks",
      "start": 42,
      "end": 52
    }
  ],
  "actors": [
    "Tom Hanks"
  ],
  "custom_aspects": [
    "plot",
    "cinematography",
    "ending",
    "music"
  ],
  "all_aspects": [
    "Tom Hanks",
    "plot",
    "cinematography",
    "ending",
    "music"
  ],
  "absa_predictions": 