In [2]:
import json
import os
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch.nn.functional as F

# Model setup: load the tokenizer and token-classification checkpoint, then place the
# model on the GPU when CUDA is available and on the CPU otherwise.
model_name = "gauneg/deberta-v3-base-absa-ate-sentiment"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model.eval()  # evaluation mode: this notebook only runs inference
model.to(device)

# Label map: tag name -> class index (may differ from model to model).
# "X" maps to -100, which an argmax over the logits can never produce.
# NOTE(review): presumably the training-time "ignore" label -- confirm against the model card.
labels = dict(
    [
        ("B-neu", 1),
        ("I-neu", 2),
        ("O", 0),
        ("B-neg", 3),
        ("B-con", 4),
        ("I-pos", 5),
        ("B-pos", 6),
        ("I-con", 7),
        ("I-neg", 8),
        ("X", -100),
    ]
)
# Reverse lookup: class index -> tag name.
id2label = {index: name for name, index in labels.items()}


# === ABSA extraction ===
def _decode_piece(token):
    """Return the token's text, turning the "▁" word-start marker into a space."""
    return token.replace("▁", " ") if token.startswith("▁") else token


def _close_span(spans, pieces, sentiment, scores):
    """Append the open span (if any) to ``spans`` as ``(text, sentiment, mean score)``."""
    if pieces:
        mean_score = sum(scores) / len(scores) if scores else 0.0
        spans.append(("".join(pieces), sentiment, round(mean_score, 4)))


def extract_aspects(text):
    """Extract aspect terms and their sentiment from ``text``.

    The text is tokenized (truncated to 512 tokens), run through the module-level
    ``model``, and the per-token argmax tags are decoded with ``id2label``:

    * a ``B-<sentiment>`` tag opens a new aspect span (closing any open one);
    * an ``I-*`` tag extends the open span, whatever its own sentiment suffix;
      an ``I-*`` tag with no open span is ignored;
    * any other tag closes the open span.

    Tokenizer special tokens (e.g. ``[CLS]`` / ``[SEP]``) are always treated as
    ``O`` so that they can never start, extend, or appear inside an aspect term.

    Args:
        text: The review text to analyse (a single string).

    Returns:
        A list of dicts ``{"term", "sentiment", "confidence"}``, one per
        non-empty aspect span in reading order. ``confidence`` is the mean
        softmax probability of ``B-<sentiment>`` on the first token and
        ``I-<sentiment>`` on the following tokens, rounded to 4 decimals.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(
        device
    )
    with torch.no_grad():
        logits = model(**inputs).logits
        # Select batch element 0 explicitly (squeeze() would also collapse a
        # length-1 sequence axis) and copy to the CPU once instead of
        # synchronising with the device for every token below.
        probs = F.softmax(logits, dim=-1)[0].cpu()
    predictions = probs.argmax(dim=-1).tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    special_tokens = set(tokenizer.all_special_tokens)

    spans, pieces, sentiment, scores = [], [], None, []
    for position, (token, class_id) in enumerate(zip(tokens, predictions)):
        # Special tokens carry no review text; force them to "O".
        label = "O" if token in special_tokens else id2label.get(class_id, "O")

        if label.startswith("B-"):
            _close_span(spans, pieces, sentiment, scores)
            sentiment = label.split("-")[1]
            pieces = [_decode_piece(token)]
            scores = [probs[position, labels[f"B-{sentiment}"]].item()]
        elif label.startswith("I-") and pieces:
            pieces.append(_decode_piece(token))
            # Scored against the span's sentiment, not the token's own suffix.
            scores.append(probs[position, labels.get(f"I-{sentiment}", 0)].item())
        else:
            _close_span(spans, pieces, sentiment, scores)
            pieces, sentiment, scores = [], None, []

    # Flush a span that runs up to the end of the sequence.
    _close_span(spans, pieces, sentiment, scores)

    clean_aspects = []
    for term, span_sentiment, score in spans:
        term = term.strip()
        if term:
            clean_aspects.append(
                {"term": term, "sentiment": span_sentiment, "confidence": score}
            )
    return clean_aspects

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
input_file = "review_5up.json"  # JSON Lines input (records carry user_id, business_id, text)
output_file = "absa_ate_results2.jsonl"  # results file; records are appended, one per line

# Resume support: collect the texts already present in the output file so a
# re-run does not process them again.
# NOTE(review): the key is the review text, so distinct reviews with identical
# text are skipped on a resumed run -- confirm that this is intended.
processed = set()
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                processed.add(json.loads(line)["text"])
            except (ValueError, KeyError, TypeError):
                # Malformed or incomplete line (e.g. left by an interrupted
                # run): ignore it. json.JSONDecodeError is a ValueError.
                continue

skipped = 0  # input lines that could not be parsed or processed
last_error = None
with open(input_file, "r", encoding="utf-8") as f_in, open(
    output_file, "a", encoding="utf-8"
) as f_out:
    for line in tqdm(f_in, desc="ABSA 추출 중"):
        try:
            item = json.loads(line)
            if item["text"] in processed:
                continue
            aspects = extract_aspects(item["text"])
            result = {
                "review_id": item.get("review_id"),
                "user_id": item.get("user_id"),
                "business_id": item.get("business_id"),
                "text": item["text"],
                "aspects": aspects,
            }
            f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
        except Exception as exc:
            # Best effort: one bad record must not stop the whole run.
            # `Exception` (unlike a bare `except:`) lets KeyboardInterrupt and
            # SystemExit through, so the loop can still be interrupted.
            skipped += 1
            last_error = exc
            continue

if skipped:
    print(f"Skipped {skipped} input line(s) due to errors; last error: {last_error!r}")

ABSA 추출 중: 45546it [11:43, 22.98it/s]    