In [None]:
# Mount Google Drive at /content/drive; the input and output folders used
# later in this notebook are paths below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install torch torchvision torchaudio --quiet
!pip install transformers sentencepiece --quiet

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Resolve the inference device once: the GPU when CUDA is available, else the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Device:", DEVICE.type)
print("Inference device:", DEVICE)


In [None]:
# Checkpoint identifier handed to `from_pretrained` for both tokenizer and model.
MODEL_NAME = "monologg/koelectra-small-finetuned-nsmc"

# Load the tokenizer and the sequence-classification model from the same checkpoint.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Move the weights to the selected device, then switch to evaluation mode.
model = model.to(DEVICE)
model.eval()

def sentiment_score(text, max_length=64):
    """Classify one text with the module-level ``tok`` / ``model`` pair.

    Args:
        text: The text to score. Anything that is not a ``str``, or a string
            that is empty after ``strip()``, is treated as "no input".
        max_length: Token length the input is truncated and padded to.

    Returns:
        A dict with four keys:

        * ``label`` - index of the highest-probability class (``int``).
        * ``label_name`` - ``"positive"`` when ``label`` is 1, otherwise
          ``"negative"``.
        * ``pos_prob`` - softmax probability at class index 1 (``float``).
        * ``score_pm1`` - ``2 * pos_prob - 1``, i.e. ``pos_prob`` rescaled to
          the range -1 to +1.

        All four values are ``None`` when there is no input to score.

    Note:
        Class index 1 is taken to be the positive class; this presumably
        matches the label order of the loaded checkpoint - confirm against
        the model card.
    """
    has_no_text = not isinstance(text, str) or not text.strip()
    if has_no_text:
        return dict.fromkeys(("label", "label_name", "pos_prob", "score_pm1"))

    encoded = tok(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # The tensors must live on the same device as the model.
    batch = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        logits = model(**batch).logits
        class_probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

    positive_probability = float(class_probs[1])   # probability of the positive class
    predicted_class = int(np.argmax(class_probs))  # 0 or 1

    return {
        "label": predicted_class,
        "label_name": "positive" if predicted_class == 1 else "negative",
        "pos_prob": positive_probability,
        "score_pm1": 2 * positive_probability - 1,  # -1 to +1
    }

In [None]:
import pandas as pd
import os
import glob
from tqdm import tqdm

# Folder (on the mounted Google Drive) holding the source review CSV files.
INPUT_DIR = "/content/drive/My Drive/project/리뷰DB"
# Folder where the sentiment-scored copies of those CSV files are written.
OUTPUT_DIR = "/content/drive/My Drive/project/리뷰DB_KoELECTRA"

# Name of the CSV column that contains the review text to score.
REVIEW_COL = "review_text"

def process_review_db_sentiment(input_dir, output_dir, text_col):
    """Score every review CSV in ``input_dir`` and save the result in ``output_dir``.

    Each ``*.csv`` file directly inside ``input_dir`` is read with pandas, the
    ``text_col`` column is run through ``sentiment_score`` and four columns are
    appended: ``sent_label``, ``sent_label_name``, ``sent_pos_prob`` and
    ``sent_score_pm1``. The frame is then written under the same file name in
    ``output_dir`` (no index, ``utf-8-sig`` encoding).

    Missing values in ``text_col`` are replaced by an empty string before
    scoring. Each distinct text is scored only once per file and the result is
    reused for repeated rows.

    A file that lacks ``text_col`` is re-saved without sentiment columns and a
    ``[Skip]`` notice is printed, so that a mistyped column name does not pass
    unnoticed. An exception raised while handling one file is printed as
    ``[Error]`` and processing continues with the next file.

    Args:
        input_dir: Directory that is searched for ``*.csv`` files.
        output_dir: Directory the result files are written to; created
            (including parents) when it does not exist.
        text_col: Name of the column that holds the review text.

    Returns:
        None.
    """
    # exist_ok makes this safe on re-runs and avoids a check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    # glob returns files in filesystem order; sort for a reproducible run order.
    csv_files = sorted(glob.glob(os.path.join(input_dir, "*.csv")))

    if not csv_files:
        print(" 처리할 CSV 파일이 없습니다. 경로를 확인해주세요.")
        return

    print(f"  총 {len(csv_files)}개의 파일을 처리합니다.\n")

    # Output column -> key of the dict returned by sentiment_score.
    result_columns = (
        ("sent_label", "label"),            # 0 (negative), 1 (positive)
        ("sent_label_name", "label_name"),  # 'negative', 'positive'
        ("sent_pos_prob", "pos_prob"),      # positive probability (0 to 1)
        ("sent_score_pm1", "score_pm1"),    # sentiment score (-1 to 1)
    )

    for file_path in tqdm(csv_files, desc="Processing"):
        fname = os.path.basename(file_path)
        save_path = os.path.join(output_dir, fname)

        try:
            df = pd.read_csv(file_path)

            if text_col not in df.columns:
                # Still write the file so the output folder mirrors the input,
                # but say so: a wrong text_col would otherwise look like success.
                print(f" [Skip] {fname}: column '{text_col}' not found; saved without sentiment columns.")
                df.to_csv(save_path, index=False, encoding="utf-8-sig")
                continue

            # Apply sentiment analysis, scoring each distinct text only once.
            texts = df[text_col].fillna("").astype(str)
            scores_by_text = {text: sentiment_score(text) for text in texts.unique()}
            results = texts.apply(lambda text: scores_by_text[text])

            # Split the result dicts into the new columns.
            for column, key in result_columns:
                df[column] = results.apply(lambda scored, key=key: scored[key])

            df.to_csv(save_path, index=False, encoding="utf-8-sig")

        except Exception as e:
            # Best effort: report the failing file and keep going with the rest.
            print(f" [Error] {fname}: {e}")

# Entry point: run the batch sentiment job over the configured input and
# output folders when this code is executed as the main module.
if __name__ == "__main__":
    process_review_db_sentiment(INPUT_DIR, OUTPUT_DIR, REVIEW_COL)