In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np
from torch.nn.functional import softmax
from tqdm import tqdm

# Demo

In [3]:
# 1. Load the tokenizer and the classification model
model_name = "monologg/bert-base-cased-goemotions-original"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 2. Example text
text = "I'm really disappointed with the way things turned out."

# 3. Tokenize and run the model without tracking gradients
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
    # Element-wise sigmoid: each label gets its own independent score
    probs = outputs.logits.sigmoid()

# 4. Emotion labels, listed in the order used to index the model's output columns
emotion_labels = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# 5. Result: map each label to its score and print the five highest
result = {label: float(probs[0][idx]) for idx, label in enumerate(emotion_labels)}
sorted_result = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
for emotion, score in sorted_result[:5]:
    print(f"{emotion}: {score:.3f}")

tokenizer_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/182 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

disappointment: 0.990
neutral: 0.014
admiration: 0.013
realization: 0.009
sadness: 0.008


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

# Sil Jun

In [2]:
# Report whether CUDA is usable, then pick the matching device (CPU otherwise)
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device("cuda") if cuda_available else torch.device("cpu")

True


In [3]:
# 1. Load the tokenizer and the classification model
model_name = "monologg/bert-base-cased-goemotions-original"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Place the model on the device selected above rather than hard-coding "cuda",
# so this cell also runs on machines without a GPU and the model ends up on
# the same device that get_emotion_distributions moves its inputs to.
model.to(device)
# Switch to evaluation mode; as the last expression this also displays the model
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [22]:
# 2. Emotion label definitions; the list order supplies the column names for
#    the per-label probability table built further down.
emotion_labels = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]


# 3. Emotion analysis function (returns per-text label probabilities)
def get_emotion_distributions(texts, batch_size=16, multi_label=False):
    """Score texts against ``emotion_labels`` in batches.

    Args:
        texts: Sliceable sequence of inputs. Every item is converted with
            ``str`` before tokenization.
        batch_size: Number of texts tokenized and scored per forward pass.
        multi_label: When False (default), the logits are normalized with
            softmax, so each row sums to 1 across the labels. When True, an
            element-wise sigmoid is applied instead, giving independent
            per-label scores in the same way as the single-sentence demo cell
            of this notebook.

    Returns:
        pandas.DataFrame with one row per input text (in input order) and one
        column per entry of ``emotion_labels``.
    """
    # NOTE(review): the demo cell scores this same checkpoint with sigmoid,
    # while the default here is softmax — confirm which normalization the
    # downstream analysis expects before comparing the two.
    all_probs = []

    for start in tqdm(range(0, len(texts), batch_size)):
        # NOTE(review): str() turns None/NaN into the literal text "None"/"nan",
        # which is then scored like any other input — confirm inputs are clean.
        batch_texts = [str(t) for t in texts[start:start + batch_size]]
        # Pad to the longest text in the batch and truncate to at most 512
        # tokens, then move the encoded tensors to the selected device.
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)

        with torch.no_grad():
            logits = model(**inputs).logits
            if multi_label:
                scores = torch.sigmoid(logits)
            else:
                scores = softmax(logits, dim=1)

        # Bring results back to host memory; extend() appends one row per text
        all_probs.extend(scores.cpu().numpy())

    return pd.DataFrame(all_probs, columns=emotion_labels)

# 4. Dataset processing
def process_dataset(input_path, output_path):
    """Read a CSV, append emotion scores for its ``text_clean`` column, and save.

    Args:
        input_path: Path of the CSV to read; it must contain a ``text_clean``
            column.
        output_path: Path the combined table is written to.

    The original columns are kept and the columns returned by
    ``get_emotion_distributions`` are placed to their right. The file is
    written without the index, encoded as ``utf-8-sig``, and a confirmation
    line is printed afterwards.
    """
    source_df = pd.read_csv(input_path)
    texts = source_df['text_clean'].tolist()
    scored_df = pd.concat([source_df, get_emotion_distributions(texts)], axis=1)
    scored_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✅ 저장 완료: {output_path}")

In [24]:
# Score the NYT file and write the result next to it
process_dataset(
    input_path='./ready/nyt_vaccine_ready.csv',
    output_path='./ready/nyt_vaccine_emotions.csv',
)

100%|████████████████████████████████████████████████████████████████████████████████| 261/261 [00:02<00:00, 96.91it/s]


✅ 저장 완료: ./ready/nyt_vaccine_emotions.csv


In [25]:
# Score the Guardian file and write the result next to it
process_dataset(
    input_path='./ready/guardian_vaccine_ready.csv',
    output_path='./ready/guardian_vaccine_emotions.csv',
)

100%|████████████████████████████████████████████████████████████████████████████████| 908/908 [03:06<00:00,  4.87it/s]


✅ 저장 완료: ./ready/guardian_vaccine_emotions.csv


In [23]:
# Score the Reddit file and write the result next to it
process_dataset(
    input_path='./ready/reddit_vaccine_ready.csv',
    output_path='./ready/reddit_vaccine_emotions.csv',
)

100%|████████████████████████████████████████████████████████████████████████████████| 219/219 [00:37<00:00,  5.82it/s]


✅ 저장 완료: ./ready/reddit_vaccine_emotions.csv
