<a href="https://colab.research.google.com/github/jinju-yang/DACOS_NLP/blob/main/POP_lyrics_emotion_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install deep-translator
from deep_translator import GoogleTranslator
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pandas as pd

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [None]:
# Translator used to turn Korean text into English before classification.
translator = GoogleTranslator(source='ko', target='en')

# Load the GoEmotions BERT checkpoint and its tokenizer.
model_name = "monologg/bert-base-cased-goemotions-original"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# return_all_scores=True asks the pipeline for a score per label instead of
# only the top one; the analysis loop later in the notebook reads result[0]
# as that per-label list.
emotion_analyzer = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, return_all_scores=True)

# NLTK data needed for stopword filtering and lemmatization.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Read the corpus through `nltk.corpus` rather than the imported `stopwords`
# name: this assignment rebinds `stopwords` to a set, so the previous
# `stopwords.words(...)` form raised AttributeError whenever the cell was
# executed a second time in the same kernel. The name `stopwords` is kept
# because preprocess_lyrics_english() looks it up.
stopwords = set(nltk.corpus.stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/182 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
def preprocess_lyrics_english(lyric):
    """Clean an English lyric string and return it as space-joined lemmas.

    The text is stripped of HTML-like tags, URLs, non-word/non-space
    characters and digit runs (in that order), lower-cased, split on
    whitespace, filtered against the module-level ``stopwords`` collection,
    and each surviving token is passed through the module-level
    ``lemmatizer``.

    Args:
        lyric: The lyric text to clean (a ``str``).

    Returns:
        A single string of the processed tokens joined by one space.
    """
    # Applied sequentially; each pattern's matches are deleted outright.
    removal_patterns = (
        r'<.*?>',            # HTML tags
        r'http\S+|www\S+',   # URLs
        r'[^\w\s]',          # special characters
        r'\d+',              # digits
    )
    cleaned = lyric
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Whitespace tokenization on the lower-cased text.
    words = cleaned.lower().split()

    kept_lemmas = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stopwords
    ]
    return " ".join(kept_lemmas)

# Load the lyrics CSV.
df = pd.read_csv("melon_pop_lyrics_1_200.csv")

# Keep only the title / artist / lyrics columns (assumed to exist in the CSV).
# .copy() detaches the subset from the frame read above so that the column
# assignments below, and later per-row writes, modify an independent frame
# instead of triggering SettingWithCopyWarning.
df = df[['곡 제목', '가수', '가사']].copy()

# Result columns, filled in per row by the analysis loop; rows that are
# skipped keep the empty string.
df['감정'] = ''
df['score'] = ''

In [None]:
# A single label is reported alone when its score reaches this threshold.
HIGH_CONFIDENCE_THRESHOLD = 0.9
# Otherwise this many top-scoring labels are reported, comma-separated.
FALLBACK_TOP_K = 2
# Upper bound on tokens sent to the classifier. 512 is the usual BERT-base
# input limit; NOTE(review): confirm against this checkpoint's config.
MAX_MODEL_TOKENS = 512

# Iterate with the frame's own index labels: df.at[] is label-based, so a
# positional counter from enumerate() is only correct for a default RangeIndex.
for idx, song_name, lyric in zip(df.index, df['곡 제목'], df['가사']):
    if pd.isna(lyric):
        print(f"곡 제목: {song_name}의 가사가 비어 있습니다. 건너뜁니다.")
        continue  # Missing lyrics: leave this row's result columns untouched.

    # Korean lyrics -> English translation.
    translated_lyric = translator.translate(lyric)

    # Clean the translated lyrics.
    processed_lyric = preprocess_lyrics_english(translated_lyric)

    # Emotion classification. Truncate to the model's input limit so that a
    # long lyric is classified on its leading tokens instead of raising.
    result = emotion_analyzer(
        processed_lyric,
        truncation=True,
        max_length=MAX_MODEL_TOKENS,
    )

    # result[0] holds one {'label', 'score'} dict per emotion; rank them once.
    # sorted() is stable, so ties keep the pipeline's original label order.
    ranked_emotions = sorted(result[0], key=lambda x: x['score'], reverse=True)
    best = ranked_emotions[0]
    if best['score'] >= HIGH_CONFIDENCE_THRESHOLD:
        emotion = best['label']
        score = best['score']
    else:
        top_emotions = ranked_emotions[:FALLBACK_TOP_K]
        emotion = ", ".join(e['label'] for e in top_emotions)
        score = ", ".join(str(e['score']) for e in top_emotions)

    # Progress output (optional).
    print(f"곡 제목: {song_name}, 감정: {emotion}, score: {score}")

    # Store the result on the row this lyric came from.
    df.at[idx, '감정'] = emotion
    df.at[idx, 'score'] = score


곡 제목: I Don't Think That I Like Her, 감정: neutral, disappointment, score: 0.8590247631072998, 0.06776775419712067
곡 제목: STAY, 감정: fear, score: 0.9826462268829346
곡 제목: Dangerously, 감정: love, score: 0.9961769580841064
곡 제목: Steal The Show (From "Elemental"), 감정: neutral, score: 0.9984843134880066
곡 제목: 2002, 감정: love, score: 0.954855740070343
곡 제목: Die With A Smile, 감정: neutral, love, score: 0.810812771320343, 0.1854088008403778
곡 제목: Off My Face, 감정: love, score: 0.9906241297721863
곡 제목: Love Yourself, 감정: love, score: 0.9995469450950623
곡 제목: Older, 감정: love, disapproval, score: 0.8511183261871338, 0.10517122596502304
곡 제목: pocket locket, 감정: love, score: 0.9974852800369263
곡 제목: Shape of You, 감정: love, score: 0.9997206330299377
곡 제목: Cruel Summer, 감정: anger, neutral, score: 0.7263177037239075, 0.10595846176147461
곡 제목: Touch, 감정: love, score: 0.9998399019241333
곡 제목: When I Get Old, 감정: fear, score: 0.9862307906150818
곡 제목: Memories, 감정: neutral, score: 0.9995562434196472
곡 제목: Beauti

In [None]:
# Write the annotated frame to CSV without the index column.
output_csv_path = "melon_pop_lyrics_with_emotions.csv"
df.to_csv(output_csv_path, index=False)