In [1]:
pip install transformers torch




In [3]:
import pandas as pd
from transformers import pipeline

# 1. Load the cleaned tracks/lyrics table
df = pd.read_csv("/content/tracks_with_lyrics_cleaned.csv")

# Keep only rows that have lyrics, then discard lyrics of 20 characters or
# fewer, and renumber the remaining rows 0..n-1.
df = df.dropna(subset=["clean_lyrics"])
has_enough_text = df["clean_lyrics"].str.len() > 20
df = df.loc[has_enough_text].reset_index(drop=True)

# 2. Set up the zero-shot classification model
emotion_labels = ["happy", "sad", "angry", "relaxed", "energetic"]

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def get_emotion_scores(text):
    """
    Run zero-shot emotion classification on a single lyrics string.

    Only the first 1000 characters of ``text`` are passed to the classifier,
    so that very long lyrics do not produce an over-long input (tune as needed).

    Returns a tuple ``(scores, top_label)``:
    - ``scores``: one score per emotion, ordered like ``emotion_labels``
    - ``top_label``: the first label in the classifier's returned ranking
    """
    clipped = text[:1000]

    output = classifier(
        clipped,
        candidate_labels=emotion_labels,
        multi_label=False,  # pick the single best-matching label; scores for the others are still returned
    )

    # output["labels"] is ranked from highest to lowest score and
    # output["scores"] holds the matching scores in that same order.
    ranked_labels = output["labels"]
    score_by_label = dict(zip(ranked_labels, output["scores"]))

    # Re-order the scores into the fixed emotion_labels order.
    ordered_scores = []
    for label in emotion_labels:
        ordered_scores.append(score_by_label[label])

    # The first ranked label is the highest-scoring one.
    return ordered_scores, ranked_labels[0]

# 3. Label every track
score_cols = [f"score_{lab}" for lab in emotion_labels]
all_scores = []
pred_emotions = []

# Iterate over the lyrics column directly: only this one column is needed, so
# there is no reason to have iterrows() build a Series for every row, and
# enumerate() provides a positional counter that does not depend on the
# DataFrame's index labels being 0..n-1.
for i, lyrics in enumerate(df["clean_lyrics"]):
    scores, pred = get_emotion_scores(lyrics)
    all_scores.append(scores)
    pred_emotions.append(pred)

    # Progress message every 20 tracks
    if (i + 1) % 20 == 0:
        print(f"Processed {i+1} tracks")

# Expand the per-track score lists into one column per emotion and attach
# them, together with the top label, to the track table.
scores_df = pd.DataFrame(all_scores, columns=score_cols)
df = pd.concat([df.reset_index(drop=True), scores_df], axis=1)
df["pred_emotion"] = pred_emotions

# 4. Save the result
df.to_csv("tracks_with_emotions.csv", index=False)
print("Saved tracks_with_emotions.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 20 tracks
Processed 40 tracks
Processed 60 tracks
Processed 80 tracks
Processed 100 tracks
Processed 120 tracks
Processed 140 tracks
Processed 160 tracks
Processed 180 tracks
Processed 200 tracks
Processed 220 tracks
Processed 240 tracks
Processed 260 tracks
Processed 280 tracks
Processed 300 tracks
Processed 320 tracks
Processed 340 tracks
Processed 360 tracks
Processed 380 tracks
Processed 400 tracks
Processed 420 tracks
Processed 440 tracks
Processed 460 tracks
Processed 480 tracks
Processed 500 tracks
Processed 520 tracks
Processed 540 tracks
Processed 560 tracks
Processed 580 tracks
Processed 600 tracks
Processed 620 tracks
Processed 640 tracks
Processed 660 tracks
Processed 680 tracks
Processed 700 tracks
Processed 720 tracks
Processed 740 tracks
Processed 760 tracks
Processed 780 tracks
Processed 800 tracks
Processed 820 tracks
Processed 840 tracks
Processed 860 tracks
Processed 880 tracks
Processed 900 tracks
Processed 920 tracks
Saved tracks_with_emotions.csv


In [4]:
import random

# Reload the emotion-annotated file that was just saved
df_emotion = pd.read_csv("tracks_with_emotions.csv")

emotion_labels = ["happy", "sad", "angry", "relaxed", "energetic"]
score_cols = ["score_" + lab for lab in emotion_labels]

# English emotion label -> Chinese display label
emotion_zh = dict(
    happy="开心 / 积极",
    sad="伤感",
    angry="愤怒 / 攻击",
    relaxed="放松 / 慢节奏",
    energetic="热血 / 派对",
)

def sample_playlist_with_mood(df, k_min=10, k_max=20, random_state=None):
    """
    Draw a random playlist from ``df`` and summarise its overall mood.

    Parameters
    ----------
    df : DataFrame containing the columns listed in the module-level
        ``score_cols`` (one score column per emotion).
    k_min, k_max : inclusive bounds for the playlist size. Both are capped at
        ``len(df)`` so that a small table yields a smaller playlist instead of
        an error from sampling without replacement.
    random_state : optional seed. When given, it seeds both the playlist-size
        draw and the row sampling, making the result reproducible.

    Returns
    -------
    (playlist, dominant_emotion_en, dominant_emotion_zh, emotion_score_dict)
        ``playlist`` is the sampled rows, the two ``dominant_*`` values name the
        emotion with the highest mean score (English key and its ``emotion_zh``
        label), and ``emotion_score_dict`` maps every label in
        ``emotion_labels`` to its mean score over the playlist.

    Raises
    ------
    ValueError
        If ``k_min > k_max`` or ``df`` has no rows.
    """
    if k_min > k_max:
        raise ValueError(f"k_min ({k_min}) must not exceed k_max ({k_max})")

    n_rows = len(df)
    if n_rows == 0:
        raise ValueError("cannot sample a playlist from an empty DataFrame")

    # Use a private generator when a seed is supplied, so the process-wide
    # `random` state is not reseeded as a side effect. Random(seed) yields the
    # same draw as random.seed(seed) followed by random.randint(...). Without
    # a seed, keep drawing from the shared module-level generator.
    rng = random if random_state is None else random.Random(random_state)

    # Never ask for more tracks than the table holds.
    k = rng.randint(min(k_min, n_rows), min(k_max, n_rows))
    playlist = df.sample(n=k, replace=False, random_state=random_state)

    # Mean score of each emotion over the sampled tracks
    mean_scores = playlist[score_cols].mean()

    # Emotion with the highest mean score; strip only the leading "score_"
    # prefix from the column name rather than every occurrence of it.
    prefix = "score_"
    top_col = mean_scores.idxmax()
    dominant_emotion_en = top_col[len(prefix):] if top_col.startswith(prefix) else top_col
    dominant_emotion_zh = emotion_zh[dominant_emotion_en]

    # Per-emotion mean scores, returned for inspection and tuning
    emotion_score_dict = {
        lab: mean_scores[f"score_{lab}"] for lab in emotion_labels
    }

    return playlist, dominant_emotion_en, dominant_emotion_zh, emotion_score_dict

# Example: draw one playlist of 10-20 tracks with a fixed seed
sampling_options = dict(k_min=10, k_max=20, random_state=42)
playlist, mood_en, mood_zh, mood_scores = sample_playlist_with_mood(
    df_emotion, **sampling_options
)

# Report the playlist size and its dominant mood (English and Chinese labels)
print(f"本次歌单歌曲数量: {len(playlist)}")
print("歌单情绪（英文）:", mood_en)
print("歌单情绪（中文）:", mood_zh)

# Mean score of every emotion over the playlist
print("各情绪平均分：")
for k, v in mood_scores.items():
    print(f"  {k:10s}: {v:.3f}")

# Preview the first few tracks of the playlist
preview_cols = ["track_name", "artist_name", "pred_emotion"]
playlist_preview = playlist[preview_cols].head()
print("\n歌单歌曲预览：")
print(playlist_preview)


本次歌单歌曲数量: 20
歌单情绪（英文）: energetic
歌单情绪（中文）: 热血 / 派对
各情绪平均分：
  happy     : 0.072
  sad       : 0.216
  angry     : 0.248
  relaxed   : 0.096
  energetic : 0.368

歌单歌曲预览：
                track_name      artist_name pred_emotion
299            Billie Jean  Michael Jackson    energetic
63                    Work          Rihanna          sad
136               Magnolia    Playboi Carti    energetic
597              It's Time  Imagine Dragons    energetic
261  Drop It Like It's Hot       Snoop Dogg    energetic
