In [2]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [4]:
# Count the regular files (sub-directories are excluded) directly inside the
# Essentia results folder and report the total.
dir_path = "../data/essentia_results"
num_files = sum(
    1
    for entry_name in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry_name))
)
print(f"Number of files: {num_files}")

Number of files: 999


In [19]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# === Configure paths ===
# Directory holding one Essentia analysis JSON per track.
ESSENTIA_DIR = "../data/essentia_results"

# === PANAS-aligned emotion order
# Column order of the final score table. Kept as a list because it is used
# to select DataFrame columns.
PANAS_ORDER = [
    "Stressed",
    "Angry",
    "Happy",
    "Sad",
    "Inspired",
    "Excited",
    "Nervous",
]

# === Cluster to emotion map (7 clusters → moods)
# Position in the list is the KMeans cluster id.
# Note: "Angry" has no cluster of its own, and "Happy" is mapped from two
# clusters (2 and 4).
cluster_to_emotion = dict(enumerate([
    "Excited",   # cluster 0
    "Inspired",  # cluster 1
    "Happy",     # cluster 2
    "Sad",       # cluster 3
    "Happy",     # cluster 4
    "Stressed",  # cluster 5
    "Nervous",   # cluster 6
]))

# === Extract enhanced vibe features from JSON ===
def extract_vibe_features(essentia_json):
    """Flatten one parsed Essentia result into a dict of scalar features.

    Args:
        essentia_json: Parsed JSON mapping. Must contain a "lowlevel" entry;
            "rhythm" is optional.

    Returns:
        dict with, in insertion order: ``average_loudness``,
        ``dynamic_complexity``, one ``<descriptor>_mean`` entry for each of
        six low-level descriptors, ``mfcc_1`` .. ``mfcc_N`` (one per value in
        the MFCC mean list), ``bpm`` and ``onset_rate``. Missing values
        default to 0; a missing MFCC mean defaults to 13 zeros.

    Raises:
        KeyError: if ``essentia_json`` has no "lowlevel" key.
    """
    lowlevel = essentia_json["lowlevel"]
    rhythm_section = essentia_json.get("rhythm", {})

    # Descriptors from which only the "mean" statistic is taken.
    stat_descriptors = (
        "spectral_centroid", "spectral_flux", "spectral_spread",
        "barkbands_flatness_db", "pitch_salience", "zero_crossing_rate",
    )

    vibe = {
        "average_loudness": lowlevel.get("average_loudness", 0),
        "dynamic_complexity": lowlevel.get("dynamic_complexity", 0),
    }
    vibe.update(
        (f"{descriptor}_mean", lowlevel.get(descriptor, {}).get("mean", 0))
        for descriptor in stat_descriptors
    )

    # MFCC coefficients are numbered from 1 in the output keys.
    mfcc_means = lowlevel.get("mfcc", {}).get("mean", [0] * 13)
    vibe.update(
        (f"mfcc_{position}", value)
        for position, value in enumerate(mfcc_means, start=1)
    )

    vibe["bpm"] = rhythm_section.get("bpm", 0)
    vibe["onset_rate"] = rhythm_section.get("onset_rate", 0)
    return vibe

# === Load JSONs and extract features ===
all_features = []
track_ids = []

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting pins the row order so the seeded KMeans run below always sees the
# same input; the hand-written cluster_to_emotion mapping depends on the
# resulting cluster numbering.
for fname in sorted(os.listdir(ESSENTIA_DIR)):
    if not fname.endswith(".json"):
        continue
    with open(os.path.join(ESSENTIA_DIR, fname), encoding="utf-8") as f:
        data = json.load(f)
    track_id = os.path.splitext(fname)[0]  # file name without ".json"
    track_ids.append(track_id)
    all_features.append(extract_vibe_features(data))

# Fail early with a clear message instead of an opaque scaler/KMeans error.
if not all_features:
    raise FileNotFoundError(f"No .json files found in {ESSENTIA_DIR!r}")

df = pd.DataFrame(all_features, index=track_ids)
df.index.name = "track_id"

# === Vibe features used for clustering ===
features = [
    'average_loudness', 'spectral_centroid_mean', 'spectral_flux_mean',
    'barkbands_flatness_db_mean', 'zero_crossing_rate_mean',
    'bpm', 'onset_rate', 'dynamic_complexity'
]

# === Normalize and cluster ===
# The selected columns are standardized in place, so `vibe_vectors` holds the
# scaled values.
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])
vibe_vectors = df[features].values

num_moods = 7
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; pin it if cluster ids must be stable across versions.
kmeans = KMeans(n_clusters=num_moods, random_state=42)
df['cluster'] = kmeans.fit_predict(vibe_vectors)

# === Compute mood centroids
# When several clusters map to the same mood, only the first (lowest cluster
# id) centroid is kept; later ones are not used for scoring.
# NOTE(review): cluster_to_emotion maps two clusters to "Happy" — confirm that
# ignoring the second centroid is intended.
mood_centroids = {}
for i, center in enumerate(kmeans.cluster_centers_):
    mood_centroids.setdefault(cluster_to_emotion[i], center)

# Assign 'Angry' to same centroid as 'Stressed'
mood_centroids["Angry"] = mood_centroids["Stressed"]

# === Compute cosine similarity to all mood centroids
def get_emotion_scores(vector):
    """Return {mood: cosine similarity of `vector` to that mood's centroid}.

    Args:
        vector: 1-D array in the same scaled feature space as `vibe_vectors`.

    Returns:
        dict keyed by the moods in PANAS_ORDER, in that order.
    """
    return {
        mood: cosine_similarity(vector.reshape(1, -1), mood_centroids[mood].reshape(1, -1))[0][0]
        for mood in PANAS_ORDER
    }

# === Generate emotion score matrix
# One (n_tracks x n_moods) call replaces one cosine_similarity call per
# track/mood pair; row i, column j is the similarity between track i and the
# centroid of PANAS_ORDER[j].
centroid_matrix = np.vstack([mood_centroids[mood] for mood in PANAS_ORDER])
emotion_scores_df = pd.DataFrame(
    cosine_similarity(vibe_vectors, centroid_matrix),
    index=df.index,
    columns=PANAS_ORDER,
)
# Per-track dicts, kept for any later cell that still reads this name.
emotion_score_dicts = emotion_scores_df.to_dict("records")

# === Final result: drop all else
# `df` now holds only the emotion scores; the feature and 'cluster' columns
# are gone (cluster labels remain available as kmeans.labels_).
df = emotion_scores_df.copy()

# === Optional: save
df.to_csv("../data/song_cluster_scores.csv")

In [20]:
# Show the current contents of `df` (rendered by the notebook as the cell output).
df

Unnamed: 0_level_0,Stressed,Angry,Happy,Sad,Inspired,Excited,Nervous
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0 - ROSÉ & Bruno Mars - APT. (Official Lyric Video),-0.466257,-0.466257,-0.393670,-0.531235,0.032888,0.627270,-0.360883
10 - Bhad Bhabie - Ms. Whitman (Official Music Video),0.766747,0.766747,0.229367,-0.127289,0.086282,-0.310024,0.591689
1000 - Cigarette Daydreams - Cage the Elephant,-0.298431,-0.298431,-0.017048,0.383885,0.002077,0.008713,-0.507485
"1001 - FloyyMenor, Cris MJ - Gata Only",0.046534,0.046534,0.676990,0.455744,-0.423362,-0.833767,-0.132058
1006 - Chappell Roan - Casual (Official Audio),-0.164100,-0.164100,-0.697161,0.123541,0.310734,0.696057,-0.150747
...,...,...,...,...,...,...,...
98 - Art Deco,0.081216,0.081216,-0.178258,0.566378,-0.132151,-0.096523,-0.014133
988 - Yan Block - 111 (Video Oficial),0.115687,0.115687,0.307580,0.520347,-0.783363,-0.673823,-0.129194
996 - On The Floor,-0.066425,-0.066425,-0.147080,-0.572738,0.103203,0.331052,0.203586
997 - Chappell Roan - The Giver (Official Lyric Video),-0.465857,-0.465857,0.236416,-0.432094,0.845755,0.328187,-0.436741


In [13]:
# Write `df` to CSV with its index turned into a regular column (reset_index),
# so the frame's own index is not written separately (index=False).
# NOTE(review): if `df` still holds the emotion scores already saved to
# song_cluster_scores.csv, this file repeats that content under a name that
# suggests cluster labels — confirm which frame is meant to be exported here.
df.reset_index().to_csv('../data/music_clusters.csv', index=False)

In [11]:
# === Get up to 20 songs per cluster ===
# After the scoring cell runs, `df` holds only the emotion-score columns, so
# df["cluster"] raises KeyError on a fresh Restart & Run All. Use the column
# when it is present; otherwise rebuild the labels from the fitted KMeans
# model, whose labels_ are in the same row order as df.index.
if "cluster" in df.columns:
    cluster_labels = df["cluster"]
else:
    cluster_labels = pd.Series(kmeans.labels_, index=df.index, name="cluster")

sample_20_per_cluster = {}

for cluster_id in range(num_moods):
    # First 20 track ids (in row order) assigned to this cluster.
    members = cluster_labels.index[cluster_labels == cluster_id]
    sample_20_per_cluster[int(cluster_id)] = members[:20].tolist()

# === Print ===
for cluster_id, songs in sample_20_per_cluster.items():
    print(f"\n=== Cluster {cluster_id} (Sample 20 Songs) ===")
    print(songs)


=== Cluster 0 (Sample 20 Songs) ===
['1006 - Chappell Roan - Casual (Official Audio)', '1011 - The Chain (2004 Remaster)', '1023 - Come as You Are - Nirvana', '1031 - Calvin Harris - Summer (Audio)', '1034 - Last Resort', '1036 - System of a Down - Chop Suey (Remastered 2021)', '1041 - Malcolm Todd - Chest Pain (I Love) (Official Visualizer)', '1123 - ST. CHROMA', "1145 - James Arthur - Car's Outside (Official Lyric Video)", '1162 - Eagles - Hotel California (Official Audio)', "1170 - Crowded House - Don't Dream It's Over (Audio Remastered) (HQ)", '1174 - Foo Fighters - Everlong', '1210 - Todo cambió (Camila)', '1226 - Coldplay - Paradise (Official Audio)', '1248 - Camila Cabello - Shameless (Audio)', '1296 - Sabrina Carpenter - 15 Minutes (Official Lyric Video)', '1352 - NEW MAGIC WAND (Official Audio) - Tyler, The Creator', '1373 - Heavy Is the Crown (Official Audio) - Linkin Park', '1420 - Defying Gravity (Edit)', '1528 - Gorillaz - Feel Good Inc. (Official Video)']

=== Cluster 1 