In [18]:
from pathlib import Path
import pandas as pd
import json

import re
from collections import defaultdict

In [19]:
with open("../indexes/indexes.json") as f:
    indexes = json.load(f)

genre_index = indexes["genre"]
mood_index = indexes["mood"]
title_index = indexes["title"]
artist_index = indexes["artist"]
energy_index = indexes["energy"]


In [20]:
print("Genres:", list(genre_index.keys())[:10])
print("Moods:", list(mood_index.keys())[:10])
print("Energy labels:", list(energy_index.keys())[:10])

print("Sample artist tokens:", list(artist_index.keys())[:10])
print("Sample title tokens:", list(title_index.keys())[:10])


Genres: ['acoustic', 'new-age', 'opera', 'party', 'mpb', 'pagode', 'metalcore', 'minimal-techno', 'metal', 'progressive-house']
Moods: ['focus', 'sad', 'happy', 'chill', 'hype']
Energy labels: ['medium', 'energetic', 'calm']
Sample artist tokens: ['hoshino', 'gen', 'clannad', 'orchestra', 'brian', 'josh', 'london', 'groban', 'mcknight', 'symphony']
Sample title tokens: ['comedy', 'the', 'remastered', '2003', 'wild', 'cry', 'heard', 'we', 'on', 'high']


In [21]:
def retrieve_candidates(
    genres=None,
    mood=None,
    energy=None,
    title_query=None,
    artist_query=None
):
    sets = []

    # 1️⃣ Text search takes precedence
    text_hits = None

    if title_query:
        words = re.findall(r'\w+', title_query.lower())
        title_sets = [set(title_index.get(w, [])) for w in words]
        if title_sets:
            text_hits = set.intersection(*title_sets)

    if artist_query:
        words = re.findall(r'\w+', artist_query.lower())
        artist_sets = [set(artist_index.get(w, [])) for w in words]
        if artist_sets:
            artist_hits = set.intersection(*artist_sets)
            text_hits = artist_hits if text_hits is None else (text_hits & artist_hits)

    if text_hits is not None:
        sets.append(text_hits)

    # 2️⃣ Bucket filters (optional)
    if genres:
        genre_hits = set()
        for g in genres:
            genre_hits |= set(genre_index.get(g.lower(), []))
        sets.append(genre_hits)

    if mood:
        sets.append(set(mood_index.get(mood.lower(), [])))

    if energy:
        sets.append(set(energy_index.get(energy.lower(), [])))

    if not sets:
        return []

    return list(set.intersection(*sets))


In [22]:
cands = retrieve_candidates(artist_query="Lady Gaga")
len(cands), cands[:10]


(67,
 ['1ncPxe9s9jkoe1HcshDitx',
  '3cjyW2ZlXMf5ZQvGo5dKUO',
  '69C7u1nZcyZmuZXNNF5U9C',
  '2HJvS1fhaJbPxI9gT0mPjb',
  '3Wceo1KylCeOUKJv8EVrMy',
  '5LHnaiB8E3x95ZbYFmOUfk',
  '4fRUwxL1tLYQViPssf6eYp',
  '204PLz9hMiXFIJu0wWZr79',
  '1MNnfeAXbz0TO7wt61mKIX',
  '0BkWoJ1DbYrruv8srcPkmp'])

In [None]:
cands = retrieve_candidates(mood="sad", energy="calm")
len(cands)


12163

In [27]:
pathCatalog = Path(r"C:\Users\josep\Downloads\Winter 2026\jukejam\data\processed\SONG_CATALOG.csv")

SONG_CATALOG = pd.read_csv(pathCatalog)

track_lookup = SONG_CATALOG.set_index("track_id").to_dict(orient="index")


In [28]:
def get_track_info(track_id):
    return track_lookup.get(track_id, None)


In [33]:
for tid in cands[:5]:
    info = get_track_info(tid)
    print(info)
    print(info["title"], "-", info["artist_name"], "|", info["mood_bucket"], info["energy_label"])



{'title': 'Shared Worlds', 'artist_name': 'Medwyn Goodall', 'album_name': 'Way of the Dolphin', 'genre': 'new-age', 'duration_ms': 403826, 'popularity': 20, 'danceability': 0.129, 'energy': 0.302, 'valence': 0.0388, 'tempo': 89.406, 'acousticness': 0.091, 'instrumentalness': 0.634, 'liveness': 0.0905, 'speechiness': 0.0344, 'loudness': -18.082, 'mode': 1, 'key': 11, 'time_signature': 4, 'mood_bucket': 'sad', 'danceability_label': 'low', 'energy_label': 'calm', 'mood_label': 'sad', 'tempo_label': 'slow', 'acoustic_label': 'electronic', 'loudness_label': 'quiet', 'mode_label': 'major (bright)'}
Shared Worlds - Medwyn Goodall | sad calm
{'title': '垃圾', 'artist_name': 'Candy Lo', 'album_name': '喜歡戀愛"粵語精選"', 'genre': 'cantopop', 'duration_ms': 228493, 'popularity': 42, 'danceability': 0.528, 'energy': 0.36, 'valence': 0.225, 'tempo': 107.971, 'acousticness': 0.739, 'instrumentalness': 1.31e-06, 'liveness': 0.101, 'speechiness': 0.0369, 'loudness': -11.301, 'mode': 0, 'key': 0, 'time_signatu