In [1]:
import os, json, joblib, numpy as np, pandas as pd
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

ROOT = Path(os.path.abspath(os.path.join(os.getcwd(), ".."))) if ("EDA" in os.getcwd() or "scripts" in os.getcwd()) else Path(os.getcwd())

DATA_CSV   = ROOT / "data" / "clean_data" / "spotify_features_with_info.csv"
MODELS_DIR = ROOT / "models"
ART_DIR    = ROOT / "artifacts"
EMB_DIR    = ART_DIR / "embeddings"
TAB_DIR    = ART_DIR / "tables"
FIG_DIR    = ART_DIR / "figures"

for d in [MODELS_DIR, ART_DIR, EMB_DIR, TAB_DIR, FIG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

assert DATA_CSV.exists(), f"Missing dataset: {DATA_CSV}"
df = pd.read_csv(DATA_CSV)

name_col   = "track_name"  if "track_name"  in df.columns else None
artist_col = "artist_name" if "artist_name" in df.columns else None

exclude = {"popularity", "popular_flag"}
num_cols = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]
X_audio = df[num_cols].copy()

scaler = joblib.load(MODELS_DIR / "scaler.joblib")
X_aduio_scaled = scaler.transform(X_audio)

print("Rows:", len(df), "| audio features:", len(num_cols))
print("Examples:", num_cols[:8])

Rows: 130326 | audio features: 50
Examples: ['duration_ms', 'key', 'mode', 'tempo', 'time_signature', 'total_beats', 'energy_sqrt', 'danceability_sqrt']


**Title normalization**

In [2]:
def normalize_title(s: str) -> str:
    if not isinstance(s, str): return ""
    s = s.lower().strip()
    replacements = {
        "–": "-", "—": "-", "’": "'", "“": '"', "”": '"',
        " (feat.": " feat ", "(feat.": " feat ", " feat.": " feat ",
        " (live)": " live", "(live)": " live"
    }
    for k, v in replacements.items():
        s = s.replace(k, v)
    return " ".join(s.split())

titles_raw = df[name_col].fillna("") if name_col else pd.Series([""] * len(df))
titles = titles_raw.apply(normalize_title)
print("Sample titles:", titles.head().tolist())

Sample titles: ['so temporary', 'el olotito', 'reckless', 'dämonen', 'smoke too much']


**Sentence embedding**

In [5]:
from sentence_transformers import SentenceTransformer

MODEL_NAME = "all-MiniLM-L6-v2"
encoder = SentenceTransformer(MODEL_NAME)

E_text = encoder.encode(titles.tolist(), normalize_embeddings = True, show_progress_bar = True)
E_text = np.asarray(E_text)
print("Enbedding matrix shape:", E_text.shape)
np.save(EMB_DIR / "title_embeddings_minilm.npy", E_text)

meta = {
    "model" : MODEL_NAME,
    "normalized" : True,
    "shape" : E_text.shape,
    "name_col" : name_col,
    "artist_col" : artist_col
}
with open(EMB_DIR / "embeddings_meta.json", "w") as f:
    json.dump(meta, f, indent = 2)
print("Saved embeddings & meta.")




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4073 [00:00<?, ?it/s]

Enbedding matrix shape: (130326, 384)
Saved embeddings & meta.


**Retrieval**

In [29]:
import re
from sklearn.metrics.pairwise import cosine_similarity

def _tok(s: str):
    return re.findall(r"[a-z0-9]+", s.lower()) if isinstance(s, str) else []

ANTONYMS = {
    "happy": {"sad", "melancholy", "gloomy", "downbeat"},
    "sad": {"happy", "cheerful", "uplifting"},
    "chill": {"hype", "intense", "aggressive"},
    "upbeat": {"downbeat"},
    "calm": {"angry", "aggressive"},
    "dark": {"bright", "uplifting", "happy"},
    "bright": {"dark", "gloomy"}
}

def _keyword_bonus_and_penalty(query_tokens, title_tokens):
    qt = set(query_tokens)
    tt = set(title_tokens)

    overlap = qt & tt
    bonus = min(len(overlap), 4) * 0.02

    penalty = 0.0
    for q in qt:
        if q in ANTONYMS and (ANTONYMS[q] & tt):
            penalty += 0.04

    return bonus, penalty, sorted(list(overlap))[:4]

def search_text(query: str, top_k: int = 10, w_cos: float = 0.8, w_kw: float = 0.2, w_pen: float = 0.4):
    q_norm = normalize_title(query)
    q_vec = encoder.encode([q_norm], normalize_embeddings = True)
    cos = cosine_similarity(q_vec, E_text)[0]

    q_tokens = _tok(q_norm)
    prelim_k = max(top_k * 5, 50)
    prelim_index = np.argpartition(-cos, range(min(prelim_k, len(cos))))[:prelim_k]

    bonuses = np.zeros_like(cos)
    penalties = np.zeros_like(cos)
    overlaps = [""] * len(cos)

    for i in prelim_index:
        t_tokens = _tok(titles.iloc[i])
        b, p, ov = _keyword_bonus_and_penalty(q_tokens, t_tokens)
        bonuses[i] = b
        penalties[i] = p
        overlaps[i] = ", ".join(ov)

    final = (w_cos * cos) - (w_pen * penalties) + (w_kw * bonuses)

    top_index = np.argpartition(-final, range(min(top_k, len(final))))[:top_k]
    top_index = top_index[np.argsort(-final[top_index])]

    cols = [c for c in ["track_name", "artist_name", "popularity"] if c in df.columns]
    out = df.iloc[top_index][cols].copy()
    out.insert(0, "score", final[top_index].round(4))
    out.insert(1, "similarity", cos[top_index].round(4))
    out.insert(2, "kw_bonus", bonuses[top_index].round(4))
    out.insert(3, "kw_penalty", penalties[top_index].round(4))
    out["why_matched_tokens"] = [overlaps[i] for i in top_index]
    return out.reset_index(drop = True)

display(search_text("happy songs", top_k = 10))

Unnamed: 0,score,similarity,kw_bonus,kw_penalty,track_name,artist_name,popularity,why_matched_tokens
0,0.7389,0.9187,0.02,0.0,Happy Song,Jimi Charles Moody,34,happy
1,0.5763,0.7154,0.02,0.0,Happy Rock,The Electric Light Bulb Orchestra,7,happy
2,0.563,0.7037,0.0,0.0,Music for Happiness,Joey Pecoraro,61,
3,0.5595,0.6943,0.02,0.0,Happy Now - Acoustic,Zedd,64,happy
4,0.5521,0.6901,0.0,0.0,Happyness Rock,Image Sounds,0,
5,0.5459,0.6774,0.02,0.0,Happy (Instrumental),Big Beats,0,happy
6,0.5309,0.6587,0.02,0.0,Happy Happy,Shashank Sheshagiri,7,happy
7,0.5309,0.6587,0.02,0.0,Happy Happy,Nacho,63,happy
8,0.5303,0.6629,0.0,0.0,Love Song,Sean C. Johnson,30,
9,0.5303,0.6629,0.0,0.0,Love Song,Jared Grimble,37,


**Prediction**

In [16]:
y = df["popularity"].astype(float).values

X_fused = np.hstack([E_text, X_aduio_scaled])

X_tr, X_te, y_tr, y_te = train_test_split(X_fused, y, test_size = 0.2, random_state = 42)

ridge = Ridge(alpha = 2.0, random_state=42)
ridge.fit(X_tr, y_tr)
pred = ridge.predict(X_te)

R2 = round(float(r2_score(y_te, pred)), 4)
MAE = round(float(mean_absolute_error(y_te, pred)), 4)
RMSE = round(float(mean_squared_error(y_te, pred)), 4)
print(f"Ridge (Fused) -> R2 = {R2} | MAE = {MAE} | RMSE = {RMSE}")

joblib.dump(ridge, MODELS_DIR / "ridge_fused_embeddings.joblib")
with open(ART_DIR / "metrics" / "nlp_fusion_metrics.json", "w") as f:
    json.dump({"model":"Ridge(fused)", "R2":R2, "MAE":MAE, "RMSE":RMSE}, f, indent = 2)

Ridge (Fused) -> R2 = 0.1878 | MAE = 14.5273 | RMSE = 310.8547


**Seed track**

In [31]:
df["_title_norm"] = titles
df["_artist_norm"] = df[artist_col].fillna("").apply(normalize_title) if artist_col else ""

def get_track_index(partial_title: str, partial_artist: str = None):
    if not isinstance(partial_title, str):
        return None
    t = normalize_title(partial_title)
    a = normalize_title(partial_artist) if isinstance(partial_artist, str) else ""
    mask = df["_title_norm"].str.contains(t, case = False, regex = False)
    if a:
        mask &= df["_artist_norm"].str.contains(a, case = False, regex = False)
    index = np.where(mask.values)[0]
    return int(index[0]) if len(index) else None

def topk_neighbors_fused(seed_index: int, top_k: int = 10):
    sims = cosine_similarity(X_fused[seed_index:seed_index + 1], X_fused)[0]
    sims[seed_index] = -np.inf
    top_index = np.argpartition(-sims, range(top_k))[:top_k]
    top_index = top_index[np.argsort(-sims[top_index])]

    cols = [c for c in ["track_name", "artist_name", "popularity"] if c in df.columns]
    out = df.iloc[top_index][cols].copy()
    out.insert(0, "similarity", sims[top_index].round(4))
    return out.reset_index(drop = True), top_index

seed = get_track_index("Ghost Town", "Kanye")
print("Seed index:", seed)
if seed is not None:
    recs, _ = topk_neighbors_fused(seed, top_k = 10)
    print("Seed:", df.loc[seed, name_col], "-", df.loc[seed, artist_col])
    display(recs)
else:
    print("Seed track not found.")

Seed index: 104409
Seed: Ghost Town - Kanye West


Unnamed: 0,similarity,track_name,artist_name,popularity
0,0.9347,The Bones,Maren Morris,87
1,0.9329,Fast,Juice WRLD,88
2,0.9317,FAKE LOVE,BTS,80
3,0.9307,Watch (feat. Lil Uzi Vert & Kanye West),Travis Scott,73
4,0.9279,Lucid Dreams,Juice WRLD,91
5,0.9256,"Zero - From the Original Motion Picture ""Ralph...",Imagine Dragons,77
6,0.9161,"Back to Life - from ""Bumblebee""",Hailee Steinfeld,75
7,0.911,Chlorine,Twenty One Pilots,84
8,0.9068,Price On My Head (feat. The Weeknd),NAV,81
9,0.9051,Blame It On Me,Post Malone,76


**Re-weight**

In [24]:
audio_weights = np.ones(X_aduio_scaled.shape[1], dtype = float)
pref_boost = {
    "danceability": 1.2,
    "valence": 1.15,
    "energy": 1.2
}
for k, w in pref_boost.items():
    if k in num_cols:
        audio_weights[num_cols.index(k)] = w

X_fused_weighted = np.hstack([E_text, X_aduio_scaled * audio_weights])

def topk_neighbors_fused_weighted(seed_index: int, top_k: int = 10):
    sims = cosine_similarity(X_fused_weighted[seed_index: seed_index + 1], X_fused_weighted)[0]
    sims[seed_index] = -np.inf
    top_index = np.argpartition(-sims, range(top_k))[:top_k]
    top_index = top_index[np.argsort(-sims[top_index])]
    cols = [c for c in ["track_name", "artist_name", "popularity"] if c in df.columns]
    out = df.iloc[top_index][cols].copy()
    out.insert(0, "similarity", sims[top_index].round(4))
    return out.reset_index(drop = True), top_index