In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack, csr_matrix

# --- 1) Load data ---
df = pd.read_csv("~/Developer/Diplomado UADE/Recomendador de discos/model_python/light_spotify_dataset.csv")


# Seleccionamos las columnas que necesitamos para el analisis
needed_cols = ["song", "artist", "Danceability", "Energy", "Positiveness", "Loudness"]
missing = [c for c in needed_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Eliminamos nulos
df = df.dropna(subset=["song", "artist", "Danceability", "Energy", "Positiveness", "Loudness"]).reset_index(drop=True)



# --- Preparamos las features ---

# features numericas elegidas para el modelo
num_features = df[["Danceability", "Energy", "Positiveness", "Loudness"]].to_numpy(dtype=float)

# Scalar features numericas
scaler = StandardScaler()
num_scaled = scaler.fit_transform(num_features)   
X = csr_matrix(num_scaled)   


# --- NearestNeighbors ---
best_params = {"n_neighbors": 10, "metric": "cosine", "algorithm": "brute"}
knn = NearestNeighbors(**best_params)
knn.fit(X)


## Hiperparametros elegidos
## 10 - queremos una seleccion de 10 canciones
## metric: cosine - mide el angulo entre los vectores, utilizado mucho en datasets de musica, se focaliza en similaridad de patron en vez de 
##   distancia cruda
## algorith: brute - ball_tree y kc_tree no funcionan bien con cosine. Low-dimensional (solo 4 features). 


# --- single feature vector by row ---
def build_feature_vector(row_idx: int):
    row = df.loc[row_idx, ["Danceability", "Energy", "Positiveness", "Loudness"]].to_numpy(dtype=float).reshape(1, -1)
    e_scaled = scaler.transform(row)
    return csr_matrix(e_scaled)

    

# --- Cancion por index junto con artista ---
def get_track_index(track: str, artist: str | None = None) -> int:
    name_mask = df["song"].str.casefold().str.strip() == track.casefold().strip()
    if not name_mask.any():
        raise ValueError(f"Track '{track}' not found.")

    if artist is not None and "artist" in df.columns:
        artist_mask = df["artist"].str.casefold().str.strip() == artist.casefold().strip()
        mask = name_mask & artist_mask
        if not mask.any():
            options = df.loc[name_mask, "artist"].dropna().unique().tolist()
            raise ValueError(f"No match for '{track}' by '{artist}'. "
                             f"Available artists for that title: {options}")
        idxs = df.index[mask].tolist()
    else:
        idxs = df.index[name_mask].tolist()
        if len(idxs) > 1:
            options = df.loc[idxs, "artist"].dropna().unique().tolist()
            raise ValueError(f"Ambiguous track title '{track}'. Please specify artist. "
                             f"Options: {options}")

    return idxs[0]

# --- Encuentra canciones similares (case sensitive) ---
def recommend_by_track_name(track: str, top_k: int = 10, artist: str | None = None):
    idx = get_track_index(track, artist)
    q = build_feature_vector(idx)

    distances, indices = knn.kneighbors(q, n_neighbors=top_k + 1)
    recs = []
    for d, i in zip(distances[0], indices[0]):
        if i == idx:  # skip the seed itself
            continue
        recs.append({
            "song": df.loc[i, "song"],
            "artist": df.loc[i, "artist"],
            "Danceability": df.loc[i, "Danceability"],
            "Energy": df.loc[i, "Energy"],
            "Positiveness": df.loc[i, "Positiveness"],
            "Loudness": df.loc[i, "Loudness"],
            "distance": float(d)
        })
        if len(recs) == top_k:
            break
    return recs

# --- Guardar artifacts para frontend ---
artifacts = {
    "knn_model": knn,
    "scaler": scaler,
    "best_params": best_params,
    "feature_matrix_shape": X.shape,
    "track_index": df[["song", "artist", "Danceability", "Energy", "Positiveness", "Loudness"]],
}
joblib.dump(artifacts, "music_recommender_numeric_small.joblib")
print("💾 Saved: music_recommender_numeric_small.joblib")


💾 Saved: music_recommender_numeric_small.joblib


In [3]:
# --- smoke test ---
try:
    user_track = input("Enter a track name: ").strip()
    user_artist = None

    try:
        recs = recommend_by_track_name(user_track, top_k=10)
        print(f"\nRecommendations for: {user_track}\n")
    except ValueError as amb:
        text = str(amb)
        if "Please specify artist" in text:
            print(text)
            user_artist = input("Enter the artist to disambiguate: ").strip()
            recs = recommend_by_track_name(user_track, top_k=10, artist=user_artist)
            print(f"\nRecommendations for: {user_track} | {user_artist}\n")
        else:
            raise

    for r in recs:
        print(f"  - {r['song']} | {r['artist']} | "
              f"Danceability={r['Danceability']} | Energy={r['Energy']} | "
              f"Positiveness={r['Positiveness']} | Loudness={r['Loudness']} | "
              f"dist={r['distance']:.3f}")

except Exception as e:
    print("Smoke test skipped:", e)

Enter a track name:  Dance


Ambiguous track title 'Dance'. Please specify artist. Options: ['ABBA', 'Indiana Bible College', 'James Taylor', 'Planetshakers', 'Roy Orbison', 'Gary Numan', 'Tim Baker', 'Speaker Knockerz', 'Rick Astley', 'Nasty C', 'Nas', 'Megan Thee Stallion', 'Maejor', 'Lotus Juice', 'DNCE', 'CLMD & Tungevaag', '1nonly']


Enter the artist to disambiguate:  Rick astley



Recommendations for: Dance | Rick astley

  - Real Nis | The Diplomats | Danceability=67 | Energy=74 | Positiveness=65 | Loudness=-5.0 | dist=0.000
  - Furniture | Maude Latour | Danceability=72 | Energy=82 | Positiveness=78 | Loudness=-2.87 | dist=0.000
  - Nameless World | Skip The Use | Danceability=70 | Energy=78 | Positiveness=71 | Loudness=-3.8 | dist=0.000
  - Back It Up | Monty | Danceability=70 | Energy=78 | Positiveness=71 | Loudness=-3.97 | dist=0.000
  - Get Down | Lil Wayne | Danceability=72 | Energy=82 | Positiveness=76 | Loudness=-3.04 | dist=0.000
  - Get Down | Lil Wayne | Danceability=72 | Energy=82 | Positiveness=76 | Loudness=-3.04 | dist=0.000
  - Playas Gon Play | 3LW | Danceability=66 | Energy=72 | Positiveness=62 | Loudness=-5.47 | dist=0.000
  - Kick You Out | Loren Gray | Danceability=66 | Energy=73 | Positiveness=63 | Loudness=-5.18 | dist=0.000
  - Bang Bang | Jessie J, Ariana Grande & Nicki Minaj | Danceability=71 | Energy=79 | Positiveness=75 | Loudness=-

In [9]:
track = "Dance"
artist = "Rick Astley"

# 1) find the row index
idx = get_track_index(track, artist)

# 2) build query vector for that row
q = build_feature_vector(idx)

# 3) get neighbors (over-ask a bit so we can skip seed/dupes)
k = 10
distances, indices = knn.kneighbors(q, n_neighbors=k + 1)

seen = set()
count = 0
for d, i in zip(distances[0], indices[0]):
    if i == idx:
        continue  # skip the seed itself

    # similarity for cosine metric
    sim = max(0.0, min(1.0, 1.0 - float(d)))  # clamp to [0,1]

    # optional: de-duplicate same (song, artist)
    key = (df.at[i, "song"], df.at[i, "artist"])
    if key in seen:
        continue
    seen.add(key)

    song = df.at[i, "song"]
    art  = df.at[i, "artist"]
    print(f"{song} | {art} | dist={d:.8f} | sim={sim:.4f}")

    count += 1
    if count == k:
        break

Real Nis | The Diplomats | dist=0.00008756 | sim=0.9999
Furniture | Maude Latour | dist=0.00013663 | sim=0.9999
Nameless World | Skip The Use | dist=0.00014858 | sim=0.9999
Back It Up | Monty | dist=0.00022625 | sim=0.9998
Get Down | Lil Wayne | dist=0.00022776 | sim=0.9998
Playas Gon Play | 3LW | dist=0.00031362 | sim=0.9997
Kick You Out | Loren Gray | dist=0.00033958 | sim=0.9997
Bang Bang | Jessie J, Ariana Grande & Nicki Minaj | dist=0.00034088 | sim=0.9997
So Much It Hurts | Niki & The Dove | dist=0.00034088 | sim=0.9997
