In [None]:
import os, sys
from pathlib import Path

def chdir_to_repo_root():
    """Find the repository root, switch the working directory to it, and return it.

    The search starts at the current working directory and walks up through
    every ancestor, *including* the filesystem root. The first directory that
    contains both a ``src`` and a ``data`` entry is treated as the repo root.

    Side effects:
        * ``os.chdir`` to the detected root.
        * The root is prepended to ``sys.path`` (once) so ``src.*`` imports work.

    Returns:
        pathlib.Path: the detected repository root.

    Raises:
        RuntimeError: if neither the working directory nor any ancestor
            contains both ``src`` and ``data``.
    """
    start = Path.cwd()
    # `Path.parents` runs all the way up to the filesystem root, so a checkout
    # mounted directly at "/" is also considered (a `p != p.parent` loop stops
    # one level short of it).
    for candidate in (start, *start.parents):
        if (candidate / "src").exists() and (candidate / "data").exists():
            os.chdir(candidate)
            root_str = str(candidate)
            # Avoid stacking duplicate entries when the cell is re-run.
            if root_str not in sys.path:
                sys.path.insert(0, root_str)
            return candidate
    raise RuntimeError("Could not find repo root (looking for 'src' and 'data').")

# Move the kernel into the repository root (and put it on sys.path) before any
# other cell runs, then echo the chosen directory so a wrong root is obvious.
ROOT = chdir_to_repo_root()
print("CWD:", ROOT)

In [None]:
import pandas as pd
from pathlib import Path
from src.models.knn_recommender import recommend_hybrid, META_PATH

In [None]:
import pandas as pd
from src.models.knn_recommender import META_PATH

# Load the track metadata table and take a quick look at its schema.
m = pd.read_parquet(META_PATH)
column_names = list(m.columns)

# First 50 column names, alphabetically.
print("columns:", sorted(column_names)[:50])

# Columns whose name mentions "tempo", ignoring case.
tempo_like = list(filter(lambda name: "tempo" in name.lower(), column_names))
print("tempo-like:", tempo_like)

In [None]:
import pandas as pd
from src.models.knn_recommender import META_PATH
# Audio-feature column names we hope to find in the metadata table.
wanted_features = {
    "tempo","key","mode","time_signature","energy","danceability","valence",
    "loudness","acousticness","instrumentalness"
}

m = pd.read_parquet(META_PATH)
# Report which of the wanted features are actually present, alphabetically.
present_features = sorted(name for name in m.columns if name in wanted_features)
print(present_features)

In [None]:
import os, pandas as pd
from pathlib import Path
from src.config import settings

# Inspect the raw echonest CSV: show its column names before and after
# normalisation, then list which normalised names mention each audio keyword.
p = settings.FMA_METADATA_DIR / "echonest.csv"
en = pd.read_csv(p, index_col=0, low_memory=False)
raw_columns = en.columns
print("RAW sample columns:", raw_columns[:25].tolist(), " ... (total:", len(raw_columns), ")")

# Normalise names step by step: trim + lowercase, collapse every run of
# non-alphanumerics to "_", strip known prefixes, then drop a trailing "_".
normalized_columns = raw_columns.str.strip().str.lower()
for pattern, replacement in (
    (r"[^a-z0-9]+", "_"),
    (r"^(echonest_|audio_features_|audio_)", ""),
    (r"_$", ""),
):
    normalized_columns = normalized_columns.str.replace(pattern, replacement, regex=True)

en_norm = en.copy()
en_norm.columns = normalized_columns
print("NORM sample columns:", en_norm.columns[:25].tolist())

# For each keyword, show up to ten normalised column names containing it.
keywords = ["tempo","key","mode","time_signature","energy","dance","valence","loud","acoustic","instrumental"]
for kw in keywords:
    hits = [c for c in en_norm.columns if kw in c]
    print(f"{kw:>14} ->", hits[:10])

In [None]:
import pandas as pd
from src.models.knn_recommender import META_PATH
# Audio-feature column names we hope to find in the metadata table.
wanted_features = {
    "tempo","key","mode","time_signature","energy","danceability","valence",
    "loudness","acousticness","instrumentalness"
}

m = pd.read_parquet(META_PATH)
# Report which of the wanted features are present, alphabetically.
present_features = sorted(name for name in m.columns if name in wanted_features)
print(present_features)

# Count rows with a usable tempo; a missing column counts as zero.
if "tempo" in m.columns:
    tempo_non_null = m["tempo"].notna().sum()
else:
    tempo_non_null = 0
print("tempo non-null:", tempo_non_null)

In [None]:
from pathlib import Path
import pandas as pd
from src.models.knn_recommender import META_PATH, ROOT

# Backfill missing `tempo` values in the metadata table from the per-track
# feature vectors, then write the metadata back to the same parquet file.
import numpy as np  # local import: this cell's import lines do not bring in numpy

meta_path = META_PATH
feats_path = ROOT / "data/processed/fma_small_feats_v2.parquet"

m = pd.read_parquet(meta_path)                       # has track_id as a column
f = pd.read_parquet(feats_path)[["track_id", "feature"]].copy()

# extract tempo from the last element of the 61-D vector.
# List-typed parquet columns typically come back from pandas as numpy arrays,
# so ndarray must be accepted here; checking only list/tuple makes every row
# fall through to the "missing" branch and the backfill silently does nothing.
f["tempo_feat"] = f["feature"].apply(
    lambda v: float(v[-1]) if isinstance(v, (list, tuple, np.ndarray)) and len(v) > 0 else np.nan
)
f = f.drop(columns=["feature"])

# Same key dtype on both sides, otherwise the merge can fail or match nothing.
m["track_id"] = m["track_id"].astype(int)
f["track_id"] = f["track_id"].astype(int)

# One feature row per track: duplicated keys on the right side of a left merge
# would multiply metadata rows, and the result is written back over the source.
f = f.drop_duplicates(subset="track_id")

# left-merge on track_id, prefer existing tempo but fill from tempo_feat
m = m.merge(f, on="track_id", how="left")
if "tempo" not in m.columns:
    m["tempo"] = np.nan                              # NaN keeps the column numeric (None would make it object)
m["tempo"] = pd.to_numeric(m["tempo"], errors="coerce")
m["tempo"] = m["tempo"].fillna(m["tempo_feat"])
m = m.drop(columns=["tempo_feat"])

# save back (overwrites the metadata file in place)
m.to_parquet(meta_path, index=False)

# sanity
print("tempo non-null after backfill:", m["tempo"].notna().sum(), "of", len(m))

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from src.models.knn_recommender import META_PATH, ROOT

# Backfill missing `tempo` values in the metadata table from the per-track
# feature vectors, then write the metadata back to the same parquet file.
meta_path  = META_PATH
feats_path = ROOT / "data/processed/fma_small_feats_v2.parquet"

m = pd.read_parquet(meta_path).copy()
f = pd.read_parquet(feats_path)[["track_id", "feature"]].copy()

# extract tempo from the last element of the 61-D vector (ndarray/list/tuple)
f["tempo_feat"] = f["feature"].apply(
    lambda v: float(v[-1]) if isinstance(v, (list, tuple, np.ndarray)) and len(v) > 0 else np.nan
)
f = f.drop(columns=["feature"])

# ensure same dtype for merge
m["track_id"] = m["track_id"].astype(int)
f["track_id"] = f["track_id"].astype(int)

# Keep one feature row per track. Duplicate keys on the right side of a left
# merge would multiply metadata rows, and the inflated table would then be
# persisted over the original file below.
f = f.drop_duplicates(subset="track_id")

# left-merge and fill: an existing tempo wins, gaps are taken from tempo_feat
m = m.merge(f, on="track_id", how="left")
if "tempo" not in m.columns:
    m["tempo"] = np.nan
m["tempo"] = pd.to_numeric(m["tempo"], errors="coerce")   # non-numeric entries become NaN and get backfilled
m["tempo"] = m["tempo"].fillna(m["tempo_feat"])
m = m.drop(columns=["tempo_feat"])

# save & sanity (overwrites the metadata file in place)
m.to_parquet(meta_path, index=False)
print("tempo non-null after backfill:", m["tempo"].notna().sum(), "of", len(m))
print(m.loc[m["tempo"].notna(), ["track_id","tempo"]].head())

In [None]:
import pandas as pd
from src.models.knn_recommender import META_PATH
from src.models.knn_recommender import recommend_hybrid

# Spot-check the hybrid recommender: compare the seed track's tempo with the
# tempos of its top-10 recommendations.
seed = 2
rec_ids, scores = recommend_hybrid(seed, top=10)

# Index the metadata by track id so seed and recommendations are looked up by label.
meta = pd.read_parquet(META_PATH).set_index("track_id")
seed_tempo = meta.loc[seed, "tempo"]
print("seed tempo:", seed_tempo)

display_columns = ["artist_name","track_title","tempo","year"]
print(meta.loc[rec_ids, display_columns])

# Average absolute tempo gap between the seed and its recommendations.
tempo_gap = meta.loc[rec_ids, "tempo"].sub(seed_tempo).abs()
print("mean |tempo diff|:", tempo_gap.mean())