In [46]:
import numpy as np
import pandas as pd
import pandas as pd
from typing import List, Dict, Any


# Load the five librosa feature shards and stack them into one frame.
# NOTE(review): unpickling executes arbitrary code — only load shards from a trusted source.
_shard_paths = (
    "data/librosa_features/librosa_shard_0.pkl",
    "data/librosa_features/librosa_shard_1.pkl",
    "data/librosa_features/librosa_shard_2.pkl",
    "data/librosa_features/librosa_shard_3.pkl",
    "data/librosa_features/librosa_shard_4.pkl",
)
pickle1, pickle2, pickle3, pickle4, pickle5 = map(pd.read_pickle, _shard_paths)

# Row labels are kept as they come from each shard (no ignore_index), so
# downstream code should address rows by position (``iloc``).
df = pd.concat([pickle1, pickle2, pickle3, pickle4, pickle5])

# Dummy embedding function. Replace with a real implementation later.

def embed_lyrics(text: str) -> np.ndarray:
    """Return a placeholder embedding for ``text``.

    The input is ignored: every call yields a fresh all-zero float vector of
    length 16. The length is arbitrary because the dummy retrieval step does
    not look at the embedding.
    """
    placeholder_dim = 16
    return np.zeros((placeholder_dim,), dtype=np.float64)


# Dummy FAISS function. Replace with a real implementation later.
def retrieve_similar_songs(query_embedding: np.ndarray, k: int = 5) -> List[Dict[str, Any]]:
    """Stand-in for a vector search: pick up to ``k`` distinct random rows.

    ``query_embedding`` is ignored. Positions are sampled without replacement
    from ``range(len(df))`` using the module-level ``df``; ``k`` is capped at
    the number of rows. Each hit is a dict with the row position under
    ``"index"`` and a constant dummy ``"similarity"`` of 1.0.
    """
    corpus_size = len(df)
    sample_size = min(k, corpus_size)
    # Sampling without replacement guarantees unique positions.
    picked = np.random.choice(corpus_size, size=sample_size, replace=False)
    return [{"index": int(pos), "similarity": 1.0} for pos in picked]



In [47]:
# Preview the first rows of the combined frame to sanity-check the loaded columns.
df.head()

Unnamed: 0,song_id,title,artist,query_title,query_artist,track_genre,popularity,lyrics,preview_url,track_id,...,spectral_contrast_4,spectral_contrast_5,spectral_contrast_6,spectral_contrast_7,tonnetz_1,tonnetz_2,tonnetz_3,tonnetz_4,tonnetz_5,tonnetz_6
0,4845,State of Mind,Scooter,state of mind,scooter,happy,24.0,The world seems not the same...\n\nIntroducing...,https://audio-ssl.itunes.apple.com/itunes-asse...,1692327616,...,20.798407,20.426306,18.328021,39.053367,0.197966,-0.116721,0.142559,-0.069539,-0.044986,-0.047523
1,462,Reptilia,The Strokes,reptilia,the strokes,alt-rock,75.0,[Verse 1]\nHe seemed impressed by the way you ...,https://audio-ssl.itunes.apple.com/itunes-asse...,302987569,...,15.921962,18.347155,17.382681,39.012014,0.078138,-0.077754,0.063345,0.036541,-0.011976,-0.014041
2,16017,None Of My Business,Cher Lloyd,none of my business,cher lloyd,electro,64.0,"[Chorus]\nDamn, I heard that you and her been ...",https://audio-ssl.itunes.apple.com/itunes-asse...,1438630505,...,18.045995,19.84423,18.248683,39.966514,0.013912,0.1729,-0.092766,-0.056323,-0.004173,-0.014388
3,9478,Trouble Sleeping,The Perishers,trouble sleeping,the perishers,acoustic,48.0,I'm having trouble sleeping\nYou're jumping in...,https://audio-ssl.itunes.apple.com/itunes-asse...,89335271,...,17.768873,20.125185,16.969837,28.947224,-0.118755,0.195544,0.025169,-0.130705,0.024176,0.005865
4,2822,Shot in the Dark,Ozzy Osbourne,shot in the dark,ozzy osbourne,hard-rock,65.0,[Verse 1]\nOut on the streets I'm stalking the...,https://audio-ssl.itunes.apple.com/itunes-asse...,158711416,...,15.990772,16.834263,17.184653,35.540522,-0.113671,0.023209,-0.029743,-0.051142,0.003486,-0.011837


In [48]:
# Every column positioned after "duration" is treated as an audio feature.
_first_audio_pos = df.columns.get_loc("duration") + 1
audio_feature_cols = list(df.columns[_first_audio_pos:])

print("Audio feature columns:", audio_feature_cols[:10], "... total:", len(audio_feature_cols))


Audio feature columns: ['rms_energy', 'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff', 'zero_crossing_rate', 'tempo', 'chroma_stft_1', 'chroma_cqt_1', 'chroma_cens_1', 'chroma_stft_2'] ... total: 81


In [49]:
def _coerce_feature_value(value):
    """Convert one raw feature value to ``float``; return ``None`` if it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return None


def _flatten_audio_features(row, feature_cols):
    """Flatten the audio-feature cells of one row into a ``{name: float | None}`` dict.

    Scalar cells keep their column name. ``np.ndarray`` and ``list`` cells are
    expanded to ``"<col>_<j>"`` entries, one per element (arrays are flattened
    first). Anything that cannot be converted to ``float`` is stored as ``None``.
    """
    features = {}
    for col in feature_cols:
        val = row[col]
        if isinstance(val, np.ndarray):
            parts = val.ravel()
        elif isinstance(val, list):
            parts = val
        else:
            features[col] = _coerce_feature_value(val)
            continue
        for j, part in enumerate(parts):
            features[f"{col}_{j}"] = _coerce_feature_value(part)
    return features


def get_top_k_neighbors(df, query_embedding, k=5):
    """Look up the retrieved neighbors in ``df`` and package them for prompting.

    Parameters
    ----------
    df : pandas.DataFrame
        Song table, addressed by position. Must contain ``song_id``, ``title``,
        ``artist``, ``popularity``, ``lyrics`` and every column listed in the
        module-level ``audio_feature_cols``.
    query_embedding :
        Passed through unchanged to ``retrieve_similar_songs``.
    k : int
        Number of neighbors to request.

    Returns
    -------
    list of dict
        One dict per neighbor with keys ``song_id``, ``title``, ``artist``,
        ``similarity`` (``None`` when the retriever did not supply one),
        ``popularity`` (float), ``lyrics_snippet`` (first 400 characters,
        newlines replaced by spaces, followed by ``"..."``) and
        ``audio_features`` (flat ``{name: float | None}`` dict).
    """
    raw_neighbors = retrieve_similar_songs(query_embedding, k=k)
    neighbors = []

    for hit in raw_neighbors:
        idx = hit["index"]
        # FAISS-style searches report "no neighbor found" as index -1; with
        # ``iloc`` that would silently select the LAST row, so skip such hits.
        if idx < 0:
            continue
        row = df.iloc[idx]

        # Missing lyrics arrive as None/NaN, which cannot be sliced.
        lyrics = row["lyrics"]
        if not isinstance(lyrics, str):
            lyrics = ""

        neighbors.append({
            "song_id": row["song_id"],
            "title": row["title"],
            "artist": row["artist"],
            "similarity": hit.get("similarity", None),
            "popularity": float(row["popularity"]),
            "lyrics_snippet": lyrics[:400].replace("\n", " ") + "...",
            "audio_features": _flatten_audio_features(row, audio_feature_cols),
        })

    return neighbors


In [50]:
# TODO: continue working on prompting
def _format_feature_value(value: Any) -> str:
    """Render one audio-feature value for the prompt; missing values become ``n/a``."""
    if value is None:
        return "n/a"
    return f"{value:.4f}"


def build_rag_prompt_for_lyric_popularity(user_lyric: str, neighbors: List[Dict[str, Any]]) -> str:
    """Assemble the retrieval-augmented prompt sent to the LLM.

    Parameters
    ----------
    user_lyric : str
        The new lyric to score; surrounding whitespace is stripped.
    neighbors : list of dict
        Similar songs. Each dict must provide ``song_id``, ``title``,
        ``artist``, ``similarity`` (may be ``None``, in which case the line is
        omitted), ``popularity``, ``lyrics_snippet`` and ``audio_features``
        (a ``{name: number | None}`` mapping).

    Returns
    -------
    str
        Newline-joined prompt: task instructions, the new lyric, one section
        per neighbor, and a closing instruction.
    """
    separator = "------------------------------------------------------------"
    lines = [
        "You are an expert in music analytics, audio features, and lyric interpretation.",
        "You are given a NEW lyric and several similar songs from a dataset.",
        "",
        "Each similar song includes:",
        " - song_id, title, artist",
        " - lyric snippet",
        " - popularity score (0-100)",
        " - detailed audio features extracted from 30-second clips",
        "",
        "Your tasks are:",
        "  1. Predict a popularity score (0-100) for the NEW lyric.",
        "  2. Explain your reasoning using comparisons to the similar songs.",
        "",
        "Keep in mind the audio features of the similar songs, and explain what they mean in context to everyday people.",
        "",
        "Return your answer as VALID JSON with this exact format:",
        "{",
        '  "predicted_popularity": <number>,',
        '  "explanation": "<multi-paragraph explanation grounded in the provided songs>"',
        "}",
        "",
        "IMPORTANT:",
        "Return ONLY raw JSON.",
        "Do NOT include any code fences such as ``` json",
        "Do NOT include any explanation text outside the JSON.",
        "Do NOT add commentary before or after the JSON.",
        "Return JSON ONLY.",
        "",
        separator,
        "NEW LYRIC:",
        user_lyric.strip(),
        separator,
        "",
        "SIMILAR SONGS FROM THE DATASET (use these as evidence):",
    ]

    for i, nb in enumerate(neighbors, start=1):
        lines.append(f"\nNeighbor #{i}:")
        lines.append(f"  song_id: {nb['song_id']}")
        lines.append(f"  title: {nb['title']}")
        lines.append(f"  artist: {nb['artist']}")
        if nb["similarity"] is not None:
            lines.append(f"  similarity: {nb['similarity']:.4f}")
        lines.append(f"  popularity: {nb['popularity']:.2f}")
        lines.append(f"  lyrics_snippet: {nb['lyrics_snippet']}")
        lines.append("  audio_features:")
        # Feature values may be None when the source cell was not numeric;
        # formatting None with ":.4f" would raise, so route through the helper.
        lines.extend(
            f"    {feat_name}: {_format_feature_value(feat_val)}"
            for feat_name, feat_val in nb["audio_features"].items()
        )

    lines.append("")
    # Adjacent literals are concatenated verbatim, so each piece carries its
    # own trailing space.
    lines.append(
        "Using ONLY the information above, estimate the popularity of the new lyric "
        "and explain your reasoning in terms of lyric similarity, artist/genre patterns, "
        "and audio features (energy, brightness, tempo, chroma, MFCCs, contrasts, tonnetz, etc.). "
        "Make sure to contextualize what the audio features mean for the average person."
    )

    return "\n".join(lines)


In [51]:
import os
import re
import json
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment
# before the client is constructed.
load_dotenv()
# Module-level OpenAI client shared by call_llm_for_popularity_and_explanation.
# NOTE(review): presumably reads its API key from the environment — confirm.
client = OpenAI()

# A JSON-looking object with at most one level of nested braces.
_JSON_OBJECT_RE = re.compile(r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}", flags=re.DOTALL)
# A comma that sits directly in front of a closing brace/bracket.
_TRAILING_COMMA_RE = re.compile(r",\s*([}\]])")


def _extract_response_text(response) -> str:
    """Return the model's text from a Responses API result, stripped of whitespace."""
    # ``output_text`` is the SDK's aggregated text accessor; fall back to the
    # positional lookup for response objects that do not expose it.
    text = getattr(response, "output_text", None)
    if not isinstance(text, str):
        text = response.output[0].content[0].text
    return text.strip()


def _loads_json_object(text: str):
    """Parse ``text`` as JSON and return it only if it is a JSON object (dict)."""
    try:
        parsed = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # Callers use ``.get`` on the result, so bare numbers/lists are rejected.
    return parsed if isinstance(parsed, dict) else None


def _parse_llm_json(raw_text: str):
    """Best-effort extraction of a JSON object from LLM output; ``None`` on failure."""
    candidates = [raw_text] + _JSON_OBJECT_RE.findall(raw_text)

    # Pass 1: strict parse of the whole text, then of each embedded object.
    for candidate in candidates:
        parsed = _loads_json_object(candidate)
        if parsed is not None:
            return parsed

    # Pass 2: same candidates with trailing commas removed.
    for candidate in candidates:
        parsed = _loads_json_object(_TRAILING_COMMA_RE.sub(r"\1", candidate))
        if parsed is not None:
            return parsed

    return None


def call_llm_for_popularity_and_explanation(prompt: str) -> dict:
    """Send ``prompt`` to the LLM and return its answer as a dict.

    Uses the module-level ``client``. Code fences are stripped from the reply,
    then the text is parsed as JSON (whole text, embedded ``{...}`` blocks,
    and trailing-comma-repaired variants, in that order).

    Returns
    -------
    dict
        The parsed JSON object on success. If nothing parses, a message is
        printed and ``{"predicted_popularity": None, "explanation": <raw text>}``
        is returned so callers can still show the model's reply.
    """
    response = client.responses.create(
        model="gpt-4o",
        input=prompt,
        temperature=0.2,
        max_output_tokens=900
    )

    raw_text = _extract_response_text(response)

    # Remove any ```json ...``` or ``` fences the model added despite instructions.
    raw_text = raw_text.replace("```json", "").replace("```", "").strip()

    parsed = _parse_llm_json(raw_text)
    if parsed is not None:
        return parsed

    print("Could not parse JSON from LLM output. Returning raw text.")
    return {
        "predicted_popularity": None,
        "explanation": raw_text
    }





In [52]:
def rag_lyric_popularity_system(df: pd.DataFrame, user_lyric: str, k_neighbors: int = 5) -> Dict[str, Any]:
    """Run the full pipeline: embed, retrieve, build prompt, query the LLM.

    Returns a dict with the LLM's ``predicted_popularity`` (``None`` if
    absent), its ``explanation`` (empty string if absent), the
    ``neighbors_used`` for grounding, and the exact ``prompt_sent``.
    """
    # Embed the lyric, then fetch the songs retrieved for that embedding.
    lyric_vector = embed_lyrics(user_lyric)
    similar_songs = get_top_k_neighbors(df, lyric_vector, k=k_neighbors)

    # Turn lyric + neighbors into a prompt and ask the model.
    rag_prompt = build_rag_prompt_for_lyric_popularity(user_lyric, similar_songs)
    answer = call_llm_for_popularity_and_explanation(rag_prompt)

    return {
        "predicted_popularity": answer.get("predicted_popularity", None),
        "explanation": answer.get("explanation", ""),
        "neighbors_used": similar_songs,
        "prompt_sent": rag_prompt,
    }


# Now we can test the system

In [53]:
# Smoke test: run the whole pipeline on a throwaway lyric with 3 neighbors.
# NOTE(review): the saved output of this cell quotes a different lyric than
# `test_lyric`, so it was presumably produced by an earlier run — re-run to refresh.
test_lyric = "random test lyrics."
result = rag_lyric_popularity_system(df, test_lyric, k_neighbors=3)

# Show the model's score and its free-text justification.
print("Predicted popularity:", result["predicted_popularity"])
print("\nExplanation:\n", result["explanation"])


Predicted popularity: 44

Explanation:
 The new lyric 'Dancing through the shadows of the night, I feel the world ignite' shares thematic elements with the similar songs provided, particularly in its evocative and somewhat abstract imagery. This aligns closely with Neighbor #1, 'Would You Believe' by Catz 'n Dogz, which has a popularity score of 44. The lyrical content of both songs involves a sense of freedom and introspection, which may appeal to a niche audience rather than a mainstream one.

In terms of audio features, Neighbor #1 has a moderate tempo of 119.68 BPM and a spectral centroid of 1588.52, indicating a balanced brightness and energy level. The RMS energy of 0.1800 suggests a moderate loudness, which complements the introspective nature of the lyrics. The chroma features, which relate to the harmonic content, show a diverse range, suggesting a rich harmonic texture that could enhance the emotional depth of the lyrics.

Neighbor #2, 'Vacaciones' by Estopa, has a higher pop