This notebook begins the steps for integrating the data into a RAG system. We take the user-input lyrics, compute their embedding, and use FAISS to find and retrieve the k most similar lyrics in our data. We then give the LLM the lyrics together with those similar songs, and it predicts a popularity score and explains why, based on the popularity and audio features of the similar songs.

There is still work to be done:

- we use the LLM for prediction and explanations. Using a prediction model instead can lead to better reproducibility (consistent scores)
- we currently use dummy lyric embedding and FAISS similarity functions
- better prompting techniques
- deep eval to evaluate the performance of RAG

## FAISS INDEX

FileNotFoundError: [Errno 2] No such file or directory: 'data/lyric_embeddings/librosa_shard_0_clean_ml_with_emb.pkl'

In [1]:
import numpy as np
import pandas as pd
import pandas as pd
from typing import List, Dict, Any


# Load the five pre-computed shard files. The paths are relative, so they are
# resolved against the current working directory; a different working
# directory produces the FileNotFoundError seen in this notebook's output.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so only
# load shard files that come from a trusted source.
pickle1 = pd.read_pickle("data/lyric_embeddings/librosa_shard_0_clean_ml_with_emb.pkl")
pickle2 = pd.read_pickle("data/lyric_embeddings/librosa_shard_1_clean_ml_with_emb.pkl")
pickle3 = pd.read_pickle("data/lyric_embeddings/librosa_shard_2_clean_ml_with_emb.pkl")
pickle4 = pd.read_pickle("data/lyric_embeddings/librosa_shard_3_clean_ml_with_emb.pkl")
pickle5 = pd.read_pickle("data/lyric_embeddings/librosa_shard_4_clean_ml_with_emb.pkl")

# Stack the shards into a single frame. ignore_index is not passed, so every
# shard keeps its own index labels; the rest of the notebook addresses rows by
# position (df.iloc), not by label.
df = pd.concat([pickle1, pickle2, pickle3, pickle4, pickle5])

# Dummy embedding function. Replace with the real one later.
def embed_lyrics(text: str) -> np.ndarray:
    """Return a placeholder embedding for ``text``.

    The argument is ignored: every call produces a new all-zero float vector
    of length 16.

    Args:
        text: Lyric text to embed (currently unused).

    Returns:
        A 1-D float array of 16 zeros.
    """
    # The size is arbitrary because the dummy retrieval never reads the vector.
    placeholder_dim = 16
    return np.zeros((placeholder_dim,), dtype=np.float64)


# Dummy FAISS function. Replace with the real one later.
def retrieve_similar_songs(query_embedding: np.ndarray, k: int = 5) -> List[Dict[str, Any]]:
    """Return ``k`` randomly chosen rows of the global ``df`` as fake neighbors.

    Placeholder for a real similarity search: ``query_embedding`` is accepted
    but never read, and every hit reports the same similarity.

    Args:
        query_embedding: Embedding of the query lyric (currently unused).
        k: Number of neighbors requested; capped at ``len(df)``.

    Returns:
        A list of ``{"index": <row position>, "similarity": 1.0}`` dicts with
        distinct positions.
    """
    pool_size = len(df)
    sample_size = min(k, pool_size)

    # Sampling without replacement keeps the row positions unique.
    sampled_positions = np.random.choice(pool_size, size=sample_size, replace=False)

    return [
        {
            "index": int(position),
            "similarity": 1.0  # constant dummy similarity
        }
        for position in sampled_positions
    ]



FileNotFoundError: [Errno 2] No such file or directory: 'data/lyric_embeddings/librosa_shard_0_clean_ml_with_emb.pkl'

In [None]:
# Every column positioned after "duration" is treated as an audio feature.
# NOTE(review): assumes the "duration" label is unique, so get_loc returns a
# single integer position -- confirm against the shard schema.
_first_feature_pos = df.columns.get_loc("duration") + 1
audio_feature_cols = list(df.columns[_first_feature_pos:])
df.head()

In [None]:
def _coerce_feature_value(value):
    """Return ``value`` as a float, or None when it cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError):
        # Non-numeric strings, None, nested containers, etc.
        return None


def _iter_feature_leaves(value):
    """Yield the scalar leaves of a (possibly nested) list/tuple/ndarray."""
    if isinstance(value, np.ndarray):
        for item in value.ravel().tolist():
            yield from _iter_feature_leaves(item)
    elif isinstance(value, (list, tuple)):
        for item in value:
            yield from _iter_feature_leaves(item)
    else:
        yield value


def get_top_k_neighbors(df, query_embedding, k=5, *, feature_cols=None, retriever=None):
    """Retrieve the ``k`` nearest songs and package them for prompt building.

    Args:
        df: DataFrame holding the songs. Rows are addressed by position, and
            the columns ``song_id``, ``title``, ``artist``, ``popularity`` and
            ``lyrics`` are read from each retrieved row.
        query_embedding: Embedding of the query lyric, forwarded unchanged to
            the retriever.
        k: Number of neighbors to request.
        feature_cols: Columns to export as audio features. Defaults to the
            module-level ``audio_feature_cols``.
        retriever: Callable ``(query_embedding, k=...)`` returning a list of
            dicts with an ``"index"`` (row position) and optionally a
            ``"similarity"``. Defaults to ``retrieve_similar_songs``.

    Returns:
        One dict per neighbor with the keys ``song_id``, ``title``, ``artist``,
        ``similarity``, ``popularity``, ``lyrics_snippet`` and
        ``audio_features``. Scalar features are stored under their column
        name; list/tuple/array features are flattened into ``<col>_<i>``
        entries. A value that cannot be converted to float is stored as None.
    """
    if feature_cols is None:
        feature_cols = audio_feature_cols
    if retriever is None:
        retriever = retrieve_similar_songs

    snippet_chars = 400
    neighbors = []

    for hit in retriever(query_embedding, k=k):
        row = df.iloc[hit["index"]]

        audio_features = {}
        for col in feature_cols:
            val = row[col]
            if isinstance(val, (np.ndarray, list, tuple)):
                # Expand containers into one numbered entry per element.
                for j, leaf in enumerate(_iter_feature_leaves(val)):
                    audio_features[f"{col}_{j}"] = _coerce_feature_value(leaf)
            else:
                audio_features[col] = _coerce_feature_value(val)

        # Missing lyrics (None/NaN) are not strings; treat them as empty text
        # instead of failing on the slice.
        lyrics = row["lyrics"]
        lyrics_text = lyrics if isinstance(lyrics, str) else ""
        snippet = lyrics_text[:snippet_chars].replace("\n", " ")
        if len(lyrics_text) > snippet_chars:
            # Only mark the snippet as cut off when text was actually dropped.
            snippet += "..."

        neighbors.append({
            "song_id": row["song_id"],
            "title": row["title"],
            "artist": row["artist"],
            "similarity": hit.get("similarity", None),
            "popularity": float(row["popularity"]),
            "lyrics_snippet": snippet,
            "audio_features": audio_features
        })

    return neighbors


In [None]:
# TODO: continue working on prompting
def _format_prompt_number(value, decimals):
    """Format ``value`` with fixed decimals; "N/A" for None, str() otherwise."""
    if value is None:
        return "N/A"
    try:
        return f"{value:.{decimals}f}"
    except (TypeError, ValueError):
        # Not a number (e.g. a string): show it as-is rather than failing.
        return str(value)


def build_rag_prompt_for_lyric_popularity(user_lyric: str, neighbors: List[Dict[str, Any]]) -> str:
    """Build the prompt asking the LLM to score and explain a new lyric.

    The prompt contains the task instructions, the required JSON answer
    format, the stripped new lyric, and one section per neighbor.

    Args:
        user_lyric: The lyric to evaluate; surrounding whitespace is removed.
        neighbors: Neighbor dicts with the keys ``song_id``, ``title``,
            ``artist``, ``similarity``, ``popularity``, ``lyrics_snippet`` and
            ``audio_features`` (a name -> value mapping). A ``similarity`` of
            None is omitted; a popularity or feature value of None is written
            as ``N/A``.

    Returns:
        The complete prompt as a single newline-joined string.
    """
    lines = [
        "You are an expert in music analytics, audio features, and lyric interpretation.",
        "You are given a NEW lyric and several similar songs from a dataset.",
        "",
        "Each similar song includes:",
        " - song_id, title, artist",
        " - lyric snippet",
        " - popularity score (0-100)",
        " - detailed audio features extracted from 30-second clips",
        "",
        "Your tasks are:",
        "  1. Predict a popularity score (0-100) for the NEW lyric.",
        "  2. Explain your reasoning using comparisons to the similar songs.",
        "",
        "Keep in mind the audio features of the similar songs, and explain what they mean in context to everyday people.",
        "",
        "Return your answer as VALID JSON with this exact format:",
        "{",
        '  "predicted_popularity": <number>,',
        '  "explanation": "<multi-paragraph explanation grounded in the provided songs>"',
        "}",
        "",
        "IMPORTANT:",
        "Return ONLY raw JSON.",
        "Do NOT include any code fences such as ``` json",
        "Do NOT include any explanation text outside the JSON.",
        "Do NOT add commentary before or after the JSON.",
        "Return JSON ONLY.",
        "",
        "------------------------------------------------------------",
        "NEW LYRIC:",
        user_lyric.strip(),
        "------------------------------------------------------------",
        "",
        "SIMILAR SONGS FROM THE DATASET (use these as evidence):",
    ]

    for i, nb in enumerate(neighbors, start=1):
        lines.append(f"\nNeighbor #{i}:")
        lines.append(f"  song_id: {nb['song_id']}")
        lines.append(f"  title: {nb['title']}")
        lines.append(f"  artist: {nb['artist']}")
        similarity = nb.get("similarity")
        if similarity is not None:
            lines.append(f"  similarity: {_format_prompt_number(similarity, 4)}")
        lines.append(f"  popularity: {_format_prompt_number(nb['popularity'], 2)}")
        lines.append(f"  lyrics_snippet: {nb['lyrics_snippet']}")
        lines.append("  audio_features:")

        # Features may be None when the source value was not numeric.
        for feat_name, feat_val in nb["audio_features"].items():
            lines.append(f"    {feat_name}: {_format_prompt_number(feat_val, 4)}")

    lines.append("")
    lines.append(
        "Using ONLY the information above, estimate the popularity of the new lyric "
        "and explain your reasoning in terms of lyric similarity, artist/genre patterns, "
        "and audio features (energy, brightness, tempo, chroma, MFCCs, contrasts, tonnetz, etc.). "
        "Make sure to contextulize what the audio features mean for the average person."
    )

    return "\n".join(lines)


In [None]:
import os
import re
import json
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
client = OpenAI()

def _parse_llm_json(raw_text):
    """Return the JSON object contained in ``raw_text`` as a dict, or None.

    Tried in order: the whole text, the whole text with trailing commas
    removed, then the first decodable ``{...}`` object embedded in either.
    Only dicts are accepted because callers use ``.get`` on the result.
    """
    # Drop commas that directly precede a closing brace/bracket.
    repaired = re.sub(r",\s*([}\]])", r"\1", raw_text)
    candidates = (raw_text, repaired)

    for text in candidates:
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # The reply may wrap the JSON in prose. raw_decode parses one value
    # starting at a given offset and ignores whatever follows it, so braces
    # inside strings and nested objects are handled by the JSON parser itself.
    decoder = json.JSONDecoder()
    for text in candidates:
        start = text.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            start = text.find("{", start + 1)

    return None


def call_llm_for_popularity_and_explanation(prompt: str) -> dict:
    """Send ``prompt`` to the LLM and return its answer as a dict.

    Uses the module-level OpenAI ``client`` (model ``gpt-4o``, temperature
    0.2, at most 900 output tokens).

    Args:
        prompt: The full prompt text.

    Returns:
        The JSON object parsed from the model's reply. If no JSON object can
        be recovered, a notice is printed and the result is
        ``{"predicted_popularity": None, "explanation": <raw reply text>}``.
    """
    response = client.responses.create(
        model="gpt-4o",
        input=prompt,
        temperature=0.2,
        max_output_tokens=900
    )

    # output_text is the SDK's aggregate of all text output items; it avoids
    # assuming that the first output item is a message whose first content
    # part is text.
    raw_text = (response.output_text or "").strip()

    # Remove any ```json ...``` or ``` fences the model added anyway.
    raw_text = raw_text.replace("```json", "")
    raw_text = raw_text.replace("```", "")
    raw_text = raw_text.strip()

    parsed = _parse_llm_json(raw_text)
    if parsed is not None:
        return parsed

    print("Could not parse JSON from LLM output. Returning raw text.")
    return {
        "predicted_popularity": None,
        "explanation": raw_text
    }





In [None]:
def rag_lyric_popularity_system(df: pd.DataFrame, user_lyric: str, k_neighbors: int = 5) -> Dict[str, Any]:
    """Run the whole pipeline: embed, retrieve, build the prompt, ask the LLM.

    Args:
        df: Song DataFrame handed to ``get_top_k_neighbors``.
        user_lyric: The lyric to evaluate.
        k_neighbors: Number of similar songs to retrieve.

    Returns:
        A dict with ``predicted_popularity`` and ``explanation`` taken from
        the LLM answer (None and "" when absent), plus ``neighbors_used`` and
        ``prompt_sent`` for inspection.
    """
    # Steps 1 and 2: embed the lyric, then look up songs with similar lyrics.
    similar_songs = get_top_k_neighbors(df, embed_lyrics(user_lyric), k=k_neighbors)

    # Step 3: turn the lyric and its neighbors into the prompt text.
    rag_prompt = build_rag_prompt_for_lyric_popularity(user_lyric, similar_songs)

    # Step 4: query the model and unpack its answer.
    answer = call_llm_for_popularity_and_explanation(rag_prompt)

    return {
        "predicted_popularity": answer.get("predicted_popularity", None),
        "explanation": answer.get("explanation", ""),
        "neighbors_used": similar_songs,
        "prompt_sent": rag_prompt,
    }


# Now we can test the system

In [None]:
# Smoke test: run the full pipeline on a placeholder lyric with 3 neighbors.
# This sends a request to the LLM via rag_lyric_popularity_system.
test_lyric = "random test lyrics."
result = rag_lyric_popularity_system(df, test_lyric, k_neighbors=3)

# predicted_popularity is None when the model's reply could not be parsed as
# JSON; in that case the explanation holds the raw reply text.
print("Predicted popularity:", result["predicted_popularity"])
print("\nExplanation:\n", result["explanation"])
