One-click “Reset & Run” master cell

In [1]:
# --- MASTER CELL: safe re-run after any restart ---
# It will:
# 1) Detect repo_root, 2) load .env, 3) pick only Top Shelf files,
# 4) build records, 5) embed in memory, 6) answer one test query.

%pip -q install --upgrade numpy pandas openai python-dotenv tiktoken

from pathlib import Path
import os, json, pandas as pd, numpy as np
from dotenv import load_dotenv
from typing import List, Dict

# ---------- 1) detect repo root ----------
def detect_repo_root(start: Path) -> Path:
    """Return the nearest directory, from ``start`` upward, that looks like the repo root.

    A directory qualifies when it contains a ``.env`` file, a ``README.md``
    or a ``data`` entry. ``start`` itself is checked first, then each parent
    in turn. When nothing qualifies, ``start`` is returned unchanged.
    """
    markers = (".env", "README.md", "data")
    lineage = (start, *start.parents)
    return next(
        (folder for folder in lineage
         if any((folder / marker).exists() for marker in markers)),
        start,
    )

# Resolve the repo root starting from the directory the kernel is running in.
CWD = Path.cwd()
repo_root = detect_repo_root(CWD)
print("CWD:", CWD)
print("Repo root:", repo_root)

# ---------- 2) load .env ----------
# override=True: values from the file replace variables already set in the process.
loaded = load_dotenv(repo_root / ".env", override=True)
print("Env loaded:", loaded)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Stop the cell early; nothing below works without the key.
    raise SystemExit("❌ OPENAI_API_KEY not found. Put it in repo_root/.env then re-run this cell.")

# ---------- 3) collect ONLY Top Shelf files ----------
data_dir = repo_root / "data"
if not data_dir.exists():
    raise SystemExit(f"❌ data folder not found at {data_dir}. Create it and add your files.")

# A file qualifies when its lower-cased name contains both "top" and "shelf".
# Only *.csv and *.json directly inside data/ are considered (no recursion).
topshelf_csv  = sorted([p for p in data_dir.glob("*.csv")  if "top" in p.name.lower() and "shelf" in p.name.lower()])
topshelf_json = sorted([p for p in data_dir.glob("*.json") if "top" in p.name.lower() and "shelf" in p.name.lower()])

if not (topshelf_csv or topshelf_json):
    # List what is actually in data/ so the user can see why nothing matched.
    print("⚠️ No Top Shelf files matched. Showing all data files instead:")
    print([f.name for f in data_dir.iterdir()])
    raise SystemExit("Rename your files to include 'top' and 'shelf', or adjust the filters in this cell.")

print("Using Top Shelf files:", [p.name for p in (topshelf_json + topshelf_csv)])

# ---------- 4) build records ----------
# Accumulator for every document built in this cell; entries are {"id", "text"} dicts.
records: List[Dict] = []


def add_record(text: str, rid: str):
    """Append ``{"id": rid, "text": ...}`` to ``records`` unless the text is blank.

    A falsy ``text`` (``None`` or ``""``) is treated as empty, and surrounding
    whitespace is stripped before the record is stored.
    """
    cleaned = (text if text else "").strip()
    if not cleaned:
        return
    records.append(dict(id=rid, text=cleaned))

def join_item_fields(item: dict) -> str:
    """Flatten one JSON item into a single line of text.

    If the item is a dict whose ``"text"`` (or, when that is falsy,
    ``"content"``) value is a non-blank string, that string is returned
    stripped. Otherwise every key whose value is a non-blank str/int/float is
    rendered as ``"key: value"`` in sorted key order and joined with ``" | "``.
    Anything that is not a dict is returned as ``str(item)``.
    """
    if not isinstance(item, dict):
        return str(item)

    # Prefer explicit text/content, but only when it is really a string: calling
    # .strip() on a number or nested value used to raise AttributeError.
    preferred = item.get("text") or item.get("content") or ""
    if isinstance(preferred, str):
        txt = preferred.strip()
        if txt:
            return txt

    # Otherwise join the scalar fields.
    parts = []
    for k in sorted(item):
        v = item[k]
        if isinstance(v, (str, int, float)):
            rendered = str(v).strip()
            if rendered:
                parts.append(f"{k}: {rendered}")
    return " | ".join(parts)

# JSON first (if present)
# Each JSON file is expected to hold a non-empty top-level list; files with any
# other shape (or an empty list) are skipped silently.
for jp in topshelf_json:
    data = json.loads(jp.read_text(encoding="utf-8"))
    if not isinstance(data, list) or not data:
        continue
    for i, item in enumerate(data):
        # Record id = "<file name>-<position in the list>".
        add_record(join_item_fields(item), f"{jp.name}-{i}")

# Then CSVs
for cp in topshelf_csv:
    # read_csv already decodes as UTF-8 by default, so the only useful retry is a
    # permissive single-byte encoding. Other failures (parse errors, unreadable
    # file) are real problems and are allowed to surface instead of being retried.
    try:
        df = pd.read_csv(cp)
    except UnicodeDecodeError:
        df = pd.read_csv(cp, encoding="latin-1")
    # Choose columns (auto if not set): columns whose lower-cased name is in
    # `likely`, else every object-dtype column.
    likely = {"artist","album","title","description","review","genre","notes","track","year"}
    csv_columns_to_join = [c for c in df.columns if c.lower() in likely] or [c for c in df.columns if df[c].dtype=="object"]

    def row_to_text(row):
        """Render one row as "col: value | col: value", skipping NaN/blank cells."""
        parts = []
        for c in csv_columns_to_join:
            val = row.get(c, "")
            if pd.notna(val) and str(val).strip():
                parts.append(f"{c}: {str(val).strip()}")
        return " | ".join(parts)

    for i, r in df.iterrows():
        # Record id = "<file name>-<DataFrame index label>".
        add_record(row_to_text(r), f"{cp.name}-{i}")

# Abort when the selected files produced nothing usable.
if not records:
    raise SystemExit("❌ No records built from Top Shelf files. Check file contents/columns.")

print(f"✅ Built {len(records)} Top Shelf records.")
print("Example snippet:\n", records[0]["text"][:400])

# ---------- 5) embed in memory ----------
from openai import OpenAI
client = OpenAI()  # reads OPENAI_API_KEY from env

EMBEDDING_MODEL = "text-embedding-3-small"

# Only the first MAX_DOCS records are embedded; records beyond the cap are
# never searchable by the retrieval functions below.
MAX_DOCS = min(200, len(records))   # keep first run fast
docs = records[:MAX_DOCS]
# Parallel lists: texts[i] and ids[i] describe the same document.
texts = [d["text"] for d in docs]
ids   = [d["id"]   for d in docs]

def embed_texts(texts, batch_size=100):
    """Embed ``texts`` with ``EMBEDDING_MODEL`` and return one float32 row per text.

    Texts are sent to the embeddings endpoint in batches instead of one request
    per document, which cuts the number of HTTP round trips.

    Args:
        texts: iterable of strings to embed.
        batch_size: how many texts to send per request (default 100).

    Returns:
        A 2-D ``np.ndarray`` whose row ``i`` is the embedding of ``texts[i]``.

    Raises:
        ValueError: if ``texts`` is empty (previously surfaced as an opaque
            ``np.vstack`` error).
    """
    texts = list(texts)
    if not texts:
        raise ValueError("embed_texts() needs at least one text to embed")

    vecs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        resp = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
        # Order by the returned index so rows always line up with the input order.
        for item in sorted(resp.data, key=lambda d: d.index):
            vecs.append(np.array(item.embedding, dtype=np.float32))
    return np.vstack(vecs)

print(f"🔢 Embedding {len(texts)} docs…")
doc_vecs = embed_texts(texts)
# Scale every row to unit length so a plain dot product equals cosine similarity;
# the 1e-12 guards against dividing by zero for an all-zero row.
doc_vecs = doc_vecs / (np.linalg.norm(doc_vecs, axis=1, keepdims=True) + 1e-12)
print("📐 Embeddings shape:", doc_vecs.shape)

# ---------- 6) ask one test question ----------
# Chat model used to write the final answer from the retrieved context.
CHAT_MODEL = "gpt-4o-mini"

def top_k(query, k=5):
    """Return up to ``k`` best-matching documents as ``(id, text, score)`` tuples.

    The query is embedded with ``EMBEDDING_MODEL``, scaled to unit length and
    scored against every row of ``doc_vecs`` with a dot product. Results are
    ordered from highest to lowest score.
    """
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=query)
    query_vec = np.array(response.data[0].embedding, dtype=np.float32)
    query_vec = query_vec / (np.linalg.norm(query_vec) + 1e-12)

    scores = doc_vecs @ query_vec
    best_first = np.argsort(-scores)[:k]

    results = []
    for pos in best_first:
        results.append((ids[pos], texts[pos], float(scores[pos])))
    return results

def answer_with_context(query, k=5, temperature=0.2):
    """Answer ``query`` using only the ``k`` most similar documents as context.

    Args:
        query: the user's question.
        k: number of retrieved documents to put in the prompt.
        temperature: sampling temperature passed to the chat model.

    Returns:
        ``(answer_text, hits)`` where ``hits`` is the list returned by ``top_k``.
    """
    hits = top_k(query, k=k)
    # h[1] is the document text of each (id, text, score) hit.
    context = "\n\n".join(f"- {h[1]}" for h in hits)
    prompt = (
        "You are a precise assistant. Answer strictly using the CONTEXT below. "
        "If the answer is not in the context, say you don't know.\n\n"
        f"CONTEXT:\n{context}\n\n"
        f"QUESTION: {query}\n\nANSWER:"
    )
    resp = client.chat.completions.create(
        model=CHAT_MODEL,
        temperature=temperature,
        messages=[{"role":"user","content":prompt}],
    )
    # The SDK types message.content as optional; treat a missing body as an
    # empty answer instead of crashing on None.strip().
    content = resp.choices[0].message.content
    return (content or "").strip(), hits

# Smoke-test the whole pipeline with one broad question.
question = "Name a notable artist or album and its genre from this Top Shelf collection, plus one distinctive detail."
ans, hits = answer_with_context(question, k=5)
print("\n🧠 ANSWER:\n", ans, "\n")
print("🔎 TOP HITS:")
# Show only the three best hits, each truncated to 160 characters.
for hid, htxt, score in hits[:3]:
    print(f"- {hid} | score={score:.3f} | {htxt[:160]}…")

Note: you may need to restart the kernel to use updated packages.
CWD: /workspaces/suburban-disco/notebooks/notebooks
Repo root: /workspaces/suburban-disco
Env loaded: True
Using Top Shelf files: ['The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv']
✅ Built 129 Top Shelf records.
Example snippet:
 Artist: 10,000 Maniacs | Title: In My Tribe | Genre: Alternative Rock
🔢 Embedding 129 docs…
📐 Embeddings shape: (129, 1536)

🧠 ANSWER:
 Notable artist: Simon & Garfunkel | Album: Parsley, Sage, Rosemary and Thyme | Genre: Folk Rock. Distinctive detail: The album features a blend of intricate harmonies and poetic lyrics. 

🔎 TOP HITS:
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-118 | score=0.401 | Artist: Various Artists | Title: Woodstock - Music from the Original Soundtrack | Genre: Rock/Folk/Blues…
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-74 | score=0.397 | Artist: Simon & Garfunkel | Title: Parsley, Sage, Rosemary and Thyme | Genre: Folk Rock…
- T

In [3]:
from pathlib import Path
import os

# Walk from the current directory upward and take the first folder that holds
# a .env or README.md as the repo root (None when no ancestor matches).
cwd = Path.cwd()
candidates = [cwd, *cwd.parents]
repo_root = None
for p in candidates:
    if (p/".env").exists() or (p/"README.md").exists():
        repo_root = p
        break

print("CWD:", cwd)
print("Repo root:", repo_root)
if repo_root:
    # Report whether the expected .env and data/ locations exist.
    print(".env exists:", (repo_root/".env").exists(), "→", repo_root/".env")
    print("data exists:", (repo_root/"data").exists(), "→", repo_root/"data")
    if (repo_root/"data").exists():
        print("data files:", [f.name for f in (repo_root/"data").iterdir()])

CWD: /workspaces/suburban-disco/notebooks/notebooks
Repo root: /workspaces/suburban-disco
.env exists: True → /workspaces/suburban-disco/.env
data exists: True → /workspaces/suburban-disco/data
data files: ['top_rated_wines.csv', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled_embeddings.jsonl', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv']


Verify the key is being read correctly

In [2]:
import os
key = os.getenv("OPENAI_API_KEY")
if key is None:
    # Slicing None raised a TypeError here; report the real problem instead.
    print("OPENAI_API_KEY is not set. Load the .env file first.")
else:
    # NOTE(review): even a short prefix of a secret ends up in saved notebook
    # output; clear the output before committing.
    print("Starts with:", key[:5])
    print("Total length:", len(key))

Starts with: sk-sv
Total length: 167


In [6]:
from pathlib import Path
import os

# 1) Find repo root by walking up until we see a .env OR README.md
#    (the current directory is checked first; repo_root stays None if nothing matches)
cwd = Path.cwd()
candidates = [cwd, *cwd.parents]
repo_root = None
for p in candidates:
    if (p/".env").exists() or (p/"README.md").exists():
        repo_root = p
        break

print("🔎 Notebook CWD:", cwd)
print("📦 Repo root:", repo_root)

# 2) Show what we have
if repo_root:
    # env_path and data_dir are left in the notebook namespace for later cells.
    env_path = repo_root/".env"
    data_dir = repo_root/"data"
    print("🧩 .env exists:", env_path.exists(), "→", env_path)
    print("📁 data exists:", data_dir.exists(), "→", data_dir)
    if data_dir.exists():
        print("📄 data files:", [f.name for f in data_dir.iterdir()])
else:
    print("⚠️ Could not determine repo root. We’ll assume cwd is correct.")

🔎 Notebook CWD: /workspaces/suburban-disco/notebooks/notebooks
📦 Repo root: /workspaces/suburban-disco
🧩 .env exists: True → /workspaces/suburban-disco/.env
📁 data exists: True → /workspaces/suburban-disco/data
📄 data files: ['top_rated_wines.csv', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled_embeddings.jsonl', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv']


Cell A

In [7]:
%pip -q install python-dotenv

from dotenv import load_dotenv
import os

# Requires `repo_root` from an earlier cell. override=True lets values from the
# file replace variables already present in the process environment.
loaded = load_dotenv(repo_root/".env", override=True)
print("Env loaded:", loaded)
# Print only whether the key exists, never the key itself.
print("OpenAI key set:", bool(os.getenv("OPENAI_API_KEY")))

Note: you may need to restart the kernel to use updated packages.
Env loaded: True
OpenAI key set: True


In [6]:
import os
# Defaulting to "" avoids a TypeError when the variable is unset; an empty
# prefix falls through to the explanatory message.
# NOTE(review): a key prefix still lands in saved notebook output; clear it before committing.
print(os.getenv("OPENAI_API_KEY", "")[:10] or "OPENAI_API_KEY is not set")  # just the first 10 chars for privacy

sk-<sk-svc


Cell A.1 - (re)install core packages (safe to re-run)  

In [6]:
%pip -q install --upgrade numpy pandas openai tiktoken

[0mNote: you may need to restart the kernel to use updated packages.


Cell B1 — build Top Shelf records only (drop-in)

In [1]:
import pathlib, json, pandas as pd
from typing import List, Dict

# Fresh accumulator: re-running this cell discards any previously built records.
records: List[Dict] = []
# `repo_root` must already exist in the kernel (it is set by the repo-root
# cells); running this cell first after a restart fails with NameError.
data_dir = repo_root / "data"

# Only load files that look like "Top Shelf"
# (lower-cased file name contains both "top" and "shelf")
topshelf_json = sorted([p for p in data_dir.glob("*.json") if "top" in p.name.lower() and "shelf" in p.name.lower()])
topshelf_csv  = sorted([p for p in data_dir.glob("*.csv")  if "top" in p.name.lower() and "shelf" in p.name.lower()])

def add_record(text: str, rid: str):
    """Store ``text`` under id ``rid`` in the ``records`` list, ignoring blank text.

    Falsy input counts as empty; leading and trailing whitespace is removed
    before the record is stored.
    """
    stripped = "" if not text else text.strip()
    if stripped:
        entry = {"id": rid, "text": stripped}
        records.append(entry)

def join_item_fields(item: dict) -> str:
    """Turn one JSON item into a single line of text.

    For a dict, a non-blank string under ``"text"`` (or ``"content"`` when
    ``"text"`` is falsy) is returned stripped. Otherwise its non-blank
    str/int/float values are rendered as ``"key: value"`` in sorted key order
    and joined with ``" | "``. Non-dict items are returned as ``str(item)``.
    """
    if not isinstance(item, dict):
        return str(item)

    candidate = item.get("text") or item.get("content") or ""
    # Guard the .strip(): a numeric or nested "text"/"content" value used to
    # raise AttributeError; such values now fall through to the field join.
    txt = candidate.strip() if isinstance(candidate, str) else ""
    if txt:
        return txt

    parts = []
    for k in sorted(item.keys()):
        v = item[k]
        if isinstance(v, (str, int, float)) and str(v).strip():
            parts.append(f"{k}: {str(v).strip()}")
    return " | ".join(parts)

# Names of the files that actually contributed records (printed at the end).
used_files = []

# JSON takes precedence: CSV files are read only when no Top Shelf JSON exists.
if topshelf_json:
    for jp in topshelf_json:
        used_files.append(jp.name)
        data = json.loads(jp.read_text(encoding="utf-8"))
        assert isinstance(data, list) and data, f"{jp.name} must be a non-empty JSON list"
        for i, item in enumerate(data):
            add_record(join_item_fields(item), f"{jp.name}-{i}")

elif topshelf_csv:
    for cp in topshelf_csv:
        used_files.append(cp.name)
        # read_csv decodes as UTF-8 by default, so the only meaningful retry is
        # latin-1. Non-encoding failures (parse errors, unreadable file) are
        # allowed to surface instead of being retried.
        try:
            df2 = pd.read_csv(cp)
        except UnicodeDecodeError:
            df2 = pd.read_csv(cp, encoding="latin-1")

        # If you know exact column names, set them here:
        csv_columns_to_join = None  # e.g., ["artist","album","genre","description"]
        if csv_columns_to_join is None:
            # Auto-pick: columns whose lower-cased name is in `likely`,
            # else every object-dtype column.
            likely = {"artist","album","title","description","review","genre","notes","track","year"}
            csv_columns_to_join = [c for c in df2.columns if c.lower() in likely] or \
                                  [c for c in df2.columns if df2[c].dtype == "object"]

        def row_to_text(row):
            """Render one row as "col: value | col: value", skipping NaN/blank cells."""
            parts = []
            for c in csv_columns_to_join:
                val = row.get(c, "")
                if pd.notna(val) and str(val).strip():
                    parts.append(f"{c}: {str(val).strip()}")
            return " | ".join(parts)

        for i, r in df2.iterrows():
            add_record(row_to_text(r), f"{cp.name}-{i}")

else:
    raise FileNotFoundError("No Top Shelf files found. Rename to include 'top' and 'shelf', or adjust the filters.")

print("Using files:", used_files)
print(f"Built {len(records)} Top Shelf records.")
print("Example:\n", (records[0]['text'][:500] if records else "NO RECORDS"))

NameError: name 'repo_root' is not defined

Cell C — embed in memory

In [None]:
%pip -q install openai numpy tiktoken

import numpy as np
from openai import OpenAI
client = OpenAI()  # uses OPENAI_API_KEY

EMBEDDING_MODEL = "text-embedding-3-small"

# Embed at most the first 200 records; `records` must come from the build cell.
MAX_DOCS = min(200, len(records))
docs = records[:MAX_DOCS]
# Parallel lists: doc_texts[i] and doc_ids[i] describe the same document.
doc_texts = [d["text"] for d in docs]
doc_ids   = [d["id"]   for d in docs]

def embed_texts(texts, batch_size=100):
    """Return a float32 matrix with one embedding row per entry of ``texts``.

    Requests are batched (``batch_size`` texts per call) rather than issued
    one per document, reducing the number of API round trips.

    Args:
        texts: iterable of strings to embed with ``EMBEDDING_MODEL``.
        batch_size: number of texts sent in each embeddings request.

    Returns:
        2-D ``np.ndarray``; row ``i`` corresponds to ``texts[i]``.

    Raises:
        ValueError: when ``texts`` is empty.
    """
    pending = list(texts)
    if not pending:
        raise ValueError("embed_texts() needs at least one text to embed")

    vecs = []
    for offset in range(0, len(pending), batch_size):
        resp = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=pending[offset:offset + batch_size],
        )
        # Sort by the returned index so output rows match the input order.
        ordered = sorted(resp.data, key=lambda d: d.index)
        vecs.extend(np.array(d.embedding, dtype=np.float32) for d in ordered)
    return np.vstack(vecs)

print(f"Embedding {len(doc_texts)} docs…")
doc_vecs = embed_texts(doc_texts)
# Unit-normalise each row so a dot product equals cosine similarity
# (1e-12 avoids division by zero).
doc_vecs = doc_vecs / (np.linalg.norm(doc_vecs, axis=1, keepdims=True) + 1e-12)
print("Embeddings shape:", doc_vecs.shape)

Cell D: Ask RAG question

In [None]:
# Bind the answer to `ans` (a stray "git" prefix previously bound it to
# `gitans`, so the print below showed a stale `ans` from an earlier cell).
ans, hits = answer_with_context(
    "What's a well known Jazz record in the collection?",
    k=5
)
print("ANSWER:\n", ans, "\n")
print("TOP HITS:")
# Only the three best hits are printed, although five are retrieved.
for hid, htxt, score in hits[:3]:
    print(f"- {hid} | score={score:.3f} | {htxt[:160]}…")

ANSWER:
 A well known Jazz record in the collection is "Kind of Blue" by Miles Davis. 

TOP HITS:
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-24 | score=0.476 | Artist: Django Reinhardt | Title: First Recordings | Genre: Jazz/Gypsy Jazz…
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-34 | score=0.450 | Artist: Ella Fitzgerald | Title: sings the George and Ira Gershwin Song Book | Genre: Jazz/Vocal Standards…
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-56 | score=0.444 | Artist: Miles Davis | Title: Nefertiti | Genre: Jazz…


Testing filtered questions

In [2]:
# If you haven't already run the structured + filtered versions, paste these 2 helpers first:

def mask_by_filters(meta, year_min=None, year_max=None, genre_contains=None):
    """Build a boolean mask selecting the records that pass every given filter.

    Args:
        meta: sequence of dicts that may carry ``"year"`` and ``"genre"`` keys.
        year_min: keep records whose year is >= this value (``None`` = no bound).
        year_max: keep records whose year is <= this value (``None`` = no bound).
        genre_contains: case-insensitive substring the genre must contain.

    Returns:
        ``np.ndarray`` of dtype bool with ``len(meta)`` entries. Records with a
        missing or unparseable year never pass a year filter.
    """
    import numpy as np

    def _year(record):
        # Years may arrive as strings (e.g. "1987"); comparing a str with an int
        # raises TypeError, so convert digit strings and drop anything else.
        y = record.get("year")
        if isinstance(y, str):
            try:
                return int(y)
            except ValueError:
                return None
        return y

    m = np.ones(len(meta), dtype=bool)
    if year_min is not None or year_max is not None:
        years = [_year(r) for r in meta]
        # dtype=bool keeps the in-place AND valid when `meta` is empty
        # (np.array([]) would otherwise default to float64).
        if year_min is not None:
            m &= np.array([y is not None and y >= year_min for y in years], dtype=bool)
        if year_max is not None:
            m &= np.array([y is not None and y <= year_max for y in years], dtype=bool)
    if genre_contains:
        gsub = genre_contains.lower()
        m &= np.array([gsub in (r.get("genre") or "").lower() for r in meta], dtype=bool)
    return m

def top_k_filtered(query, k=5, year_min=None, year_max=None, genre_contains=None):
    """Return up to ``k`` best hits among the documents that pass the filters.

    Filtering uses ``docs_meta`` when that global exists, otherwise ``records``.
    Each hit is an ``(id, text, meta, score)`` tuple, best score first. An
    empty list is returned when no document passes the filters.
    """
    import numpy as np

    response = client.embeddings.create(model=EMBEDDING_MODEL, input=query)
    q = np.array(response.data[0].embedding, dtype=np.float32)
    q = q / (np.linalg.norm(q) + 1e-12)

    source = docs_meta if 'docs_meta' in globals() else records
    mask = mask_by_filters(source, year_min, year_max, genre_contains)
    if not mask.any():
        return []

    # Rank only the rows that survived the mask, then map the local ranks
    # back to positions in the full arrays.
    candidate_scores = doc_vecs[mask] @ q
    best_local = np.argsort(-candidate_scores)[:k]
    global_idxs = np.flatnonzero(mask)[best_local]

    hits = []
    for i in global_idxs:
        hits.append((doc_ids[i], doc_texts[i], source[i], float(doc_vecs[i] @ q)))
    return hits

def answer_with_context_filtered(query, k=5, year_min=None, year_max=None, genre_contains=None, temperature=0.2):
    """Answer ``query`` from the best ``k`` documents that pass the metadata filters.

    Args:
        query: the user's question.
        k: number of retrieved documents to put in the prompt.
        year_min, year_max, genre_contains: forwarded to ``top_k_filtered``.
        temperature: sampling temperature passed to the chat model.

    Returns:
        ``(answer_text, hits)``. When nothing passes the filters, a fixed
        message and an empty list are returned and no chat request is made.
    """
    hits = top_k_filtered(query, k=k, year_min=year_min, year_max=year_max, genre_contains=genre_contains)
    if not hits:
        return "No matching documents in the specified filters.", []
    # h[1] is the document text of each (id, text, meta, score) hit.
    context = "\n\n".join(f"- {h[1]}" for h in hits)
    prompt = (
        "Answer strictly using the CONTEXT below. If the answer is not in the context, say you don't know.\n\n"
        f"CONTEXT:\n{context}\n\nQUESTION: {query}\n\nANSWER:"
    )
    resp = client.chat.completions.create(
        model=CHAT_MODEL if 'CHAT_MODEL' in globals() else "gpt-4o-mini",
        temperature=temperature,
        messages=[{"role":"user","content":prompt}],
    )
    # The SDK types message.content as optional; fall back to an empty answer
    # rather than crashing on None.strip().
    content = resp.choices[0].message.content
    return (content or "").strip(), hits

# --- Queries ---
# Query 1: restrict retrieval to records whose genre contains "jazz".
ans1, hits1 = answer_with_context_filtered(
    "What's a standout jazz album here, and what makes it notable?",
    k=5, genre_contains="jazz"
)
print("JAZZ ANSWER:\n", ans1, "\n")
for _id, _txt, meta, score in hits1[:3]:
    print(f"- {meta.get('artist','?')} — {meta.get('title','?')} ({meta.get('year')}), [{meta.get('genre','?')}]  score={score:.3f}")

# Query 2: restrict retrieval to records whose year falls in 1980-1989.
ans2, hits2 = answer_with_context_filtered(
    "Give me one notable album from the 1980s with a brief reason it's notable.",
    k=5, year_min=1980, year_max=1989
)
print("\n1980s ANSWER:\n", ans2, "\n")
for _id, _txt, meta, score in hits2[:3]:
    print(f"- {meta.get('artist','?')} — {meta.get('title','?')} ({meta.get('year')}), [{meta.get('genre','?')}]  score={score:.3f}")

JAZZ ANSWER:
 No matching documents in the specified filters. 


1980s ANSWER:
 No matching documents in the specified filters. 



In [3]:
import re, numpy as np, pandas as pd

# figure out which set we have in memory
# (preference order: docs_meta, then structured_records, then records)
if 'docs_meta' in globals():
    META = docs_meta
elif 'structured_records' in globals():
    META = structured_records
elif 'records' in globals():
    META = records
else:
    raise RuntimeError("No records in memory. Re-run your build records cell.")

print("Total records:", len(META))
print("Example record:", META[0])

# show available keys on meta dicts
keys = sorted({k for m in META for k in m.keys()})
print("Meta keys:", keys)

# genre distribution (lowercased)
# records without a 'genre' key contribute an empty string and are dropped
genres = [ (m.get('genre') or '').strip().lower() for m in META ]
unique_genres = pd.Series([g for g in genres if g]).value_counts().head(20)
print("\nTop genres:\n", unique_genres)

# year stats
# only ints and all-digit strings are counted as usable years
years = [ m.get('year') for m in META ]
years_num = [ int(y) for y in years if isinstance(y,int) or (isinstance(y,str) and y.isdigit()) ]
print("\nYear stats: count=", len(years_num), "min=", min(years_num) if years_num else None, "max=", max(years_num) if years_num else None)

# counts for our two target filters
def contains_jazz(m):
    """Return True when the record's ``'genre'`` value mentions "jazz" (any case)."""
    genre = m.get('genre')
    return 'jazz' in (genre if genre else '').lower()

def in_80s(m):
    """Return True when the record's ``'year'`` converts to an int in 1980-1989.

    Missing or unconvertible years (``None``, non-numeric text, NaN, infinity)
    yield False.
    """
    try:
        y = int(m.get('year'))
    # Catch only conversion failures; a bare `except:` would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return False
    return 1980 <= y <= 1989

# How many records each target filter would select (0 means the filter can never match).
print("\nMatches 'jazz' in genre:", sum(contains_jazz(m) for m in META))
print("Matches year 1980–1989:", sum(in_80s(m) for m in META))

Total records: 129
Example record: {'id': 'The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-0', 'text': 'Artist: 10,000 Maniacs | Title: In My Tribe | Genre: Alternative Rock'}
Meta keys: ['id', 'text']

Top genres:
 Series([], Name: count, dtype: int64)

Year stats: count= 0 min= None max= None

Matches 'jazz' in genre: 0
Matches year 1980–1989: 0


P — Parse artist/title/genre/year metadata out of each record's text

In [4]:
import re

# Precondition: this cell parses the in-memory `records`, so the build-records cell must have run.
assert 'records' in globals() and len(records) > 0, "No records found. Re-run the build-records cell first."

def parse_kv_text(txt: str):
    """Split a "Key: value | Key: value" string into normalised metadata.

    Example input:
      'Artist: 10,000 Maniacs | Title: In My Tribe | Genre: Alternative Rock | Year: 1987'
    Example output:
      {'artist': '10,000 Maniacs', 'title': 'In My Tribe',
       'genre': 'Alternative Rock', 'year': 1987}

    Keys are lower-cased; ``band``, ``album`` and ``style`` act as fallbacks
    for ``artist``, ``title`` and ``genre``. The year comes from the ``Year``
    field when it holds a 19xx/20xx number, otherwise from the first such
    number anywhere in ``txt`` (``None`` when there is none).
    """
    parsed = {}
    # Pieces are separated by '|'; only the first ':' splits key from value.
    for piece in txt.split('|'):
        name, colon, value = piece.strip().partition(':')
        if colon:
            parsed[name.strip().lower()] = value.strip()

    def first_present(*names):
        # First non-empty value among the given keys, else ''.
        for name in names:
            found = parsed.get(name, '')
            if found:
                return found
        return ''

    # Try the explicit field first, then the whole string.
    year = None
    for haystack in (parsed.get('year', ''), txt):
        if not haystack:
            continue
        hit = re.search(r'(?<!\d)(19|20)\d{2}(?!\d)', haystack)
        if hit:
            year = int(hit.group(0))
            break

    result = {}
    result['artist'] = first_present('artist', 'band')
    result['title'] = first_present('title', 'album')
    result['genre'] = first_present('genre', 'style')
    result['year'] = year
    return result

# Build parsed metadata aligned to your existing vectors/texts if present
# (prefer the embedded doc_texts/doc_ids; otherwise fall back to every record)
if 'doc_texts' in globals() and 'doc_ids' in globals():
    base_texts = doc_texts
    base_ids   = doc_ids
else:
    base_texts = [r['text'] for r in records]
    base_ids   = [r['id']   for r in records]

# docs_meta[i] holds artist/title/genre/year plus the original id and text.
docs_meta = []
for rid, txt in zip(base_ids, base_texts):
    meta = parse_kv_text(txt)
    meta['id'] = rid
    meta['text'] = txt
    docs_meta.append(meta)

print("Parsed docs_meta sample:", docs_meta[0])
print("Keys present:", sorted(docs_meta[0].keys()))

# Quick stats
genres_lower = [ (m.get('genre') or '').lower() for m in docs_meta ]
years = [ m.get('year') for m in docs_meta if m.get('year') is not None ]
print("Genre examples:", sorted(set(g for g in genres_lower if g))[:10])
print("Year min/max:", (min(years), max(years)) if years else (None, None))

Parsed docs_meta sample: {'artist': '10,000 Maniacs', 'title': 'In My Tribe', 'genre': 'Alternative Rock', 'year': None, 'id': 'The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-0', 'text': 'Artist: 10,000 Maniacs | Title: In My Tribe | Genre: Alternative Rock'}
Keys present: ['artist', 'genre', 'id', 'text', 'title', 'year']
Genre examples: ['alternative rock', 'alternative rock/art rock', 'alternative rock/folk rock', 'alternative rock/new wave', 'blues/soul', 'christmas', 'classic rock', 'classical', 'country/folk', 'disco/funk']
Year min/max: (1958, 1994)


Q — Forgiving filtered-retrieval helpers (fall back to the raw text when metadata is missing)

In [5]:
import numpy as np, re

# Ensure vectors exist
assert 'doc_vecs' in globals(), "No embeddings found. Re-run your embedding cell (C / C2) first."

# Index-aligned views used by the forgiving helpers below.
# NOTE(review): assumes docs_meta has one entry per row of doc_vecs, in the same order; confirm.
TEXTS = [m["text"] for m in docs_meta]
IDS   = [m["id"]   for m in docs_meta]
VECS  = doc_vecs   # from your embedding step

def infer_year_from_text(t):
    """Return the first standalone 19xx/20xx number in ``t`` as an int, or None."""
    found = re.search(r'(?<!\d)(19|20)\d{2}(?!\d)', t)
    if found is None:
        return None
    return int(found.group(0))

def genre_matches(meta, text, want):
    """Case-insensitive check for ``want`` in the genre field, else anywhere in ``text``."""
    needle = want.lower()
    genre = (meta.get("genre") or "").lower()
    return needle in genre or needle in text.lower()

def year_in_range(meta, text, lo=None, hi=None):
    """Return True when the record's year lies within ``[lo, hi]``.

    Args:
        meta: dict that may carry a ``"year"`` value.
        text: raw record text, used to infer a year when ``meta`` has no usable one.
        lo: inclusive lower bound, or ``None`` for no lower bound.
        hi: inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        True when no bounds are given at all; False when bounds are given but
        no year can be determined; otherwise whether the year is in range.
    """
    if lo is None and hi is None:
        return True
    try:
        y = int(meta.get("year"))
    # Catch only conversion failures; a bare `except:` would also swallow
    # KeyboardInterrupt and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        y = infer_year_from_text(text)
    if y is None:
        return False
    if lo is not None and y < lo:
        return False
    if hi is not None and y > hi:
        return False
    return True

def mask_by_filters_forgiving(year_min=None, year_max=None, genre_contains=None):
    """Boolean mask over ``TEXTS`` marking the documents that pass every active filter.

    The genre filter uses ``genre_matches`` and the year filter uses
    ``year_in_range``; both receive the parsed metadata and the raw text.
    """
    keep = np.ones(len(TEXTS), dtype=bool)
    if genre_contains:
        genre_ok = [genre_matches(docs_meta[i], t, genre_contains) for i, t in enumerate(TEXTS)]
        keep &= np.array(genre_ok)
    if not (year_min is None and year_max is None):
        year_ok = [year_in_range(docs_meta[i], t, year_min, year_max) for i, t in enumerate(TEXTS)]
        keep &= np.array(year_ok)
    return keep

def top_k_filtered_forgiving(query, k=5, year_min=None, year_max=None, genre_contains=None):
    """Return up to ``k`` best ``(id, text, meta, score)`` hits among filtered documents.

    Documents are filtered with ``mask_by_filters_forgiving`` and ranked by the
    dot product between their row in ``VECS`` and the unit-scaled query
    embedding. Returns ``[]`` when no document passes the filters.
    """
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=query)
    qv = np.array(response.data[0].embedding, dtype=np.float32)
    qv = qv / (np.linalg.norm(qv) + 1e-12)

    mask = mask_by_filters_forgiving(year_min, year_max, genre_contains)
    if not mask.any():
        return []

    # Rank inside the filtered subset, then translate back to global positions.
    subset_scores = VECS[mask] @ qv
    best_local = np.argsort(-subset_scores)[:k]
    global_idxs = np.flatnonzero(mask)[best_local]

    hits = []
    for i in global_idxs:
        hits.append((IDS[i], TEXTS[i], docs_meta[i], float(VECS[i] @ qv)))
    return hits

def answer_with_context_filtered_forgiving(query, k=5, year_min=None, year_max=None, genre_contains=None, temperature=0.2):
    """Answer ``query`` from the best ``k`` documents selected by the forgiving filters.

    Args:
        query: the user's question.
        k: number of retrieved documents to put in the prompt.
        year_min, year_max, genre_contains: forwarded to ``top_k_filtered_forgiving``.
        temperature: sampling temperature passed to the chat model.

    Returns:
        ``(answer_text, hits)``. When nothing passes the filters, a fixed
        message and an empty list are returned and no chat request is made.
    """
    hits = top_k_filtered_forgiving(query, k, year_min, year_max, genre_contains)
    if not hits:
        return "No matching documents in the specified filters.", []
    # h[1] is the document text of each (id, text, meta, score) hit.
    context = "\n\n".join(f"- {h[1]}" for h in hits)
    prompt = (
        "Answer strictly using the CONTEXT below. If the answer is not in the context, say you don't know.\n\n"
        f"CONTEXT:\n{context}\n\nQUESTION: {query}\n\nANSWER:"
    )
    resp = client.chat.completions.create(
        model=CHAT_MODEL if 'CHAT_MODEL' in globals() else "gpt-4o-mini",
        temperature=temperature,
        messages=[{"role":"user","content":prompt}],
    )
    # The SDK types message.content as optional; fall back to an empty answer
    # rather than crashing on None.strip().
    content = resp.choices[0].message.content
    return (content or "").strip(), hits

R — Re-run the jazz and 1980s queries with the forgiving helpers

In [6]:
# Query 1: jazz, matched against the parsed genre or, failing that, the raw text.
ans1, hits1 = answer_with_context_filtered_forgiving(
    "What's a standout jazz album here, and what makes it notable?",
    k=5, genre_contains="jazz"
)
print("JAZZ ANSWER:\n", ans1, "\n")
for _id, _txt, meta, score in hits1[:3]:
    print(f"- {meta.get('artist','?')} — {meta.get('title','?')} ({meta.get('year')}), [{meta.get('genre','?')}]  score={score:.3f}")

# Query 2: 1980s, using the parsed year or a year inferred from the text.
ans2, hits2 = answer_with_context_filtered_forgiving(
    "Give me one notable album from the 1980s with a brief reason it's notable.",
    k=5, year_min=1980, year_max=1989
)
print("\n1980s ANSWER:\n", ans2, "\n")
for _id, _txt, meta, score in hits2[:3]:
    print(f"- {meta.get('artist','?')} — {meta.get('title','?')} ({meta.get('year')}), [{meta.get('genre','?')}]  score={score:.3f}")

JAZZ ANSWER:
 I don't know. 

- The Dave Brubeck Quartet — Red Hot and Cool (None), [Jazz]  score=0.443
- Miles Davis — Kind of Blue (None), [Jazz]  score=0.437
- Dave Brubeck Quartet — Time Out (None), [Jazz]  score=0.431

1980s ANSWER:
 I don't know. 

- Cheap Trick — The Epic Archive Vol. 2 (1980-1983) (1980), [Rock]  score=0.348
- Cheap Trick — The Epic Archive Vol. 3 (1984-1992) (1984), [Rock]  score=0.337


Clean, reliable queries that do not need descriptions

In [7]:
import pandas as pd, numpy as np, re

# Build a small DataFrame from the parsed metadata if you have docs_meta; else, parse records again.
def to_df_from_docs_meta(docs_meta):
    """Build an artist/title/genre/year DataFrame from parsed metadata dicts.

    Args:
        docs_meta: iterable of dicts that may carry ``artist``, ``title``,
            ``genre`` and ``year`` keys.

    Returns:
        DataFrame with exactly those four columns. Missing text fields become
        ``""``; ``year`` is a nullable ``Int64`` column in which unparseable
        values become ``<NA>``.
    """
    columns = ["artist", "title", "genre", "year"]
    rows = [
        {
            "artist": m.get("artist") or "",
            "title":  m.get("title") or "",
            "genre":  m.get("genre") or "",
            "year":   m.get("year"),
        }
        for m in docs_meta
    ]
    # Naming the columns explicitly keeps "year" present for empty input; a bare
    # pd.DataFrame([]) has no columns and the next line raised KeyError.
    df = pd.DataFrame(rows, columns=columns)
    # clean year
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    return df

def parse_from_records(records):
    """Fallback parser: build the artist/title/genre/year DataFrame from raw records.

    Args:
        records: iterable of dicts with a ``"text"`` value shaped like
            ``"Artist: ... | Title: ... | Genre: ..."``.

    Returns:
        DataFrame with columns ``artist``, ``title``, ``genre`` and ``year``
        (nullable ``Int64``). The year comes from the ``Year`` field when it
        holds a 19xx/20xx number, otherwise from the first such number
        anywhere in the text.
    """
    import re

    year_re = re.compile(r'(?<!\d)(19|20)\d{2}(?!\d)')

    def parse_kv_text(txt):
        fields = {}
        for chunk in txt.split("|"):
            chunk = chunk.strip()
            if ":" in chunk:
                k, v = chunk.split(":", 1)
                fields[k.strip().lower()] = v.strip()
        # Year inference: prefer the explicit field, but still scan the whole
        # text when that field is absent or holds no year (e.g. "n/a").
        m = year_re.search(fields.get("year", "")) or year_re.search(txt)
        return {
            "artist": fields.get("artist", ""),
            "title": fields.get("title", ""),
            "genre": fields.get("genre", ""),
            "year": int(m.group(0)) if m else None,
        }

    rows = [parse_kv_text(r["text"]) for r in records]
    # Explicit columns keep "year" present when `records` is empty.
    df = pd.DataFrame(rows, columns=["artist", "title", "genre", "year"])
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    return df

# `lib` is the table every helper below queries; prefer the already-parsed
# metadata and fall back to re-parsing the raw records.
if 'docs_meta' in globals():
    lib = to_df_from_docs_meta(docs_meta)
else:
    lib = parse_from_records(records)

print("Rows:", len(lib))
display(lib.head(5))

def list_by_genre(genre_sub, limit=10):
    """Return the first ``limit`` rows of ``lib`` whose genre contains ``genre_sub``.

    The match is a case-insensitive literal substring test.
    """
    # regex=False: str.contains defaults to regex matching, so a substring such
    # as "c++" or "(live" would be misread as a pattern or raise re.error.
    mask = lib["genre"].str.contains(genre_sub, case=False, na=False, regex=False)
    return lib[mask].dropna(how="all").head(limit)

def list_by_year_range(ymin=None, ymax=None, limit=10):
    """Return the first ``limit`` rows of ``lib`` whose year lies in ``[ymin, ymax]``.

    Either bound may be ``None`` to leave that side open. Rows without a year
    are dropped as soon as any bound is applied.
    """
    subset = lib.copy()
    bounds = (
        (ymin, lambda col: col >= ymin),
        (ymax, lambda col: col <= ymax),
    )
    for bound, within in bounds:
        if bound is None:
            continue
        year_col = subset["year"]
        subset = subset[year_col.notna() & within(year_col)]
    return subset.head(limit)

def search_artist(substr, limit=10):
    """Return the first ``limit`` rows of ``lib`` whose artist contains ``substr``.

    The match is a case-insensitive literal substring test.
    """
    # regex=False so names with regex metacharacters (e.g. "AC/DC (", "P!nk+")
    # are matched literally instead of being parsed as a pattern.
    hits = lib["artist"].str.contains(substr, case=False, na=False, regex=False)
    return lib[hits].head(limit)

# Examples (adjust freely)
# These are plain DataFrame lookups on `lib`; no embedding or chat request is made.
print("\n— Jazz entries (first 10) —")
display(list_by_genre("jazz", 10))

print("\n— 1980s entries (first 10) —")
display(list_by_year_range(1980, 1989, 10))

print("\n— Search artist contains 'miles' —")
display(search_artist("miles", 10))

Rows: 129


Unnamed: 0,artist,title,genre,year
0,"10,000 Maniacs",In My Tribe,Alternative Rock,
1,Amy Winehouse,Back to Black,Soul/Jazz,
2,Average White Band,Volume IIIV,Funk/Soul,
3,Billy Joel,52nd Street,Pop Rock,
4,Black Pumas,,Soul/Funk,



— Jazz entries (first 10) —


Unnamed: 0,artist,title,genre,year
1,Amy Winehouse,Back to Black,Soul/Jazz,
17,Carmen McRae-Dave Brubeck,"Take Five, Recorded Live at Basin Street East",Jazz,
24,Django Reinhardt,First Recordings,Jazz/Gypsy Jazz,
34,Ella Fitzgerald,sings the George and Ira Gershwin Song Book,Jazz/Vocal Standards,
53,Madeleine Peyroux,Careless Love,Jazz/Blues,
56,Miles Davis,Nefertiti,Jazz,
58,Nina Simone,Sings Her Greatest Hits,Jazz /Blues,
66,Ray Charles,The Great Ray Charles,Soul/Jazz,
70,Rosa Passos & Ron Carter,Entre Amigos,Jazz Bossa Nova,
75,Stan Getz,Cafe Montmartre,Jazz,



— 1980s entries (first 10) —


Unnamed: 0,artist,title,genre,year
19,Cheap Trick,The Epic Archive Vol. 2 (1980-1983),Rock,1980
20,Cheap Trick,The Epic Archive Vol. 3 (1984-1992),Rock,1984



— Search artist contains 'miles' —


Unnamed: 0,artist,title,genre,year
56,Miles Davis,Nefertiti,Jazz,
128,Miles Davis,Kind of Blue,Jazz,
