One-click “Reset & Run” master cell

In [3]:
# --- MASTER CELL: safe re-run after any restart ---
# It will:
# 1) Detect repo_root, 2) load .env, 3) pick only Top Shelf files,
# 4) build records, 5) embed in memory, 6) answer one test query.

%pip -q install --upgrade numpy pandas openai python-dotenv tiktoken

from pathlib import Path
import os, json, pandas as pd, numpy as np
from dotenv import load_dotenv
from typing import List, Dict

# ---------- 1) detect repo root ----------
def detect_repo_root(start: Path) -> Path:
    for p in [start, *start.parents]:
        if (p/".env").exists() or (p/"README.md").exists() or (p/"data").exists():
            return p
    return start

CWD = Path.cwd()
repo_root = detect_repo_root(CWD)
print("CWD:", CWD)
print("Repo root:", repo_root)

# ---------- 2) load .env ----------
loaded = load_dotenv(repo_root / ".env", override=True)
print("Env loaded:", loaded)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise SystemExit("❌ OPENAI_API_KEY not found. Put it in repo_root/.env then re-run this cell.")

# ---------- 3) collect ONLY Top Shelf files ----------
data_dir = repo_root / "data"
if not data_dir.exists():
    raise SystemExit(f"❌ data folder not found at {data_dir}. Create it and add your files.")

topshelf_csv  = sorted([p for p in data_dir.glob("*.csv")  if "top" in p.name.lower() and "shelf" in p.name.lower()])
topshelf_json = sorted([p for p in data_dir.glob("*.json") if "top" in p.name.lower() and "shelf" in p.name.lower()])

if not (topshelf_csv or topshelf_json):
    print("⚠️ No Top Shelf files matched. Showing all data files instead:")
    print([f.name for f in data_dir.iterdir()])
    raise SystemExit("Rename your files to include 'top' and 'shelf', or adjust the filters in this cell.")

print("Using Top Shelf files:", [p.name for p in (topshelf_json + topshelf_csv)])

# ---------- 4) build records ----------
records: List[Dict] = []

def add_record(text: str, rid: str):
    text = (text or "").strip()
    if text:
        records.append({"id": rid, "text": text})

def join_item_fields(item: dict) -> str:
    # Prefer explicit text/content
    if isinstance(item, dict):
        txt = (item.get("text") or item.get("content") or "").strip()
        if txt:
            return txt
        # Otherwise join string-like fields
        parts = []
        for k in sorted(item.keys()):
            v = item[k]
            if isinstance(v, (str, int, float)) and str(v).strip():
                parts.append(f"{k}: {str(v).strip()}")
        return " | ".join(parts)
    return str(item)

# JSON first (if present)
for jp in topshelf_json:
    data = json.loads(jp.read_text(encoding="utf-8"))
    if not isinstance(data, list) or not data:
        continue
    for i, item in enumerate(data):
        add_record(join_item_fields(item), f"{jp.name}-{i}")

# Then CSVs
for cp in topshelf_csv:
    try:
        df = pd.read_csv(cp)
    except Exception:
        try:
            df = pd.read_csv(cp, encoding="utf-8")
        except Exception:
            df = pd.read_csv(cp, encoding="latin-1")
    # Choose columns (auto if not set)
    likely = {"artist","album","title","description","review","genre","notes","track","year"}
    csv_columns_to_join = [c for c in df.columns if c.lower() in likely] or [c for c in df.columns if df[c].dtype=="object"]

    def row_to_text(row):
        parts = []
        for c in csv_columns_to_join:
            val = row.get(c, "")
            if pd.notna(val) and str(val).strip():
                parts.append(f"{c}: {str(val).strip()}")
        return " | ".join(parts)

    for i, r in df.iterrows():
        add_record(row_to_text(r), f"{cp.name}-{i}")

if not records:
    raise SystemExit("❌ No records built from Top Shelf files. Check file contents/columns.")

print(f"✅ Built {len(records)} Top Shelf records.")
print("Example snippet:\n", records[0]["text"][:400])

# ---------- 5) embed in memory ----------
from openai import OpenAI
client = OpenAI()  # reads OPENAI_API_KEY from env

EMBEDDING_MODEL = "text-embedding-3-small"

MAX_DOCS = min(200, len(records))   # keep first run fast
docs = records[:MAX_DOCS]
texts = [d["text"] for d in docs]
ids   = [d["id"]   for d in docs]

def embed_texts(texts):
    vecs = []
    for t in texts:
        e = client.embeddings.create(model=EMBEDDING_MODEL, input=t).data[0].embedding
        vecs.append(np.array(e, dtype=np.float32))
    return np.vstack(vecs)

print(f"🔢 Embedding {len(texts)} docs…")
doc_vecs = embed_texts(texts)
doc_vecs = doc_vecs / (np.linalg.norm(doc_vecs, axis=1, keepdims=True) + 1e-12)
print("📐 Embeddings shape:", doc_vecs.shape)

# ---------- 6) ask one test question ----------
CHAT_MODEL = "gpt-4o-mini"

def top_k(query, k=5):
    q = client.embeddings.create(model=EMBEDDING_MODEL, input=query).data[0].embedding
    q = np.array(q, dtype=np.float32)
    q = q / (np.linalg.norm(q) + 1e-12)
    sims = doc_vecs @ q
    idxs = np.argsort(-sims)[:k]
    return [(ids[i], texts[i], float(sims[i])) for i in idxs]

def answer_with_context(query, k=5, temperature=0.2):
    hits = top_k(query, k=k)
    context = "\n\n".join(f"- {h[1]}" for h in hits)
    prompt = (
        "You are a precise assistant. Answer strictly using the CONTEXT below. "
        "If the answer is not in the context, say you don't know.\n\n"
        f"CONTEXT:\n{context}\n\n"
        f"QUESTION: {query}\n\nANSWER:"
    )
    resp = client.chat.completions.create(
        model=CHAT_MODEL,
        temperature=temperature,
        messages=[{"role":"user","content":prompt}],
    )
    return resp.choices[0].message.content.strip(), hits

question = "Name a notable artist or album and its genre from this Top Shelf collection, plus one distinctive detail."
ans, hits = answer_with_context(question, k=5)
print("\n🧠 ANSWER:\n", ans, "\n")
print("🔎 TOP HITS:")
for hid, htxt, score in hits[:3]:
    print(f"- {hid} | score={score:.3f} | {htxt[:160]}…")

Note: you may need to restart the kernel to use updated packages.
CWD: /workspaces/suburban-disco/notebooks/notebooks
Repo root: /workspaces/suburban-disco
Env loaded: True
Using Top Shelf files: ['The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv']
✅ Built 129 Top Shelf records.
Example snippet:
 Artist: 10,000 Maniacs | Title: In My Tribe | Genre: Alternative Rock
🔢 Embedding 129 docs…
📐 Embeddings shape: (129, 1536)

🧠 ANSWER:
 Notable artist: Simon & Garfunkel | Album: Parsley, Sage, Rosemary and Thyme | Genre: Folk Rock. Distinctive detail: This album features a blend of intricate harmonies and poetic lyrics. 

🔎 TOP HITS:
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-118 | score=0.401 | Artist: Various Artists | Title: Woodstock - Music from the Original Soundtrack | Genre: Rock/Folk/Blues…
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-74 | score=0.397 | Artist: Simon & Garfunkel | Title: Parsley, Sage, Rosemary and Thyme | Genre: Folk Rock…
- 

In [3]:
from pathlib import Path
import os

cwd = Path.cwd()
candidates = [cwd, *cwd.parents]
repo_root = None
for p in candidates:
    if (p/".env").exists() or (p/"README.md").exists():
        repo_root = p
        break

print("CWD:", cwd)
print("Repo root:", repo_root)
if repo_root:
    print(".env exists:", (repo_root/".env").exists(), "→", repo_root/".env")
    print("data exists:", (repo_root/"data").exists(), "→", repo_root/"data")
    if (repo_root/"data").exists():
        print("data files:", [f.name for f in (repo_root/"data").iterdir()])

CWD: /workspaces/suburban-disco/notebooks/notebooks
Repo root: /workspaces/suburban-disco
.env exists: True → /workspaces/suburban-disco/.env
data exists: True → /workspaces/suburban-disco/data
data files: ['top_rated_wines.csv', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled_embeddings.jsonl', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv']


Verify key being read correctly

In [2]:
import os
key = os.getenv("OPENAI_API_KEY")
print("Starts with:", key[:5])
print("Total length:", len(key))

Starts with: sk-sv
Total length: 167


In [6]:
from pathlib import Path
import os

# 1) Find repo root by walking up until we see a .env OR README.md
cwd = Path.cwd()
candidates = [cwd, *cwd.parents]
repo_root = None
for p in candidates:
    if (p/".env").exists() or (p/"README.md").exists():
        repo_root = p
        break

print("🔎 Notebook CWD:", cwd)
print("📦 Repo root:", repo_root)

# 2) Show what we have
if repo_root:
    env_path = repo_root/".env"
    data_dir = repo_root/"data"
    print("🧩 .env exists:", env_path.exists(), "→", env_path)
    print("📁 data exists:", data_dir.exists(), "→", data_dir)
    if data_dir.exists():
        print("📄 data files:", [f.name for f in data_dir.iterdir()])
else:
    print("⚠️ Could not determine repo root. We’ll assume cwd is correct.")

🔎 Notebook CWD: /workspaces/suburban-disco/notebooks/notebooks
📦 Repo root: /workspaces/suburban-disco
🧩 .env exists: True → /workspaces/suburban-disco/.env
📁 data exists: True → /workspaces/suburban-disco/data
📄 data files: ['top_rated_wines.csv', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled_embeddings.jsonl', 'The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv']


Cell A

In [7]:
%pip -q install python-dotenv

from dotenv import load_dotenv
import os

loaded = load_dotenv(repo_root/".env", override=True)
print("Env loaded:", loaded)
print("OpenAI key set:", bool(os.getenv("OPENAI_API_KEY")))

Note: you may need to restart the kernel to use updated packages.
Env loaded: True
OpenAI key set: True


In [6]:
import os
print(os.getenv("OPENAI_API_KEY")[:10])  # just the first 10 chars for privacy

sk-<sk-svc


Cell A.1 - (re)install core packages (safe to re-run)  

In [6]:
%pip -q install --upgrade numpy pandas openai tiktoken

[0mNote: you may need to restart the kernel to use updated packages.


Cell B1 — build Top Shelf records only (drop-in)

In [1]:
import pathlib, json, pandas as pd
from typing import List, Dict

records: List[Dict] = []
data_dir = repo_root / "data"

# Only load files that look like "Top Shelf"
topshelf_json = sorted([p for p in data_dir.glob("*.json") if "top" in p.name.lower() and "shelf" in p.name.lower()])
topshelf_csv  = sorted([p for p in data_dir.glob("*.csv")  if "top" in p.name.lower() and "shelf" in p.name.lower()])

def add_record(text: str, rid: str):
    text = (text or "").strip()
    if text:
        records.append({"id": rid, "text": text})

def join_item_fields(item: dict) -> str:
    txt = (item.get("text") or item.get("content") or "").strip() if isinstance(item, dict) else ""
    if txt:
        return txt
    if isinstance(item, dict):
        parts = []
        for k in sorted(item.keys()):
            v = item[k]
            if isinstance(v, (str, int, float)) and str(v).strip():
                parts.append(f"{k}: {str(v).strip()}")
        return " | ".join(parts)
    return str(item)

used_files = []

if topshelf_json:
    for jp in topshelf_json:
        used_files.append(jp.name)
        data = json.loads(jp.read_text(encoding="utf-8"))
        assert isinstance(data, list) and data, f"{jp.name} must be a non-empty JSON list"
        for i, item in enumerate(data):
            add_record(join_item_fields(item), f"{jp.name}-{i}")

elif topshelf_csv:
    for cp in topshelf_csv:
        used_files.append(cp.name)
        try:
            df2 = pd.read_csv(cp)
        except Exception:
            try:
                df2 = pd.read_csv(cp, encoding="utf-8")
            except Exception:
                df2 = pd.read_csv(cp, encoding="latin-1")

        # If you know exact column names, set them here:
        csv_columns_to_join = None  # e.g., ["artist","album","genre","description"]
        if csv_columns_to_join is None:
            likely = {"artist","album","title","description","review","genre","notes","track","year"}
            csv_columns_to_join = [c for c in df2.columns if c.lower() in likely] or \
                                  [c for c in df2.columns if df2[c].dtype == "object"]

        def row_to_text(row):
            parts = []
            for c in csv_columns_to_join:
                val = row.get(c, "")
                if pd.notna(val) and str(val).strip():
                    parts.append(f"{c}: {str(val).strip()}")
            return " | ".join(parts)

        for i, r in df2.iterrows():
            add_record(row_to_text(r), f"{cp.name}-{i}")

else:
    raise FileNotFoundError("No Top Shelf files found. Rename to include 'top' and 'shelf', or adjust the filters.")

print("Using files:", used_files)
print(f"Built {len(records)} Top Shelf records.")
print("Example:\n", (records[0]['text'][:500] if records else "NO RECORDS"))

NameError: name 'repo_root' is not defined

Cell C — embed in memory

In [None]:
%pip -q install openai numpy tiktoken

import numpy as np
from openai import OpenAI
client = OpenAI()  # uses OPENAI_API_KEY

EMBEDDING_MODEL = "text-embedding-3-small"

MAX_DOCS = min(200, len(records))
docs = records[:MAX_DOCS]
doc_texts = [d["text"] for d in docs]
doc_ids   = [d["id"]   for d in docs]

def embed_texts(texts):
    vecs = []
    for t in texts:
        e = client.embeddings.create(model=EMBEDDING_MODEL, input=t).data[0].embedding
        vecs.append(np.array(e, dtype=np.float32))
    return np.vstack(vecs)

print(f"Embedding {len(doc_texts)} docs…")
doc_vecs = embed_texts(doc_texts)
doc_vecs = doc_vecs / (np.linalg.norm(doc_vecs, axis=1, keepdims=True) + 1e-12)
print("Embeddings shape:", doc_vecs.shape)

Cell D: Ask RAG question

In [12]:
ans, hits = answer_with_context(
    "What's a well known Jazz record in the collection?",
    k=5
)
print("ANSWER:\n", ans, "\n")
print("TOP HITS:")
for hid, htxt, score in hits[:3]:
    print(f"- {hid} | score={score:.3f} | {htxt[:160]}…")

ANSWER:
 A well known Jazz record in the collection is "Kind of Blue" by Miles Davis. 

TOP HITS:
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-24 | score=0.476 | Artist: Django Reinhardt | Title: First Recordings | Genre: Jazz/Gypsy Jazz…
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-34 | score=0.450 | Artist: Ella Fitzgerald | Title: sings the George and Ira Gershwin Song Book | Genre: Jazz/Vocal Standards…
- The_Top_Shelf_Collection_RAG_with_artist_years_filled.csv-56 | score=0.444 | Artist: Miles Davis | Title: Nefertiti | Genre: Jazz…
