# 🧠 2_text_embeddings.ipynb

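Embeds each card's oracle text into a 384-dimensional vector using the pretrained `all-MiniLM-L6-v2` Sentence Transformer, then saves the resulting matrix to `../data/processed/text_embeddings.npy`.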

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from pathlib import Path


In [None]:
# === Load enriched card data ===
# Reads the enriched card table from disk and extracts the oracle text of
# every row. Produces:
#   df    - the full table as a DataFrame
#   texts - one string per row, in row order (missing text becomes "")
enriched_path = Path("../data/processed/enriched_cards.csv")

# Fail early with a readable message if the input file is missing.
if not enriched_path.exists():
    raise FileNotFoundError(f"❌ Could not find enriched dataset at {enriched_path}")

df = pd.read_csv(enriched_path)

# Validate the required column up front so a wrong or stale CSV is reported
# clearly instead of surfacing as a bare KeyError('oracle_text').
if "oracle_text" not in df.columns:
    raise KeyError(
        f"❌ Column 'oracle_text' not found in {enriched_path}; "
        f"available columns: {list(df.columns)}"
    )

# fillna("") must come before astype(str): converting first would turn
# missing values into the literal text "nan". The cast guarantees every
# entry is a str even if pandas inferred a non-string dtype for the column.
texts = df["oracle_text"].fillna("").astype(str).tolist()

print(f"✅ Loaded {len(texts)} oracle text entries.")


In [None]:
# === Load transformer model and encode ===
# Instantiates the Sentence Transformer identified by "all-MiniLM-L6-v2" and
# encodes every entry of `texts` into an embedding. Produces:
#   model      - the loaded SentenceTransformer
#   embeddings - NumPy array holding the encoded texts
model = SentenceTransformer("all-MiniLM-L6-v2")

print("⚙️ Generating embeddings (this may take a few minutes)...")
embeddings = model.encode(texts, show_progress_bar=True)
# np.asarray returns its argument unchanged when it is already an ndarray,
# so the embedding matrix is not duplicated in memory the way an
# unconditional np.array(...) copy would; other return types are still
# converted to an ndarray.
embeddings = np.asarray(embeddings)
print("✅ Embeddings shape:", embeddings.shape)


In [None]:
# === Save embeddings ===
# Persist the embedding array in NumPy's .npy format, in the same
# processed-data directory the enriched card CSV is read from.
output_path = Path("../data/processed/text_embeddings.npy")
np.save(file=output_path, arr=embeddings)
print(f"✅ Saved embeddings to {output_path}")
