# 🌐 3_umap_visualization.ipynb

Reduces oracle text embeddings to 2D using UMAP and visualizes clusters by card metadata.

In [None]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
# Set seaborn's global plot style to "whitegrid" for the figures in this notebook.
sns.set(style="whitegrid")


In [None]:
# === Load embeddings and metadata ===
embed_path = Path("../data/processed/text_embeddings.npy")
cards_path = Path("../data/processed/enriched_cards.csv")

# Fail early, with the offending path in the message, if an input is missing.
if not embed_path.exists():
    raise FileNotFoundError(f"❌ Could not find text embeddings at {embed_path}")
if not cards_path.exists():
    raise FileNotFoundError(f"❌ Could not find enriched card data at {cards_path}")

embeddings = np.load(embed_path)
df = pd.read_csv(cards_path)

print(f"✅ Loaded {embeddings.shape[0]} embeddings and {len(df)} cards")

# The 2D coordinates are written back onto df positionally (row i of the
# embeddings goes to row i of df), so a row-count mismatch would only surface
# after the expensive UMAP fit. Check it here instead.
if embeddings.shape[0] != len(df):
    raise ValueError(
        f"❌ Row count mismatch: {embeddings.shape[0]} embeddings vs {len(df)} cards"
    )


In [None]:
# === Run UMAP ===
# Fit a UMAP reducer on the loaded embeddings (cosine metric, fixed
# random_state) and keep the transformed array it returns.
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
)
embedding_2d = reducer.fit_transform(embeddings)

# Column 0 becomes the x coordinate and column 1 the y coordinate on df.
df["umap_x"], df["umap_y"] = embedding_2d[:, 0], embedding_2d[:, 1]

print("✅ UMAP reduction complete")


In [None]:
# === Plot UMAP by card color ===
# Build an eight-color "hsv" palette and open the 10x8 inch figure that the
# color scatter plot in this cell is drawn on.
palette = sns.color_palette("hsv", n_colors=8)
plt.figure(figsize=(10, 8))

def color_to_str(color_list):
    """Turn a stringified color collection into a compact, order-independent label.

    Args:
        color_list: A cell value from the ``colors`` column. Only strings are
            parsed; they are expected to hold a Python literal such as
            ``"['W', 'U']"``.

    Returns:
        The parsed items sorted and concatenated (``"['W', 'U']"`` -> ``"UW"``),
        or ``"C"`` (colorless) when the value is not a string, parses to an
        empty/falsy value, or cannot be parsed or joined.
    """
    if not isinstance(color_list, str):
        # NaN / missing cells and any non-string value count as colorless.
        return "C"
    try:
        # literal_eval only accepts Python literals, so text read from the CSV
        # can never execute code the way eval() would. Strip first because
        # older Python versions reject leading whitespace.
        parsed = ast.literal_eval(color_list.strip())
        return "".join(sorted(parsed)) if parsed else "C"  # C = Colorless
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed text, non-literal expressions, or items that cannot be
        # sorted/joined as strings all fall back to colorless.
        return "C"

# Collapse each card's color list into a single label (e.g. "UW"; "C" = colorless).
df["color_str"] = df["colors"].apply(color_to_str)

# Size the palette from the data: multi-color cards produce more than eight
# distinct labels, and seaborn rejects or warns about (depending on version) a
# list palette whose length differs from the number of hue levels.
palette = sns.color_palette("hsv", df["color_str"].nunique())

sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="color_str",
    palette=palette,
    s=10,
    linewidth=0,
)
plt.title("UMAP Projection of Oracle Text Embeddings (by Color)")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(title="Color Identity", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path("../visualizations").mkdir(parents=True, exist_ok=True)
plt.savefig("../visualizations/umap_by_color.png", dpi=300)
plt.show()


In [None]:
# === Add mechanic count for clustering visualization ===
def _count_mechanics(value):
    """Return how many mechanics a ``parsed_mechanics`` cell holds.

    Args:
        value: A real list, the string form of a Python literal (as produced
            when a list column is round-tripped through CSV), or anything else
            (e.g. NaN for a missing cell).

    Returns:
        ``len`` of the list / parsed literal, or 0 when the value is neither a
        list nor a string, cannot be parsed as a literal, or has no length.
    """
    if isinstance(value, list):
        return len(value)
    if not isinstance(value, str):
        return 0
    try:
        # literal_eval parses literals only, so CSV text is never executed.
        return len(ast.literal_eval(value.strip()))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed cell or a literal without a length: count as no mechanics.
        return 0


df["mechanic_count"] = df["parsed_mechanics"].apply(_count_mechanics)


In [None]:
# === UMAP Cluster by Color Identity ===
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="color_str",
    s=10,
    linewidth=0,
    # "tab10" offers ten distinct colors; switch to "husl" beyond that.
    palette="tab10" if df["color_str"].nunique() <= 10 else "husl",
)
plt.title("UMAP Projection by Color Identity")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(title="Color Identity", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path("../visualizations").mkdir(parents=True, exist_ok=True)
plt.savefig("../visualizations/umap_by_color_identity.png", dpi=300)
plt.show()


In [None]:
# === UMAP Cluster by Card Type (type_line) ===
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="type_line",
    s=10,
    linewidth=0,
    # "tab10" offers ten distinct colors; switch to "husl" beyond that.
    palette="tab10" if df["type_line"].nunique() <= 10 else "husl",
)
plt.title("UMAP Projection by Card Type (type_line)")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(title="Card Type (type_line)", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path("../visualizations").mkdir(parents=True, exist_ok=True)
plt.savefig("../visualizations/umap_by_card_type.png", dpi=300)
plt.show()


In [None]:
# === UMAP Cluster by Rarity ===
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="rarity",
    s=10,
    linewidth=0,
    # "tab10" offers ten distinct colors; switch to "husl" beyond that.
    palette="tab10" if df["rarity"].nunique() <= 10 else "husl",
)
plt.title("UMAP Projection by Rarity")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(title="Rarity", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path("../visualizations").mkdir(parents=True, exist_ok=True)
plt.savefig("../visualizations/umap_by_rarity.png", dpi=300)
plt.show()


In [None]:
# === UMAP Cluster by Set ===
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="set",
    s=10,
    linewidth=0,
    # "tab10" offers ten distinct colors; switch to "husl" beyond that.
    palette="tab10" if df["set"].nunique() <= 10 else "husl",
)
plt.title("UMAP Projection by Set")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(title="Set", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path("../visualizations").mkdir(parents=True, exist_ok=True)
plt.savefig("../visualizations/umap_by_set.png", dpi=300)
plt.show()


In [None]:
# === UMAP Cluster by Converted Mana Cost (CMC) ===
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="cmc",
    s=10,
    linewidth=0,
    # "tab10" offers ten distinct colors; switch to "husl" beyond that.
    palette="tab10" if df["cmc"].nunique() <= 10 else "husl",
)
plt.title("UMAP Projection by Converted Mana Cost (CMC)")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(title="Converted Mana Cost (CMC)", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path("../visualizations").mkdir(parents=True, exist_ok=True)
plt.savefig("../visualizations/umap_by_cmc.png", dpi=300)
plt.show()


In [None]:
# === UMAP Cluster by Mechanic Count ===
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="umap_x",
    y="umap_y",
    hue="mechanic_count",
    s=10,
    linewidth=0,
    # "tab10" offers ten distinct colors; switch to "husl" beyond that.
    palette="tab10" if df["mechanic_count"].nunique() <= 10 else "husl",
)
plt.title("UMAP Projection by Mechanic Count")
# Anchor the legend outside the axes so it does not cover the points.
plt.legend(title="Mechanic Count", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path("../visualizations").mkdir(parents=True, exist_ok=True)
plt.savefig("../visualizations/umap_by_mechanic_count.png", dpi=300)
plt.show()
