# 📘 0_parsing_mechanics.ipynb

**Goal:** Load, clean, and analyze structured Magic: The Gathering (MTG) mechanics for use as multi-label targets in a card-prediction ML pipeline.

In [None]:
# Step 0: imports
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# ──────────────────────────────────────────────────────────────
# Step 1: Load canonical mechanics list from file
# Source: generate_full_mechanics_list.py
# This is the full, deduplicated and labeled mechanic dataset.
# ──────────────────────────────────────────────────────────────

sns.set(style="whitegrid")

mechanics_path = Path("../data/static/ml_ready_mechanics.json")
# Decode explicitly as UTF-8 so the load does not depend on the
# platform's default locale encoding.
with mechanics_path.open(encoding="utf-8") as f:
    mechanics = json.load(f)

print(f"✅ Loaded {len(mechanics)} mechanics from: {mechanics_path}")


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 2: Strip the 'cards' field from every mechanic entry
# The field is dropped in place; entries without it are left as-is.
# ──────────────────────────────────────────────────────────────

for m in mechanics:
    if "cards" in m:
        del m["cards"]

print("🧹 Removed 'cards' field from all mechanic entries.")


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 3: Build a DataFrame from the mechanic entries
# One row per mechanic; the first five rows are shown as a preview.
# ──────────────────────────────────────────────────────────────

df = pd.DataFrame(data=mechanics)
df.head(n=5)


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 4: Horizontal bar chart of mechanic counts per 'type'
# Bars are ordered from the most to the least frequent type.
# ──────────────────────────────────────────────────────────────

fig, ax = plt.subplots(figsize=(8, 5))
type_order = df["type"].value_counts().index  # most common type first
sns.countplot(data=df, y="type", order=type_order, ax=ax)
ax.set_title("Mechanic Types")
ax.set_xlabel("Count")
ax.set_ylabel("Type")
fig.tight_layout()
plt.show()


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 5: Histogram of the 'card_count' column across mechanics
# The y-axis is log-scaled so sparsely populated bins stay visible.
# ──────────────────────────────────────────────────────────────

fig, ax = plt.subplots(figsize=(10, 5))
# log_scale=(False, True): linear x-axis, logarithmic y-axis.
sns.histplot(df["card_count"], bins=50, log_scale=(False, True), ax=ax)
ax.set_title("Mechanic Card Count Distribution")
ax.set_xlabel("# of Cards Tagged With Mechanic")
ax.set_ylabel("Frequency (log scale)")
fig.tight_layout()
plt.show()


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 6: Select mechanics whose card_count is at or below a threshold
# Reports how many there are and previews the ten with the fewest cards.
# ──────────────────────────────────────────────────────────────

rare_thresh = 3
rare = df.loc[df["card_count"] <= rare_thresh]
print(f"🚨 {len(rare)} mechanics have ≤ {rare_thresh} card matches.")

preview_columns = ["name", "type", "card_count"]
rare.loc[:, preview_columns].sort_values(by="card_count").head(10)


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 7: Encode each mechanic's definition text with Sentence-BERT
# Missing definitions are encoded as empty strings so every row of
# df gets an embedding, in row order.
# ──────────────────────────────────────────────────────────────

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

definition_text = df["definition"].fillna("")
definitions = definition_text.tolist()
definition_embeddings = model.encode(
    definitions,
    show_progress_bar=True,
)


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 8: Save the embeddings to disk for UMAP, clustering, or matching
# ──────────────────────────────────────────────────────────────

import numpy as np

embeddings_path = Path("../data/processed/mechanic_definition_embeddings.npy")
# np.save does not create missing directories, so make sure the
# output folder exists before writing (no-op when it already does).
embeddings_path.parent.mkdir(parents=True, exist_ok=True)
np.save(embeddings_path, definition_embeddings)
print("💾 Saved mechanic definition embeddings.")


In [None]:
# ──────────────────────────────────────────────────────────────
# Step 9: Create a mechanic → index map for ML label encoding
# Used when training classifiers or formatting multi-label outputs.
# ──────────────────────────────────────────────────────────────

# Number mechanics by row position rather than by DataFrame index label,
# so the indices are always contiguous Python ints (0..n-1) in row order,
# even if df has been filtered or re-indexed.
# NOTE(review): if 'name' contains duplicates, the last occurrence wins
# and the map will have fewer entries than df has rows — confirm upstream
# deduplication.
mechanic_to_idx = {name: idx for idx, name in enumerate(df["name"])}
idx_to_mechanic = {idx: name for name, idx in mechanic_to_idx.items()}

label_map_path = Path("../data/processed/mechanic_label_map.json")
# Ensure the output folder exists; open() will not create it.
label_map_path.parent.mkdir(parents=True, exist_ok=True)
with label_map_path.open("w", encoding="utf-8") as f:
    json.dump(mechanic_to_idx, f, indent=2)

print(f"📘 Saved {len(mechanic_to_idx)} mechanic labels.")
