# Embedding oracle text via word2vec
Credit to https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/


In [61]:
# Load the data using msgspec and pandas
import msgspec
import datetime
import pandas as pd
from pathlib import Path
import scipy
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Cached parquet copy of the card dump; built from the JSON file on first run.
parquet_file = Path("../datasets/default-cards-20240405090559.parquet")

if parquet_file.exists():
    # Cache hit: read the frame straight back from parquet.
    data = pd.read_parquet(parquet_file)
else:
    # Cache miss: decode the JSON dump into typed Card structs, turn them
    # into a DataFrame, and write the parquet cache for the next run.

    class Card(msgspec.Struct, dict=True):
        """Fields of a card record that are decoded from the JSON dump.

        Every field is optional and defaults to ``None``.
        """

        name: str | None = None
        mana_cost: str | None = None
        set: str | None = None
        cmc: float | None = None
        power: str | None = None
        toughness: str | None = None
        colors: list[str] | None = None
        oracle_text: str | None = None
        keywords: list[str] | None = None
        type_line: str | None = None
        released_at: datetime.date | None = None
        reprint: bool | None = None
        # legalities: dict[str, str] | None = None

    # Read the raw bytes first, then decode them as a list of Card objects.
    with open("../data/default-cards-20240405090559.json", "rb") as json_file:
        raw_json = json_file.read()
    dataset = msgspec.json.decode(raw_json, type=list[Card])

    # One dict per card becomes one DataFrame row.
    records = list(map(msgspec.structs.asdict, dataset))
    data = pd.DataFrame(records)
    data.to_parquet(parquet_file)

def _replace_card_name(text: object, name: object) -> str | None:
    """Return *text* with the card's own name replaced by ``CARDNAME``.

    Missing or empty text (``None``, NaN, ``""``) yields ``None``. When the
    name is missing or empty the text is returned unchanged: replacing an
    empty string would insert ``CARDNAME`` between every character, and a
    non-string name would make ``str.replace`` raise.
    """
    if not isinstance(text, str) or not text:
        return None
    if not isinstance(name, str) or not name:
        return text
    return text.replace(name, "CARDNAME")


# Remove reprints. The explicit copy makes the filtered frame independent of
# the loaded one, so the column assignments below do not write into a slice
# (which is what triggers pandas' SettingWithCopyWarning).
data = data[data["reprint"].eq(False)].copy()

# Clean the oracle text by replacing cardname, removing brackets.
# Built element-wise with an explicit index so the result is always a Series,
# including for an empty frame.
cleaned = pd.Series(
    [
        _replace_card_name(text, name)
        for text, name in zip(data["oracle_text"], data["name"])
    ],
    index=data.index,
    dtype=object,
)
cleaned = (
    # Drop parenthesised spans (non-greedy, within a single line).
    cleaned.str.replace(r"[\(].*?[\)]", "", regex=True)
    # Literal replacements: state regex=False explicitly, because the default
    # of Series.str.replace differs between pandas versions.
    .str.replace("{", "mana_", regex=False)
    .str.replace("}", " ", regex=False)
    .str.replace("\n", " ", regex=False)
    # Dict form is unambiguous: texts that ended up empty become missing.
    .replace({"": None})
)
data["oracle_text_clean"] = cleaned


In [62]:
# Split every non-missing cleaned oracle text into nltk word tokens,
# lower-casing each token after tokenization.
def _lowercase_tokens(sentence):
    """Return the lower-cased ``word_tokenize`` tokens of *sentence*."""
    return [token.lower() for token in word_tokenize(sentence)]


cleaned_text = data["oracle_text_clean"].dropna()
data["oracle_text_tokenized"] = cleaned_text.apply(_lowercase_tokens)


In [63]:
# Preview the first 10 rows: raw oracle text next to its cleaned and
# tokenized forms.
preview_columns = ["oracle_text", "oracle_text_clean", "oracle_text_tokenized"]
data.loc[:, preview_columns].head(10)


Unnamed: 0,oracle_text,oracle_text_clean,oracle_text_tokenized
0,All Sliver creatures have double strike.,All Sliver creatures have double strike.,"[all, sliver, creatures, have, double, strike, .]"
1,"When Kor Outfitter enters the battlefield, you...","When CARDNAME enters the battlefield, you may ...","[when, cardname, enters, the, battlefield, ,, ..."
3,Flying\nWhen Siren Lookout enters the battlefi...,"Flying When CARDNAME enters the battlefield, i...","[flying, when, cardname, enters, the, battlefi..."
5,Paradox — Draw a card for each spell you've ca...,Paradox — Draw a card for each spell you've ca...,"[paradox, —, draw, a, card, for, each, spell, ..."
6,,,
7,"When Venerable Knight dies, put a +1/+1 counte...","When CARDNAME dies, put a +1/+1 counter on tar...","[when, cardname, dies, ,, put, a, +1/+1, count..."
9,"Whenever you draw your second card each turn, ...","Whenever you draw your second card each turn, ...","[whenever, you, draw, your, second, card, each..."
12,Kicker {R} (You may pay an additional {R} as y...,Kicker mana_R Flying When CARDNAME enters th...,"[kicker, mana_r, flying, when, cardname, enter..."
15,Defender (This creature can't attack.)\n{3}: D...,Defender mana_3 : Destroy CARDNAME and target...,"[defender, mana_3, :, destroy, cardname, and, ..."
18,Reach\nWhen Whiptongue Hydra enters the battle...,"Reach When CARDNAME enters the battlefield, de...","[reach, when, cardname, enters, the, battlefie..."


In [64]:
# Fit Word2Vec on the token lists of all cards that have one
# (rows without tokenized text are dropped first).
training_sentences = data["oracle_text_tokenized"].dropna().to_list()
model = Word2Vec(
    sentences=training_sentences,
    vector_size=100,
    window=5,
    min_count=1,
)


In [65]:
# Compute one vector per card by passing its token list to the trained
# model's get_mean_vector; cards without tokens stay missing.
token_lists = data["oracle_text_tokenized"].dropna()
data["oracle_text_mean_vector"] = token_lists.apply(
    lambda tokens: model.wv.get_mean_vector(tokens)
)


In [66]:
# Define k-NN retrieval function
def get_similar_cards(i: int, n: int) -> None:
    """Print the ``n`` cards whose mean oracle-text vector is closest to card ``i``.

    Closeness is the cosine distance between the ``oracle_text_mean_vector``
    entries of the module-level ``data`` frame (smaller means more similar).
    The printed rows carry an extra ``score`` column with that distance. The
    query card itself takes part in the ranking. ``data`` is not modified.

    Args:
        i: Positional (``iloc``) row of the query card in ``data``.
        n: Number of rows to print.

    Raises:
        IndexError: If ``i`` is outside the frame.
        ValueError: If the card at position ``i`` has no mean vector.
    """
    # Imported locally so the submodule is loaded even when the file only
    # ran a bare ``import scipy``.
    from scipy.spatial.distance import cosine

    vectors = data["oracle_text_mean_vector"]

    # Look the query vector up once instead of once per compared card.
    query = vectors.iloc[i]
    if pd.api.types.is_scalar(query) and pd.isna(query):
        raise ValueError(f"card at position {i} has no oracle text vector")

    scores = vectors.dropna().apply(lambda vector: cosine(vector, query))

    # Attach the scores to a temporary frame so no "score" column is left
    # behind on the shared ``data`` frame. Cards without a vector get NaN and
    # sort last.
    ranked = data.assign(score=scores).sort_values("score")
    print(ranked.head(n))


In [67]:
# Example: print the 5 rows with the smallest cosine-distance score to the
# card at positional row 5 of `data`
get_similar_cards(5, 5)


                     name mana_cost  set  cmc power toughness colors  \
7        Venerable Knight       {W}  eld  1.0     2         1    [W]   
10252      Lawless Broker    {2}{B}  kld  3.0     3         2    [B]   
68257  Guul Draz Mucklord    {2}{B}  znr  3.0     2         3    [B]   
68753  Sparring Construct       {1}  dom  1.0     1         1     []   
42928    Spinal Centipede    {2}{B}  grn  3.0     3         2    [B]   

                                             oracle_text keywords  \
7      When Venerable Knight dies, put a +1/+1 counte...       []   
10252  When Lawless Broker dies, put a +1/+1 counter ...       []   
68257  When Guul Draz Mucklord dies, put a +1/+1 coun...       []   
68753  When Sparring Construct dies, put a +1/+1 coun...       []   
42928  When Spinal Centipede dies, put a +1/+1 counte...       []   

                           type_line released_at  reprint  \
7            Creature — Human Knight  2019-10-04    False   
10252    Creature — Aetherborn