# Embedding oracle text via word2vec


In [26]:
# Load the data using msgspec and pandas
import msgspec
import datetime
import pandas as pd
from pathlib import Path
import scipy

# Parquet cache of the decoded card dump; it is created on the first run.
# NOTE(review): the cache lives under ../datasets while the raw JSON is read
# from ../data -- confirm that the two different directories are intentional.
parquet_file = Path("../datasets/default-cards-20240405090559.parquet")

if parquet_file.exists():
    # The cache is already on disk: read it back instead of decoding the JSON.
    data = pd.read_parquet(parquet_file)
else:
    # Schema of the fields decoded from each JSON card object.
    # Every field is optional and defaults to None.
    class Card(msgspec.Struct, dict=True):
        name: str | None = None
        mana_cost: str | None = None
        set: str | None = None
        cmc: float | None = None
        power: str | None = None
        toughness: str | None = None
        colors: list[str] | None = None
        oracle_text: str | None = None
        keywords: list[str] | None = None
        type_line: str | None = None
        released_at: datetime.date | None = None
        reprint: bool | None = None
        # legalities: dict[str, str] | None = None

    # Decode the raw JSON bytes straight into a list of Card structs.
    with open("../data/default-cards-20240405090559.json", "rb") as json_file:
        dataset = msgspec.json.decode(json_file.read(), type=list[Card])

    # Turn the structs into plain dicts, build the DataFrame, and write the
    # Parquet cache so that later runs take the fast path above.
    data = pd.DataFrame([msgspec.structs.asdict(card) for card in dataset])
    data.to_parquet(parquet_file)

# Remove reprints
# Keep only rows whose "reprint" flag equals False. ``.copy()`` detaches the
# filtered rows from the loaded frame so the column assignment at the end of
# this block does not write into a slice of it (SettingWithCopyWarning).
data = data[data["reprint"].eq(False)].copy()


def _mask_card_name(name, oracle_text):
    """Return *oracle_text* with the card's own name replaced by ``CARDNAME``.

    Returns ``None`` when the text is missing or empty. The text is returned
    unchanged when the name is missing or empty.
    """
    if not isinstance(oracle_text, str) or not oracle_text:
        return None
    if not isinstance(name, str) or not name:
        # A non-string name would make str.replace raise, and an empty name
        # would insert the placeholder between every character of the text.
        return oracle_text
    return oracle_text.replace(name, "CARDNAME")


# Pair the two columns directly instead of using a row-wise DataFrame.apply:
# this avoids building a Series per row and also works on an empty frame.
oracle_text_clean = pd.Series(
    [
        _mask_card_name(name, text)
        for name, text in zip(data["name"], data["oracle_text"])
    ],
    index=data.index,
    dtype="object",
)

# Drop parenthesised text (non-greedy, so each "(...)" group is removed on its
# own; "." does not match a newline, so a group must sit on a single line).
oracle_text_clean = oracle_text_clean.str.replace(r"[\(].*?[\)]", "", regex=True)

# The remaining replacements are literal. ``regex=False`` is passed explicitly
# because the default of ``Series.str.replace`` differs between pandas versions
# and "{" / "}" are regex metacharacters.
oracle_text_clean = oracle_text_clean.str.replace("{", "mana_", regex=False)
oracle_text_clean = oracle_text_clean.str.replace("}", " ", regex=False)
oracle_text_clean = oracle_text_clean.str.replace("\n", " ", regex=False)

# Mark texts that ended up empty or whitespace-only as missing. Assigning
# through a boolean mask is used instead of ``Series.replace("", None)``, which
# forward-fills rather than inserting None on older pandas versions.
is_blank = oracle_text_clean.str.strip().eq("")
oracle_text_clean[is_blank] = None

data["oracle_text_clean"] = oracle_text_clean


In [27]:
# Show every row whose card name is exactly "Fury Sliver".
data.loc[data["name"].eq("Fury Sliver")]

Unnamed: 0,name,mana_cost,set,cmc,power,toughness,colors,oracle_text,keywords,type_line,released_at,reprint,oracle_text_clean
0,Fury Sliver,{5}{R},tsp,6.0,3,3,[R],All Sliver creatures have double strike.,[],Creature — Sliver,2006-10-06,False,All Sliver creatures have double strike.


In [28]:
# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize # sent_tokenize, 

# Split each cleaned text into NLTK word tokens and lower-case every token.
# Rows without a cleaned text are skipped, so they end up missing in the new
# column after index alignment.
data["oracle_text_tokenized"] = (
    data["oracle_text_clean"]
    .dropna()
    .apply(lambda text: list(map(str.lower, word_tokenize(text))))
)

# data_tokenized = []
# for sentence in data["oracle_text_clean"]:
#     if sentence is None:
#         continue
#     temp = []
#     for word in word_tokenize(sentence):
#         temp.append(word.lower())
    
#     data_tokenized.append(temp)

# data_tokenized

In [29]:
# Compare the raw, cleaned and tokenised text for the first ten rows.
text_columns = ["oracle_text", "oracle_text_clean", "oracle_text_tokenized"]
data[text_columns].head(10)


Unnamed: 0,oracle_text,oracle_text_clean,oracle_text_tokenized
0,All Sliver creatures have double strike.,All Sliver creatures have double strike.,"[all, sliver, creatures, have, double, strike, .]"
1,"When Kor Outfitter enters the battlefield, you...","When CARDNAME enters the battlefield, you may ...","[when, cardname, enters, the, battlefield, ,, ..."
3,Flying\nWhen Siren Lookout enters the battlefi...,"Flying When CARDNAME enters the battlefield, i...","[flying, when, cardname, enters, the, battlefi..."
5,Paradox — Draw a card for each spell you've ca...,Paradox — Draw a card for each spell you've ca...,"[paradox, —, draw, a, card, for, each, spell, ..."
6,,,
7,"When Venerable Knight dies, put a +1/+1 counte...","When CARDNAME dies, put a +1/+1 counter on tar...","[when, cardname, dies, ,, put, a, +1/+1, count..."
9,"Whenever you draw your second card each turn, ...","Whenever you draw your second card each turn, ...","[whenever, you, draw, your, second, card, each..."
12,Kicker {R} (You may pay an additional {R} as y...,Kicker mana_R Flying When CARDNAME enters th...,"[kicker, mana_r, flying, when, cardname, enter..."
15,Defender (This creature can't attack.)\n{3}: D...,Defender mana_3 : Destroy CARDNAME and target...,"[defender, mana_3, :, destroy, cardname, and, ..."
18,Reach\nWhen Whiptongue Hydra enters the battle...,"Reach When CARDNAME enters the battlefield, de...","[reach, when, cardname, enters, the, battlefie..."


In [30]:
# Train Word2Vec on every tokenised text; rows without tokens are left out.
training_sentences = data["oracle_text_tokenized"].dropna().to_list()
model = Word2Vec(
    sentences=training_sentences,
    vector_size=100,
    window=5,
    min_count=1,
)

# Show the vocabulary the model learned.
model.wv.key_to_index.keys()




In [31]:
# Collapse each token list into a single vector with the model's
# get_mean_vector. Rows without tokens are skipped and stay missing.
token_lists = data["oracle_text_tokenized"].dropna()
data["oracle_text_mean_vector"] = token_lists.apply(model.wv.get_mean_vector)


In [32]:
def get_similar_cards(i: int, n: int = 5, cards=None):
    """Return the *n* cards whose mean text vector is closest to card *i*.

    Closeness is the cosine distance between the "oracle_text_mean_vector"
    entries. The reference card itself is part of the result (at distance ~0).

    Args:
        i: Positional (``iloc``) index of the reference card.
        n: Maximum number of rows to return.
        cards: Frame to search, which must have an "oracle_text_mean_vector"
            column. Defaults to the notebook-level ``data`` frame.

    Returns:
        A new DataFrame holding the closest rows, ordered by ascending
        distance, with an added "distance" column. The searched frame is not
        modified. Row lookup is by index label, so the index is assumed to be
        unique.

    Raises:
        ValueError: If the reference card has no mean vector.
    """
    # Imported locally so the function does not depend on ``scipy.spatial``
    # having been loaded as a side effect of a bare ``import scipy``.
    from scipy.spatial.distance import cosine

    frame = data if cards is None else cards
    vectors = frame["oracle_text_mean_vector"]

    # Fetch the reference vector once instead of once per compared row.
    reference = vectors.iloc[i]
    if reference is None or isinstance(reference, float):
        # Missing entries show up as None or NaN (a float), not as a vector.
        raise ValueError(f"card at position {i} has no mean vector")

    distances = vectors.dropna().apply(lambda vector: cosine(vector, reference))
    closest = distances.sort_values().head(n)
    return frame.loc[closest.index].assign(distance=closest)


In [33]:
# Cosine distance between every card's mean vector and the one in the first
# row. The reference vector is looked up once here rather than inside the
# lambda, where it would be re-fetched for every row.
reference_vector = data["oracle_text_mean_vector"].iloc[0]
data["distance_from_card_0"] = (
    data["oracle_text_mean_vector"]
    .dropna()
    .apply(lambda vector: scipy.spatial.distance.cosine(vector, reference_vector))
)

# The five rows closest to the first card (the card itself comes first).
data.sort_values("distance_from_card_0").head(5)


Unnamed: 0,name,mana_cost,set,cmc,power,toughness,colors,oracle_text,keywords,type_line,released_at,reprint,oracle_text_clean,oracle_text_tokenized,oracle_text_mean_vector,distance_from_card_0
0,Fury Sliver,{5}{R},tsp,6.0,3,3,[R],All Sliver creatures have double strike.,[],Creature — Sliver,2006-10-06,False,All Sliver creatures have double strike.,"[all, sliver, creatures, have, double, strike, .]","[-0.10840225, 0.046149787, 0.00978528, 0.06053...",2.072176e-08
75459,Spitting Sliver,{4}{B},plc,5.0,3,3,[B],All Sliver creatures have first strike.,[],Creature — Sliver,2007-02-02,False,All Sliver creatures have first strike.,"[all, sliver, creatures, have, first, strike, .]","[-0.1030586, 0.034527756, 0.027147273, 0.06166...",0.022977
88835,Talon Sliver,{1}{W},tmp,2.0,1,1,[W],All Sliver creatures have first strike.,[],Creature — Sliver,1997-10-14,False,All Sliver creatures have first strike.,"[all, sliver, creatures, have, first, strike, .]","[-0.1030586, 0.034527756, 0.027147273, 0.06166...",0.022977
70732,Synchronous Sliver,{4}{U},plc,5.0,3,3,[U],All Sliver creatures have vigilance.,[],Creature — Sliver,2007-02-02,False,All Sliver creatures have vigilance.,"[all, sliver, creatures, have, vigilance, .]","[-0.10191517, 0.047366787, 0.02679268, 0.06875...",0.06345471
17533,Two-Headed Sliver,{1}{R},tsp,2.0,1,1,[R],All Sliver creatures have menace. (They can't ...,[],Creature — Sliver,2006-10-06,False,All Sliver creatures have menace.,"[all, sliver, creatures, have, menace, .]","[-0.10670858, 0.06355855, 0.030505238, 0.05853...",0.06693846
