In [17]:
# using json
import json

# Decode explicitly as UTF-8: the card text contains non-ASCII characters
# (for example the em dash in type lines), and without `encoding` open()
# falls back to the platform's locale encoding.
with open("../data/oracle-cards-20240301220151.json", encoding="utf-8") as json_file:
    dataset = json.load(json_file)

# Quick look at the raw schema. The field list comes from the first card
# only; other cards may carry a different set of keys.
print(f"Total number of cards: {len(dataset)}.")
print(f"Number of fields per card: {len(dataset[0].keys())}.")
print("\n".join(str(numbered_field) for numbered_field in enumerate(dataset[0].keys())))


Total number of cards: 31064.
Number of fields per card: 61.
(0, 'object')
(1, 'id')
(2, 'oracle_id')
(3, 'multiverse_ids')
(4, 'mtgo_id')
(5, 'mtgo_foil_id')
(6, 'tcgplayer_id')
(7, 'cardmarket_id')
(8, 'name')
(9, 'lang')
(10, 'released_at')
(11, 'uri')
(12, 'scryfall_uri')
(13, 'layout')
(14, 'highres_image')
(15, 'image_status')
(16, 'image_uris')
(17, 'mana_cost')
(18, 'cmc')
(19, 'type_line')
(20, 'oracle_text')
(21, 'colors')
(22, 'color_identity')
(23, 'keywords')
(24, 'legalities')
(25, 'games')
(26, 'reserved')
(27, 'foil')
(28, 'nonfoil')
(29, 'finishes')
(30, 'oversized')
(31, 'promo')
(32, 'reprint')
(33, 'variation')
(34, 'set_id')
(35, 'set')
(36, 'set_name')
(37, 'set_type')
(38, 'set_uri')
(39, 'set_search_uri')
(40, 'scryfall_set_uri')
(41, 'rulings_uri')
(42, 'prints_search_uri')
(43, 'collector_number')
(44, 'digital')
(45, 'rarity')
(46, 'flavor_text')
(47, 'card_back_id')
(48, 'artist')
(49, 'artist_ids')
(50, 'illustration_id')
(51, 'border_color')
(52, 'frame')


In [2]:
# using msgspec
import msgspec

class Card(msgspec.Struct, dict=True):
    """Typed view of one card record, declaring only a handful of its fields.

    Every field is optional and defaults to ``None``, so a record that lacks
    a key still decodes (the first record, an Artifact, ends up with
    ``power`` and ``toughness`` set to ``None``). Keys that are not declared
    here are skipped by msgspec's default decoding settings.
    """
    name: str | None = None
    mana_cost: str | None = None
    cmc: float | None = None
    # power/toughness are kept as strings; a later cell attempts float() on
    # them and records whether the conversion succeeded.
    power: str | None = None
    toughness: str | None = None
    colors: list[str] | None = None
    oracle_text: str | None = None
    keywords: list[str] | None = None
    type_line: str | None = None
    # Disabled field, kept for reference:
    # legalities: dict[str, str] | None = None

# Read the file as raw bytes ("rb") and let msgspec decode and validate it
# into a list of Card structs in a single step.
# NOTE: this rebinds `dataset`; the json cell bound the same name to a list
# of plain dicts.
with open("../data/oracle-cards-20240301220151.json", "rb") as json_file:
    dataset = msgspec.json.decode(json_file.read(), type=list[Card])

# Spot-check a single decoded record.
print(dataset[3])

def printdict(d: dict | list[tuple]):
    if isinstance(d, dict):
        d = d.items()
    for k, v in d:
        print(f"{k}: {v}")


Card(name='Storm Crow', mana_cost='{1}{U}', cmc=2.0, power='1', toughness='2', colors=['U'], oracle_text="Flying (This creature can't be blocked except by creatures with flying or reach.)", keywords=['Flying'], type_line='Creature — Bird')


In [3]:
# convert to pandas
import pandas
import itertools
from collections import Counter

# Flatten each Card struct into a plain dict so pandas builds one column per
# declared field.
df = pandas.DataFrame([msgspec.structs.asdict(card) for card in dataset])
# `features` collects the engineered columns added by the following cells.
# It starts with just the card name, copied so that later column assignments
# do not write into a slice of `df`.
features = df[["name"]].copy()
print(df.head())

# df["name"] = df["name"].str.lower()
# df["type_line"] = df["type_line"].str.lower()


                  name mana_cost  cmc power toughness colors  \
0           Static Orb       {3}  3.0  None      None     []   
1  Sensory Deprivation       {U}  1.0  None      None    [U]   
2       Road of Return    {G}{G}  2.0  None      None    [G]   
3           Storm Crow    {1}{U}  2.0     1         2    [U]   
4       Walking Sponge    {1}{U}  2.0     1         1    [U]   

                                         oracle_text   keywords  \
0  As long as Static Orb is untapped, players can...         []   
1   Enchant creature\nEnchanted creature gets -3/-0.  [Enchant]   
2  Choose one —\n• Return target permanent card f...  [Entwine]   
3  Flying (This creature can't be blocked except ...   [Flying]   
4  {T}: Target creature loses your choice of flyi...         []   

            type_line  
0            Artifact  
1  Enchantment — Aura  
2             Sorcery  
3     Creature — Bird  
4   Creature — Sponge  


In [4]:
# Numerical (cmc, power, toughness)
def is_float(s: str):
    try:
        f = float(s)
        return (1.0, f)
    except:
        return (0.0, float("nan"))
        
# apply(is_float) yields one (flag, value) tuple per card; zip(*...)
# transposes those tuples into one sequence of flags and one of values,
# which become two columns.
features["power_is_numeric"], features["power_float"] = zip(*df["power"].apply(is_float))
features["toughness_is_numeric"], features["toughness_float"] = zip(*df["toughness"].apply(is_float))
# cmc is declared as a float on Card, so it is copied over unchanged.
features["cmc"] = df["cmc"]


In [5]:
# Types
# Tokenise the type line on single spaces,
# e.g. "Creature — Bird" -> ["Creature", "—", "Bird"].
df["card_types"] = df["type_line"].str.split(" ")

# Count how often each token occurs. The keys, in first-seen order, define
# the type vocabulary used for the indicator columns.
unique_types_counts = Counter(itertools.chain.from_iterable(df["card_types"]))
# The em dash is a separator token, not a type, so it is removed from the
# vocabulary.
# NOTE(review): pop() without a default raises KeyError if no type line
# contains "—"; other separator tokens (if any exist) are not removed.
unique_types_counts.pop("—")

def get_types(list_of_types: list):
    """Encode a card's type tokens as a list of 0.0/1.0 indicators.

    The result has one entry per key of the module-level
    ``unique_types_counts``, in that mapping's iteration order: 1.0 when the
    key occurs in ``list_of_types``, 0.0 otherwise.
    """
    return [
        1.0 if known_type in list_of_types else 0.0
        for known_type in unique_types_counts
    ]

# One indicator vector per card, kept on `df` as a list-valued column.
df["is_type"] = df["card_types"].apply(get_types)
# Expand the vectors into one is_type_<token> column per vocabulary entry,
# sharing df's index so the join below lines rows up.
df_types = pandas.DataFrame(
    df["is_type"].to_list(),
    index=df.index,
    columns=[f"is_type_{key}" for key in unique_types_counts],
)

# Drop any is_type_* columns left over from an earlier run of this cell
# before joining: join() raises on overlapping column names, which would
# make the cell fail when it is re-executed.
features = features.drop(columns=df_types.columns, errors="ignore").join(df_types)


In [6]:
# Colors
# Count every color symbol across all cards. dropna() skips cards whose
# `colors` value is missing, because a missing value cannot be iterated by
# chain.from_iterable. The key order fixes the order of the color columns.
unique_colors_counts = Counter(itertools.chain.from_iterable(df["colors"].dropna()))

def get_colors(list_of_colors: list):
    """Encode a card's colors as a list of 0.0/1.0 indicators.

    Slot 0 is the "no colors" flag; the remaining slots follow the keys of
    the module-level ``unique_colors_counts`` in iteration order.

    Args:
        list_of_colors: The card's color symbols. ``None`` and an empty list
            are both treated as "no colors".

    Returns:
        A list of length ``1 + len(unique_colors_counts)``.
    """
    known_colors = list(unique_colors_counts)
    if not list_of_colors:
        # Size the vector from the known colors instead of assuming exactly
        # five, so every row has the same width as the column labels
        # whatever colors the data contains.
        return [1.0] + [0.0] * len(known_colors)
    return [0.0] + [1.0 if color in list_of_colors else 0.0 for color in known_colors]

# One indicator vector per card, kept on `df` as a list-valued column.
df["is_color"] = df["colors"].apply(get_colors)
# Expand into is_color_<symbol> columns; "C" labels the leading
# "no colors" slot that get_colors puts first.
df_colors = pandas.DataFrame(
    df["is_color"].to_list(),
    index=df.index,
    columns=[f"is_color_{key}" for key in ["C"] + list(unique_colors_counts)],
)

# Drop any is_color_* columns left over from an earlier run of this cell
# before joining: join() raises on overlapping column names, which would
# make the cell fail when it is re-executed.
features = features.drop(columns=df_colors.columns, errors="ignore").join(df_colors)


In [7]:
# Keywords
# Count every keyword across all cards; the key order fixes the order of the
# keyword columns.
# NOTE(review): there is no dropna() here, so a card whose `keywords` is
# None would make chain.from_iterable raise TypeError.
unique_keywords_counts = Counter(itertools.chain.from_iterable(df["keywords"]))

def get_keywords(list_of_keywords: list):
    """Encode a card's keywords as a list of 0.0/1.0 indicators.

    The result has one entry per key of the module-level
    ``unique_keywords_counts``, in that mapping's iteration order. A card
    with no keywords maps to an all-zero list of the same length.
    """
    if len(list_of_keywords) > 0:
        return [
            float(known_keyword in list_of_keywords)
            for known_keyword in unique_keywords_counts
        ]
    return [0.0] * len(unique_keywords_counts)

# One indicator vector per card, kept on `df` as a list-valued column.
df["is_keyword"] = df["keywords"].apply(get_keywords)
# Expand the vectors into one is_keyword_<keyword> column per known keyword,
# sharing df's index so the join below lines rows up.
df_keywords = pandas.DataFrame(
    df["is_keyword"].to_list(),
    index=df.index,
    columns=[f"is_keyword_{key}" for key in unique_keywords_counts],
)

# Drop any is_keyword_* columns left over from an earlier run of this cell
# before joining: join() raises on overlapping column names, which would
# make the cell fail when it is re-executed.
features = features.drop(columns=df_keywords.columns, errors="ignore").join(df_keywords)


In [8]:
# Interactive narrowing of the card pool: ask the yes/no question whose
# smaller side is as large as possible, then keep only the cards that are
# consistent with the answer.
query = (
    features
    .drop(columns=["power_is_numeric", "toughness_is_numeric"])
    .set_index("name")
)
import heapq

# Guarded so the body never divides by zero cards, never pops from an empty
# heap, and never asks a question once a single card is left.
while len(query) > 1 and len(query.columns) > 0:
    # calculate max median split for all features
    num_cards = len(query)
    split_ratios = []
    for feature in query.columns:
        med = query[feature].median()
        split_above = (query[feature] >= med).sum()
        split_below = (query[feature] < med).sum()

        # The question is phrased about the smaller side of the split. The
        # ratio is negated because heapq is a min-heap, so the most balanced
        # split is popped first.
        if split_below > split_above:
            heapq.heappush(split_ratios, (-split_above / num_cards, feature, ">=", med))
        else:
            heapq.heappush(split_ratios, (-split_below / num_cards, feature, "<", med))
    # NOTE(review): a 0/1 column whose ones are the minority has median 0, so
    # its "< 0" side is empty and it scores 0; such features are only picked
    # when nothing scores higher. Consider a "> median" split for that case.

    question = heapq.heappop(split_ratios)
    _, asked_feature, comparison, threshold = question
    answer = input(f"Q: Is {asked_feature} {comparison} {threshold}? [y/n] ")
    said_yes = answer.strip().lower() in {"y", "yes"}

    # Build the mask for the comparison that was actually asked, so "yes"
    # keeps exactly the cards the question describes. "No" keeps the
    # complement, which also retains rows whose value is NaN (for example
    # cards without a numeric power) instead of dropping them on either
    # answer.
    asked_column = query[asked_feature]
    matches = asked_column >= threshold if comparison == ">=" else asked_column < threshold
    query = query[matches] if said_yes else query[~matches]

    # remove singular columns: a column with a single distinct value (NaN
    # counted as a value) can no longer separate the remaining cards.
    query = query.loc[:, query.nunique(dropna=False) > 1]

    # Only one question is asked per run of this cell; remove this break to
    # keep asking until the loop condition ends the session.
    break
