In [1]:
# Load the data using msgspec and pandas
import msgspec
import datetime
import pandas as pd
from pathlib import Path

# Define the Card object to specify the different fields
class Card(msgspec.Struct, dict=True):
    name: str | None = None
    mana_cost: str | None = None
    set: str | None = None
    cmc: float | None = None
    power: str | None = None
    toughness: str | None = None
    colors: list[str] | None = None
    oracle_text: str | None = None
    keywords: list[str] | None = None
    type_line: str | None = None
    released_at: datetime.date | None = None
    reprint: bool | None = None
    image_uris: dict | None = None
    id: str | None = None
    # legalities: dict[str, str] | None = None

# Decode json into Card objects
with open("../data/default-cards-20240508090556.json", "rb") as json_file:
    dataset = msgspec.json.decode(json_file.read(), type=list[Card])

# Load into to a pandas dataframe
data = pd.DataFrame([msgspec.structs.asdict(card) for card in dataset])

# Remove reprints
data = data[data["reprint"] == False]


In [8]:
import urllib.request
import shutil
import time
from tqdm.notebook import tqdm_notebook
no_art = []
def download_art(row):
    if row["image_uris"] is None:
        no_art.append(row["id"])
        return
    output_file = Path("../data/art/", row["id"] + ".jpg")
    if output_file.exists():
        return
    response = urllib.request.urlopen(row["image_uris"]["art_crop"])
    with open(output_file, 'wb') as writer:
        shutil.copyfileobj(response, writer, 5*1024*1024)
    time.sleep(0.1)
    # (row["id"], row["image_uris"]["art_crop"]))

tqdm_notebook.pandas()
data[["id", "image_uris"]].progress_apply(download_art, axis=1)


  0%|          | 0/38378 [00:00<?, ?it/s]

0        None
1        None
3        None
5        None
6        None
         ... 
94832    None
94833    None
94836    None
94840    None
94841    None
Length: 38378, dtype: object