The goal of this notebook is to establish some basics for the rest of the dataset, such as how to access the data and how it will be cleaned to suit our purposes. 

# Importing and Accessing Data

In [1]:
import json

path = "cards.json"
with open(path, "r", encoding="utf8") as f:
    raw = f.read()
    data = json.loads(raw)

In [14]:
print(type(data))
print(type(data[0]))
print(data[0].keys())

# So you can iterate through data, and each element is a dictionary.

<class 'list'>
<class 'dict'>
dict_keys(['object', 'id', 'oracle_id', 'multiverse_ids', 'mtgo_id', 'mtgo_foil_id', 'tcgplayer_id', 'cardmarket_id', 'name', 'lang', 'released_at', 'uri', 'scryfall_uri', 'layout', 'highres_image', 'image_status', 'image_uris', 'mana_cost', 'cmc', 'type_line', 'oracle_text', 'colors', 'color_identity', 'keywords', 'legalities', 'games', 'reserved', 'foil', 'nonfoil', 'finishes', 'oversized', 'promo', 'reprint', 'variation', 'set_id', 'set', 'set_name', 'set_type', 'set_uri', 'set_search_uri', 'scryfall_set_uri', 'rulings_uri', 'prints_search_uri', 'collector_number', 'digital', 'rarity', 'flavor_text', 'card_back_id', 'artist', 'artist_ids', 'illustration_id', 'border_color', 'frame', 'full_art', 'textless', 'booster', 'story_spotlight', 'edhrec_rank', 'prices', 'related_uris', 'purchase_uris'])


In [4]:
data[12]['oracle_text']

# So we will also have to remove \n and replace them wiht something more suitable, probably ". " (a period followed by a space)

"Enchant creature\nWhen Waterknot enters the battlefield, tap enchanted creature.\nEnchanted creature doesn't untap during its controller's untap step."

# Cleaning

4 things need to be done 
- Remove the card from the dataset if it is in a joke or memorabilia set
- Make the oracle text look nice (replace `'\n'`)
- Remove the cardname from its oracle text
- Put into a bucket based on color identity

In [7]:
import re

colorDict = {
    'W': [], # white
    'U': [], # blue
    'B': [], # black
    'R': [], # red
    'G': [], # green
    'colorless': [] # everything else
}

for i in data:
    # Skip extra cards
    st = i['set_type'] 
    if st == 'funny' or st == "memorabilia":
        continue

    # Clean \n and sub out cardname 
    if 'oracle_text' in i.keys():
        text = i['oracle_text'] 
        text = re.sub("\n", " ", text)
        text = re.sub(re.escape(i['name']), "CARDNAME", text)
        i['oracle_text'] = text
    else:
        continue

    # Put into color buckets
    colors = i['color_identity']
    if not colors:
        colorDict['colorless'].append(i)
        continue
    
    for color in colors:
        colorDict[color].append(i)

return colorDict

In [12]:
colorDict

{'W': [], 'U': [], 'B': [], 'R': [], 'G': [], 'colorless': []}