In [None]:
# Data Loading
import requests

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Manipulation
import numpy as np
import pandas as pd

# Machine Learning
import torch
from torch.nn import LSTM, Linear, Sigmoid
from torch.nn.utils.rnn import pack_sequence

We begin by loading card data from [Scryfall](https://scryfall.com/), a Magic: The Gathering search engine and data aggregator. We clean the data by removing cards with non-standard layouts (e.g. multi-faced cards), digital-only cards, joke cards, and cards with no rules text, as well as cards that contain rarely used characters.

In [None]:
# Load the Scryfall dump and narrow it down one filter at a time. The filters
# are applied sequentially (each lambda sees the output of the previous step),
# so the text-based filters only ever run on rows that survived the earlier ones.
cards_raw = (
    pd.read_json('data/scryfall-data.json')
    # Keep only cards with the plain layout (drops multi-faced / oddly formatted cards)
    .loc[lambda frame: frame['layout'] == 'normal']
    # Drop digital-only cards
    .loc[lambda frame: ~frame['digital']]
    # Drop joke cards
    .loc[lambda frame: frame['set_type'] != 'funny']
    # Drop cards with no rules text
    .loc[lambda frame: frame['oracle_text'].str.len() > 0]
    # Drop cards containing uncommon characters, which shrinks the alphabet
    # that later has to be one-hot encoded
    .loc[lambda frame: ~frame['oracle_text'].str.contains(r'[!%?úíÉ\[\]]')]
    # Renumber the surviving rows 0..n-1
    .reset_index(drop=True)
)

# Shrink the alphabet further by folding near-equivalent characters together:
# the "minus" sign becomes a hyphen, and semicolons become commas
cards_raw['oracle_text'] = (
    cards_raw['oracle_text']
    .str.replace('−', '-')
    .str.replace(';', ',')
)

### Generalize card names appearing in rules text ###
# This is because the name of the card is irrelevant to its effect, 
# and cards can even be reprinted with different names
def generalize_name(card):
    """Return a copy of ``card`` whose rules text refers to the card as ``~``.

    Every occurrence of ``card['name']`` inside ``card['oracle_text']`` is
    replaced with a tilde. The input ``card`` itself is left untouched.
    """
    result = card.copy()
    rules_text = card['oracle_text']
    result['oracle_text'] = rules_text.replace(card['name'], '~')
    return result

# Apply the name generalization to every card (one call per row)
cards = cards_raw.apply(generalize_name, axis='columns')

We then load data from EDHREC, another data aggregator, which has tags on many cards relating to their purpose (e.g. removing threats, playing more mana, etc.).

In [None]:
# Try to load cached EDHREC data; if the cache file doesn't exist, regenerate it
try:
    edhrec_tags = pd.read_csv('data/edhrec_data.csv', index_col=0)
except FileNotFoundError:
    # Only a missing cache file triggers the rebuild below. Any other failure
    # (a corrupt CSV, a keyboard interrupt, ...) is surfaced to the reader
    # instead of being hidden behind a fresh round of API calls.

    # Build EDHREC-style names: lowercase, drop apostrophes, and collapse every
    # other run of non-word characters into a single hyphen
    edhrec_names = cards['name'].str.lower().str.replace('\'', '').str.replace(r'\W+', '-', regex=True)

    # Split the names into 300 card chunks to fit EDHREC API requirements, then query the API
    edhrec_name_chunks = np.array_split(edhrec_names.to_numpy(), range(300, edhrec_names.shape[0], 300))
    edhrec_results = []
    for chunk in edhrec_name_chunks:
        response = requests.post(
            'https://edhrec.com/api/cards/',
            json={'format': 'dict', 'names': list(chunk)},
            # Without a timeout a stalled connection would hang the notebook forever
            timeout=30,
        )
        # Fail loudly on an HTTP error instead of tripping over the response body later
        response.raise_for_status()
        edhrec_results.append(response.json()['cards'])

    # Get the tags for the cards
    edhrec_tags = pd.concat([pd.DataFrame(res).T for res in edhrec_results])['tags']
    # Reindex tags based on card name
    edhrec_tags = pd.merge(edhrec_names, edhrec_tags, left_on='name', right_index=True)['tags']

    # Save to CSV so later runs can skip the API calls
    edhrec_tags.to_csv('data/edhrec_data.csv')

# Add the tags to our card data
cards_tagged = cards.join(edhrec_tags)
# Drop cards with no tags
cards_tagged = cards_tagged[~cards_tagged['tags'].isnull()].reset_index(drop=True)

Finally, we load the "Oracle text" of the cards (the text of the cards under the rules, including errata and updates for uniformity). We convert it into a character-level one-hot encoding, then into a packed sequence of tensors for batching.

In [None]:
# The size of each character tensor, i.e. the number of distinct characters
# expected in the corpus
INPUT_SIZE = 80

# Get the frequency of each character in the corpus
char_freqs = pd.Series(list(''.join(cards_tagged['oracle_text']))).value_counts()
# Ensure the corpus uses exactly INPUT_SIZE distinct characters, so that every
# character gets its own one-hot slot and no slot goes unused
assert char_freqs.shape[0] == INPUT_SIZE, (
    f'expected {INPUT_SIZE} distinct characters, found {char_freqs.shape[0]}'
)

# Mapping from a character into its index (most frequent character first)
char_indices = pd.Series(range(INPUT_SIZE), index=char_freqs.index)

# Row i of the identity matrix is the one-hot vector for character index i.
# Built once here rather than once per card.
one_hot_rows = np.identity(INPUT_SIZE)

# Turn the text of all cards into a packed (jagged) tensor: each card becomes a
# (len(text), INPUT_SIZE) matrix of one-hot rows. enforce_sorted=False lets
# pack_sequence accept the cards in their existing (unsorted-by-length) order.
tensor_text = pack_sequence(
    [torch.tensor(one_hot_rows[char_indices[list(text)]]) for text in cards_tagged['oracle_text']],
    enforce_sorted=False,
)

In [None]:
# Display the EDHREC tags column of the tagged cards as a sanity check
cards_tagged['tags']