In [1]:
import requests
import os
import json
import pandas as pd
import numpy as np
import io

def get_dataset(output_dir:str,output_file:str,url:str, encoding = 'utf8'):
    """Download ``url`` into ``output_dir/output_file`` when the remote checksum changed.

    The remote checksum is fetched from ``url + '.sha256'`` and compared with the
    copy stored next to the dataset (``output_file + '.sha256'``). The dataset is
    downloaded again when the checksums differ, when the local checksum cannot be
    read, or when the dataset file itself is missing.

    Args:
        output_dir: Directory that holds the dataset; created if it does not exist.
        output_file: File name of the dataset inside ``output_dir``.
        url: Address of the dataset; its checksum is expected at ``url + '.sha256'``.
        encoding: Text encoding used to write the dataset file.

    Returns:
        None. Prints ``<output_file> updated`` when a new copy was written.

    Raises:
        requests.HTTPError: If either request answers with an HTTP error status.
    """
    dataset_file = os.path.join(output_dir, output_file)
    checksum_file = dataset_file + '.sha256'

    checksum_response = requests.get(url + '.sha256')
    # Fail loudly instead of storing an HTTP error page as the checksum.
    checksum_response.raise_for_status()
    remote_checksum = checksum_response.text

    try:
        with open(checksum_file, 'rt') as inf:
            requires_update = inf.read() != remote_checksum
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable local checksum: treat the dataset as outdated.
        requires_update = True

    # A matching checksum is worthless if the dataset itself is gone.
    if not os.path.exists(dataset_file):
        requires_update = True

    if not requires_update:
        return

    dataset_response = requests.get(url)
    dataset_response.raise_for_status()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(dataset_file, 'wt', encoding = encoding) as outf:
        outf.write(dataset_response.text)
    # Record the checksum only once the dataset is on disk, so that a failed
    # download is retried on the next run instead of being considered current.
    with open(checksum_file, 'wt') as outf:
        outf.write(remote_checksum)

    print(output_file,'updated')
    return

Get all the mtgjson files needed.

In [2]:
# Local folder and file names of the mtgjson datasets used by this notebook.
dataset_path = "./mtgjson_dataset"
keywords_file = 'Keywords.json'
sets_file = 'SetList.json'
types_file = 'CardTypes.json'
identifiers_file = 'AllIdentifiers.json'

# (local file name, remote address) pairs, fetched in this order.
dataset_sources = [
    (keywords_file, "http://mtgjson.com/api/v5/Keywords.json"),
    (sets_file, "http://mtgjson.com/api/v5/SetList.json"),
    (types_file, "http://mtgjson.com/api/v5/CardTypes.json"),
    (identifiers_file, "http://mtgjson.com/api/v5/AllIdentifiers.json"),
]
for local_name, remote_url in dataset_sources:
    get_dataset(dataset_path, local_name, remote_url)

## Keywords database

In [3]:
# Pull the three keyword lists out of the 'data' column of the keywords file.
keywords_raw = pd.read_json(os.path.join(dataset_path, keywords_file))
abilityWords, keywordAbilities, keywordActions = (
    keywords_raw.at[group, 'data']
    for group in ('abilityWords', 'keywordAbilities', 'keywordActions')
)

## Types database

In [4]:
# Read the card types file and keep the subtypes listed under the 'creature' entry.
types_path = os.path.join(dataset_path, types_file)
types_raw = pd.read_json(types_path)
creatureTypes_raw = types_raw.at['creature', 'data']
creatureTypes = creatureTypes_raw['subTypes']

In [5]:
# Load the set list. The context manager guarantees the file handle is closed
# as soon as the JSON document has been parsed.
with open(os.path.join(dataset_path,sets_file),encoding='UTF8') as sets_fp:
    j = json.load(sets_fp)
# One row per set, built from the 'data' entry of the document.
sets = pd.DataFrame(j['data'])

## Main card database
### Get all Cards by UUID
We only care about gameplay features, so every column that does not directly affect gameplay is dropped.

In [6]:
# Read the 'data' column of the identifiers file. The 'date' and 'version' rows are
# removed because they are not cards (presumably the file's metadata entries).
cards_set_data = pd.read_json(os.path.join(dataset_path,identifiers_file)).data.drop(index=['date','version'])
# Round-trip the card records through JSON so pandas expands them into columns.
data = pd.read_json(io.StringIO(json.dumps(cards_set_data.reset_index(drop=True).to_list())))
del cards_set_data

# Minor sanitation

# Not so obvious filtering: the 'un-sets' are weird and will add unnecessary noise.
# This must happen BEFORE de-duplicating by name: otherwise a regular card whose
# retained printing happens to be an un-set reprint disappears from the dataset.
unsets_codes = ['UGL','UNH','UST','UND','UNF']
data = data.loc[~data.setCode.isin(unsets_codes)]
# Drop duplicates based on name; gameplay-wise, same name is the same card and a duplicate.
data = data.drop_duplicates(['name'])
# Quick and obvious feature selection
# Drop all columns with a high percent of nulls
null_threshold = 90.0
null_percent = data.isnull().sum() * 100 / len(data)
data = data.drop(columns=null_percent[null_percent >= null_threshold].index)
# Drop commercial, duplicate identifiers and artistic data.
artistic_features = ['borderColor', 'finishes', 'foreignData', 'frameVersion','securityStamp','frameEffects','layout']
commercial_features = ['sourceProducts','variations','hasFoil','hasNonFoil','promoTypes','purchaseUrls','isReprint', 'isStarter','boosterTypes','availability']
# I will leave the card name for now.
identifier_features = ['uuid','originalText','originalType','identifiers','language','number','printings','artistIds']
# errors='ignore': the dataset is downloaded live, so any of these columns may
# already have been removed by the null-threshold step above.
data = data.drop(columns=artistic_features + commercial_features + identifier_features, errors='ignore')

### Missing values imputation

In [7]:
# Some useful definitions
# Given a pd.Series with list values, flags which keywords appear in each list.
def ListOneHot(X, y=None, keywords=None, count_threshold=0):
    """One-hot encode list-valued cells against a fixed vocabulary.

    Args:
        X: pd.Series whose cells are lists (or other array-likes) of values.
        y: Unused; kept for interface compatibility.
        keywords: Vocabulary; one output column is produced per keyword, in order.
        count_threshold: Unused; kept for interface compatibility.

    Returns:
        Boolean pd.DataFrame indexed like ``X`` with ``keywords`` as columns; a cell
        is True when that keyword occurs in the corresponding list of ``X``.
    """
    # Convert the vocabulary once instead of once per row.
    keyword_array = np.asarray(keywords).ravel()
    # np.isin replaces np.in1d, which NumPy has deprecated.
    membership = X.apply(lambda x: np.isin(keyword_array, np.asarray(x))).to_list()
    return pd.DataFrame(membership, columns=keywords, index=X.index).astype(bool)
# Given a pd.Series with string values, flags which keywords each string contains.
def StringOneHot(X, y=None, keywords=None, count_threshold=0):
    """One-hot encode string cells by case-insensitive regex search of each keyword.

    Args:
        X: pd.Series of strings; null cells are treated as empty strings.
        y: Unused; kept for interface compatibility.
        keywords: Patterns to look for. Each one is interpreted as a regular
            expression and yields one output column, in order.
        count_threshold: Unused; kept for interface compatibility.

    Returns:
        Boolean pd.DataFrame indexed like ``X`` with ``keywords`` as columns. An
        empty ``keywords`` gives a frame with no columns.
    """
    keywords = list(keywords)
    # Null handling does not depend on the keyword, so do it once.
    text = X.fillna("")
    # Pre-sized matrix: keeps the (rows, keywords) shape even when keywords is empty.
    matches = np.empty((len(X), len(keywords)), dtype=bool)
    for position, keyword in enumerate(keywords):
        matches[:, position] = text.str.contains(keyword,regex=True,case=False).to_numpy()
    return pd.DataFrame(matches, columns=keywords, index=X.index)

# A cell counts as missing when it holds any null entry or is the empty string.
def _is_missing_cell(cell):
    return pd.Series(cell, dtype='object').isnull().any() or (str(cell) == '')

# Placeholder used for each scalar column when the value is absent:
#  - manaCost / convertedManaCost / manaValue: empty just means empty, so use a "none" value.
#  - edhrecRank: a null rank mostly means a terrible commander card, so use the max (worst) rank.
#  - edhrecSaltiness: no saltiness means no votes at all, i.e. 0.0 salt.
#  - flavorText, rulings, text, artist, setCode, type: null is just empty.
#  - rarity: mostly tokens, which have no rarity per se; default to common.
#  - power / toughness: imputed as 0.
scalar_fill_values = {
    'convertedManaCost': 0.0,
    'manaCost': 'None',
    'manaValue': 0.0,
    'edhrecRank': data['edhrecRank'].max(),
    'edhrecSaltiness': 0.0,
    'flavorText': 'None',
    'rarity': 'common',
    'rulings': 'None',
    'text': 'None',
    'artist': 'None',
    'power': 0,
    'toughness': 0,
    'setCode': 'None',
    'type': 'None',
}
for column, fill_value in scalar_fill_values.items():
    data[column] = data[column].fillna(fill_value)

# Null keywords / subtypes / supertypes / types mean there are none of them.
for column in ('keywords', 'subtypes', 'supertypes', 'types'):
    data[column] = data[column].apply(lambda cell: ['None'] if _is_missing_cell(cell) else cell)

# Same for legalities, which hold a mapping instead of a list.
data['legalities'] = data['legalities'].apply(lambda cell: {'None':None} if _is_missing_cell(cell) else cell)

colors = ['W','U','B','R','G']
def list_to_color_string(series):
    """Build a color string from the W, U, B, R and G flags of ``series``.

    The letters whose attribute on ``series`` is truthy are concatenated in
    W, U, B, R, G order; 'C' is returned when none of them is set.
    """
    present = [symbol for symbol in 'WUBRG' if getattr(series, symbol)]
    return ''.join(present) or 'C'

# Collapse the color lists into a compact string such as 'WU' ('C' when no color matches).
data['colors'] = ListOneHot(X=data.colors, keywords=colors).apply(list_to_color_string,axis=1)
data['colorIdentity'] = ListOneHot(X=data.colorIdentity, keywords=colors).apply(list_to_color_string,axis=1)
# Flatten the list-valued type columns into comma-separated strings; an empty result becomes 'None'.
for list_column in ('supertypes', 'subtypes', 'types'):
    data[list_column] = data[list_column].apply(lambda x: ','.join(x))
    data.loc[data[list_column] == '', list_column] = 'None'

# Every column must be fully imputed at this point. Name the offending columns so
# the missing imputation rule is easy to locate.
nan_columns = data.columns[data.isnull().any().to_numpy()].tolist()
if nan_columns:
    raise ValueError("There are NaN values in dataset columns: " + ', '.join(map(str, nan_columns)))


### Replace instances of the card name inside its text with a common token.
Card names inside the rules text add noise to any language-processing application; knowing that a card is referring to itself is sufficient for the analysis.

In [8]:
# Token that stands in for a card's own name inside its text.
self_token = 'this_card'
# Pair each text with its card name directly. This avoids building a Series per row
# (DataFrame.apply with axis=1) and still yields a plain column when the frame is empty.
data['text'] = [
    card_text.replace(str(card_name), self_token)
    for card_text, card_name in zip(data['text'], data['name'])
]

### Save cards with non-empty text for NLP

In [9]:
# Keep only the rows whose text differs from the 'None' placeholder and write them to CSV.
has_text = data['text'] != 'None'
data.loc[has_text].to_csv(os.path.join(dataset_path, 'cards_nlp.csv'))

In [10]:
# Sanity check: show any card whose subtypes string is still empty.
data.loc[data['subtypes'] == '', 'subtypes']

Series([], Name: subtypes, dtype: object)