In [20]:
# Load all packages necessary for analysis
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import timeit

# mtg_archive

## Gather

In [3]:
mtg_card_rarities_clean = pd.read_csv('mtg_card_rarities_clean.csv')

In [5]:
# Code to create a function to see if url leads to a file that can be downloaded
def is_downloadable(url):
    """
    Does the url contain a downloadable resource
    """
    h = requests.head(url, allow_redirects=True)
    header = h.headers
    content_type = header.get('content-type')
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True

In [14]:
# code to see if url is downloadable
print(is_downloadable('https://www.mtgjson.com/json/AllSets.json'))

True


In [15]:
# code to download data as a response object and make it into a json object
mtg_archive = requests.get('https://www.mtgjson.com/json/AllSets.json')
mtg_archive_clean = mtg_archive.json()

In [16]:
# code to save mtg_archive_clean as a json file
with open('mtg_archive.txt', 'w') as outfile: 
    json.dump(mtg_archive_clean, outfile)

In [2]:
# code to load all cards data
with open('mtg_archive.txt', encoding='utf-8') as file:
    mtg_archive_clean = json.load(file)

## Assess

In [10]:
# code to assess the type of mtg_archive_clean
type(mtg_archive_clean)

dict

In [18]:
# code to get the 40th element from mtg_archive_clean as a list
list(mtg_archive_clean.items())[1]

('2ED',
 {'baseSetSize': 302,
  'block': 'Core Set',
  'boosterV3': ['rare',
   'uncommon',
   'uncommon',
   'uncommon',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common'],
  'cards': [{'artist': 'Richard Thomas',
    'borderColor': 'white',
    'colorIdentity': ['U'],
    'colors': ['U'],
    'convertedManaCost': 5.0,
    'edhrecRank': 11987,
    'flavorText': 'These spirits of the air are winsome and wild, and cannot be truly contained. Only marginally intelligent, they often substitute whimsy for strategy, delighting in mischief and mayhem.',
    'foreignData': [],
    'frameVersion': '1993',
    'hasFoil': False,
    'hasNonFoil': True,
    'isPaper': True,
    'isReprint': True,
    'layout': 'normal',
    'legalities': {'brawl': 'Legal',
     'commander': 'Legal',
     'duel': 'Legal',
     'future': 'Legal',
     'historic': 'Legal',
     'legacy': 'Legal',
     'modern': 'Legal',
     '

In [3]:
# code to see how many observations there are in mtg_archive_clean
len(list(mtg_archive_clean))

460

### Assessment

- need tcg player urls
- need a name list

## Clean

### Define
- write a loop that pulls out names and tcg player purchase url

### Code

In [34]:
# loop that pulls out names and tcg player purchase url
tcg_player_card_urls = []
card_names = mtg_archive_clean.keys()
tcg_names = []
card_errors = []
for i in card_names:
    try: 
        tcg_player_card_urls.append(mtg_archive_clean[i]['purchaseUrls']['tcgplayer'])
        tcg_names.append(mtg_archive_clean[i]['name'])
    except Exception as e:
        print(str(i) + ' ' + str(e))
        card_errors.append(i)

1996 World Champion 'purchaseUrls'
A Display of My Dark Power 'purchaseUrls'
A Reckoning Approaches 'purchaseUrls'
Abbot of Keral Keep 'purchaseUrls'
Abhorrent Overlord 'purchaseUrls'
Absolute Law 'purchaseUrls'
Abyssal Persecutor 'purchaseUrls'
Abyssal Specter 'purchaseUrls'
Abzan Ascendancy 'purchaseUrls'
Abzan Beastmaster 'purchaseUrls'
Academy Rector 'purchaseUrls'
Academy at Tolaria West 'purchaseUrls'
Acid Rain 'purchaseUrls'
Adanto, the First Fort 'purchaseUrls'
Adarkar Sentinel 'purchaseUrls'
Addle 'purchaseUrls'
Adun Oakenshield 'purchaseUrls'
Aeolipile 'purchaseUrls'
Aeronaut Tinkerer 'purchaseUrls'
Aether Hub 'purchaseUrls'
Aether Mutation 'purchaseUrls'
Aetherflux Reservoir 'purchaseUrls'
Aethersnatch 'purchaseUrls'
Aethersquall Ancient 'purchaseUrls'
Aetherstorm Roc 'purchaseUrls'
Aetherworks Marvel 'purchaseUrls'
Aftershock 'purchaseUrls'
Aggressive Instinct 'purchaseUrls'
Agyrem 'purchaseUrls'
Ainok Tracker 'purchaseUrls'
Aisling Leprechaun 'purchaseUrls'
Ajani Vengeant 

### Test

In [19]:
np.count_nonzero(tcg_player_card_urls), np.count_nonzero(tcg_player_card_urls)

(18122, 18122)

In [21]:
tcg_names[0], tcg_player_card_urls[0]

('"Ach! Hans, Run!"', 'https://mtgjson.com/links/85b366724beadefd')

### Define
- combine names with urls to make tcg_cards

### Code

In [36]:
# code to make a dataframe of the two tcg columns to make tcg_names dataframe
tcg_names = pd.DataFrame(data={'name': tcg_names})
tcg_names['url'] = tcg_player_card_urls

### Test

In [37]:
tcg_names.head()

Unnamed: 0,name,url
0,"""Ach! Hans, Run!""",https://mtgjson.com/links/85b366724beadefd
1,"""Rumors of My Death . . .""",https://mtgjson.com/links/f4da472f5769fc77
2,AWOL,https://mtgjson.com/links/e2cbf59017afe314
3,Abandon Hope,https://mtgjson.com/links/43a70ba6b21894cd
4,Abandon Reason,https://mtgjson.com/links/2550e0426610acfd


### Define
- get all the names and rarities 

### Code

In [None]:
mtg_rarities = pd.DataFrame(name = [], 
                            rarity = [])

for i in mtg_archive_clean.keys():
    names = mtg_archive_clean[i]['name']
    rarities = mtg_archive_clean[i]['rarity']

# standard_sets

## Gather

In [8]:
# pull standard cards from mtgjson
standard_sets = requests.get('https://www.mtgjson.com/json/Standard.json')
standard_sets_clean = standard_sets.json()

In [9]:
# code to save standard_sets as a json file 
with open('standard_sets_clean.txt', 'w') as outfile:
    json.dump(standard_sets_clean, outfile)

In [2]:
# code to load standard_sets_clean
with open('standard_sets_clean.txt', encoding='utf-8') as file:
    standard_sets_clean = json.load(file)

## Assess

In [6]:
list(standard_cards_clean.items())[3]

('RNA',
 {'baseSetSize': 259,
  'block': 'Guilds of Ravnica',
  'boosterV3': [['rare', 'mythic rare'],
   'uncommon',
   'uncommon',
   'uncommon',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'land',
   'marketing'],
  'cards': [{'artist': 'Izzy',
    'borderColor': 'black',
    'colorIdentity': ['U', 'W'],
    'colors': ['U', 'W'],
    'convertedManaCost': 3.0,
    'edhrecRank': 3718,
    'flavorText': '"In your misguided attempt to subvert the law, you have eloquently explained why the law must exist."',
    'foreignData': [{'flavorText': '„Dein fehlgeleiteter Versuch, die Gesetze zu unterlaufen, beweist, dass diese Gesetze dringend notwendig sind."',
      'language': 'German',
      'multiverseId': 457554,
      'name': 'Absorbieren',
      'text': 'Neutralisiere einen Zauberspruch deiner Wahl. Du erhältst 3 Lebenspunkte dazu.',
      'type': 'Spontanzauber'},
     {'flavorText': '"En tu intent

In [8]:
standard_sets_clean['ELD']['cards'][1]['rarity']

'rare'

### Assessment
- need names and rarity of the card in one dataset

## Clean

### Define
- use pandas to tidy the data by making it a dataframe with only two variables: name and rarity

### Code

In [13]:
# code to get rarity in a list by themselves
card_rarities = []
for i in standard_sets_clean.keys():
    current_set = standard_sets_clean[i]['cards']
    for a in np.arange(np.count_nonzero(current_set)):
        rarity = current_set[a]['rarity']
        card_rarities.append(rarity)

In [16]:
# code to get names in a list by themselves
card_names = []
for i in standard_sets_clean.keys():
    current_set = standard_sets_clean[i]['cards']
    for a in np.arange(np.count_nonzero(current_set)):
        name = current_set[a]['name']
        card_names.append(name)

In [23]:
# code to create dataframe of card_rarities and card_names
mtg_card_rarities = pd.DataFrame({'name': card_names, 'rarity': card_rarities})
mtg_card_rarities_clean = mtg_card_rarities.copy()
mtg_card_rarities_clean.to_csv('mtg_card_rarities_clean.csv', index=False)

### Test

In [12]:
card_rarities

['rare',
 'rare',
 'mythic',
 'uncommon',
 'uncommon',
 'uncommon',
 'uncommon',
 'uncommon',
 'common',
 'uncommon',
 'uncommon',
 'common',
 'common',
 'common',
 'common',
 'common',
 'rare',
 'rare',
 'common',
 'rare',
 'common',
 'common',
 'common',
 'uncommon',
 'uncommon',
 'uncommon',
 'uncommon',
 'uncommon',
 'common',
 'rare',
 'rare',
 'common',
 'common',
 'uncommon',
 'rare',
 'rare',
 'rare',
 'rare',
 'uncommon',
 'mythic',
 'mythic',
 'mythic',
 'mythic',
 'common',
 'uncommon',
 'rare',
 'rare',
 'rare',
 'rare',
 'rare',
 'rare',
 'rare',
 'rare',
 'rare',
 'rare',
 'uncommon',
 'uncommon',
 'common',
 'rare',
 'rare',
 'rare',
 'mythic',
 'rare',
 'rare',
 'uncommon',
 'uncommon',
 'common',
 'common',
 'uncommon',
 'common',
 'common',
 'common',
 'common',
 'common',
 'common',
 'rare',
 'rare',
 'uncommon',
 'uncommon',
 'common',
 'rare',
 'rare',
 'uncommon',
 'common',
 'uncommon',
 'uncommon',
 'mythic',
 'mythic',
 'common',
 'uncommon',
 'uncommon',
 'unc

In [17]:
card_names

['Acclaimed Contender',
 'Acclaimed Contender',
 'Alela, Artful Provocateur',
 'All That Glitters',
 'Bring to Life',
 'Animating Faerie',
 'Bring to Life',
 'Animating Faerie',
 'Arcane Signet',
 "Arcanist's Owl",
 'Archon of Absolution',
 'Ardenvale Paladin',
 'Dizzying Swoop',
 'Ardenvale Tactician',
 'Dizzying Swoop',
 'Ardenvale Tactician',
 'Ayara, First of Locthwain',
 'Ayara, First of Locthwain',
 'Bake into a Pie',
 'Banish into Fable',
 'Barge In',
 'Barrow Witches',
 'Bartered Cow',
 'Fertile Footsteps',
 'Beanstalk Giant',
 'Fertile Footsteps',
 'Beanstalk Giant',
 'Belle of the Brawl',
 'Beloved Princess',
 'Blacklance Paragon',
 'Blacklance Paragon',
 'Bloodhaze Wolverine',
 'Blow Your House Down',
 'Bog Naughty',
 'Stomp',
 'Bonecrusher Giant',
 'Stomp',
 'Bonecrusher Giant',
 'Bramblefort Fink',
 'Petty Theft',
 'Brazen Borrower',
 'Petty Theft',
 'Brazen Borrower',
 'Brimstone Trebuchet',
 'Burning-Yard Trainer',
 'Castle Ardenvale',
 'Castle Ardenvale',
 'Castle Ember

In [18]:
np.count_nonzero(card_rarities), np.count_nonzero(card_names)

(1679, 1679)

In [22]:
mtg_card_rarities_clean

Unnamed: 0,name,rarity
0,Acclaimed Contender,rare
1,Acclaimed Contender,rare
2,"Alela, Artful Provocateur",mythic
3,All That Glitters,uncommon
4,Bring to Life,uncommon
5,Animating Faerie,uncommon
6,Bring to Life,uncommon
7,Animating Faerie,uncommon
8,Arcane Signet,common
9,Arcanist's Owl,uncommon


In [2]:
mtg_card_rarities_clean = pd.read_csv('mtg_card_rarities_clean.csv')

### Define
- drop duplicates

### Code

In [3]:
# drop duplicates in mtg_card_rarities_clean
mtg_card_rarities_clean.drop_duplicates(inplace=True)

In [6]:
mtg_card_rarities_clean.to_csv('mtg_card_rarities_clean.csv', index=False)

### Test

In [5]:
mtg_card_rarities_clean.duplicated().sum()

0

In [7]:
mtg_card_rarities_clean = pd.read_csv('mtg_card_rarities_clean.csv')
print(mtg_card_rarities_clean)

                                name    rarity
0                Acclaimed Contender      rare
1          Alela, Artful Provocateur    mythic
2                  All That Glitters  uncommon
3                      Bring to Life  uncommon
4                   Animating Faerie  uncommon
5                      Arcane Signet    common
6                     Arcanist's Owl  uncommon
7               Archon of Absolution  uncommon
8                  Ardenvale Paladin    common
9                     Dizzying Swoop    common
10               Ardenvale Tactician    common
11         Ayara, First of Locthwain      rare
12                   Bake into a Pie    common
13                 Banish into Fable      rare
14                          Barge In    common
15                    Barrow Witches    common
16                      Bartered Cow    common
17                 Fertile Footsteps  uncommon
18                   Beanstalk Giant  uncommon
19                Belle of the Brawl  uncommon
20           

# deck_type_urls

## Gather

In [9]:
# make a webcrawler to gather the data from mtgtop8.com on competitive decks
url = 'https://www.mtgtop8.com/format?f=ST'
url_standard_page_response = requests.get(url)

In [10]:
# save the response
with open('url_standard_page_response.html', mode='wb') as file:
    file.write(url_standard_page_response.content)

In [2]:
# open response as a BeautifulSoup object
with open('url_standard_page_response.html') as file:
    soup = BeautifulSoup(file, 'lxml')

In [3]:
url = {'url': soup.find_all(href=re.compile(r"archetype\?a"))}
deck_type_urls = pd.DataFrame(data=url)

In [4]:
# create column for deck type and clean the column
deck_type_urls.url = deck_type_urls.url.astype('str')
deck_type_urls['type'] = deck_type_urls.url.str.extract('(>.+<)')
deck_type_urls['type'] = deck_type_urls.type.str.replace('>', '')
deck_type_urls['type'] = deck_type_urls.type.str.replace('<', '')
deck_type_urls['type'] = deck_type_urls.type.str.strip()

In [5]:
# extract url from the url column
deck_type_urls['url'] = deck_type_urls.url.str.extract('(archetype.+f\=ST)')

In [6]:
# add root url to the data
urls = []
types = []

for i in np.arange(deck_type_urls.shape[0]):
    urls.append(str('https://www.mtgtop8.com/') + deck_type_urls.url[i])
    types.append(deck_type_urls.type[i] + str('.html'))

In [7]:
deck_type_urls['url'] = urls
deck_type_urls['type'] = types
deck_type_urls['type'] = deck_type_urls.type.str.replace(' ', '_')
deck_type_urls['type'] = deck_type_urls.type.str.replace('/', '-')

In [8]:
# create web crawler to request html pages from deck_type_urls

# create new path and name it newpath
newpath = str(os.path.abspath(os.curdir)) + '\\type_html_files'

# check to make sure that there is no path that matches newpath and create a folder called html_files if there isn't
if not os.path.exists(newpath):
    os.makedirs(newpath)

# change current directory to newpath
os.chdir(newpath)

# scrape all html files from urls in deck_type_urls and put them in html_files folder
for i in np.arange(deck_type_urls.shape[0]):
    type_html = requests.get(deck_type_urls.url[i])
    with open(deck_type_urls.type[i], 'wb') as file:
        file.write(type_html.content)

# deck_urls

## Gather

In [9]:
# get a list of file names in html_files folder and name it dir_names
dir_names = []
cur_dir = os.path.abspath(os.curdir)

with os.scandir(cur_dir) as folder:
    for file in folder: 
        if file.is_file():
            dir_names.append(file.name)

In [10]:
# get a list of all urls in all files
deck_urls = []
for i in np.arange(np.count_nonzero(dir_names)):
    with open(dir_names[i]) as file:
        soup = BeautifulSoup(file, 'lxml')
    urls = soup.find_all(href=re.compile(r'event\?e\=.+\&d\=.+\&f\=ST'))
    for a in urls:
        deck_urls.append(a)

In [11]:
# turn deck urls into a dataframe
deck_urls = {'url': deck_urls}
deck_urls = pd.DataFrame(data=deck_urls)

In [12]:
# get deck names and clean them up
deck_urls.url = deck_urls.url.astype('str')
deck_urls['name'] = deck_urls.url.str.extract('(>.+<)')
deck_urls.name = deck_urls.name.str.replace('>', '')
deck_urls.name = deck_urls.name.str.replace('<', '')
deck_urls.name = deck_urls.name.str.strip()

In [13]:
# extract url from the url column
deck_urls['url'] = deck_urls.url.str.extract('(event.+f\=ST)')

In [14]:
# add root url to the data
urls = []
names = []

for i in np.arange(deck_urls.shape[0]):
    urls.append(str('https://www.mtgtop8.com/') + deck_urls.url[i])
    names.append(deck_urls.name[i] + str('.html'))

In [15]:
# clean urls and name
deck_urls['url'] = urls
deck_urls['name'] = names
deck_urls['name'] = deck_urls.name.str.replace(' ', '_')
deck_urls['name'] = deck_urls.name.str.replace('/', '-')

## decks

In [19]:
# create a web crawler to request html pages from deck_type_urls

# create a new path and name it newpath
oldpath = str(os.path.abspath(os.curdir))
newpath = oldpath.replace('\\type_html_files', '\\deck_html_files')

# check to make sure that there is no path that matches newpath and create a folder called html_files if there isn't
if not os.path.exists(newpath):
    os.makedirs(newpath)

# change the current directory to newpath
os.chdir(newpath)

# scrape all html files from urls in deck_urls and put them in html_files folder
deck_errors = []

for i in np.arange(deck_urls.shape[0]):
    try:
        deck_html = requests.get(deck_urls.url[i])
        with open(deck_urls.name[i], 'wb') as file:
            file.write(deck_html.content)
    except Exception as e:
        print(str(i) + ' ' + str(e))
        deck_errors.append(i)

134 [Errno 22] Invalid argument: 'Bant_Golos.html'
281 [Errno 22] Invalid argument: 'Grixis_"spacebar"_Midrange.html'
