In [3]:
# Load all packages necessary for analysis
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import timeit
import filecmp
import url_stripper

# mtg_archive

## Gather

In [4]:
mtg_card_rarities_clean = pd.read_csv('mtg_card_rarities_clean.csv')

In [5]:
# Code to create a function to see if url leads to a file that can be downloaded
def is_downloadable(url):
    """
    Does the url contain a downloadable resource
    """
    h = requests.head(url, allow_redirects=True)
    header = h.headers
    content_type = header.get('content-type')
    if 'text' in content_type.lower():
        return False
    if 'html' in content_type.lower():
        return False
    return True

In [6]:
# code to see if url is downloadable
print(is_downloadable('https://www.mtgjson.com/json/AllSets.json'))

True


In [7]:
# code to download data as a response object and make it into a json object
mtg_archive = requests.get('https://www.mtgjson.com/json/AllSets.json')
mtg_archive_clean = mtg_archive.json()

In [8]:
# code to save mtg_archive_clean as a json file
with open('mtg_archive.txt', 'w') as outfile: 
    json.dump(mtg_archive_clean, outfile)

In [9]:
# code to load all cards data
with open('mtg_archive.txt', encoding='utf-8') as file:
    mtg_archive_clean = json.load(file)

## Assess

In [10]:
# code to assess the type of mtg_archive_clean
type(mtg_archive_clean)

dict

In [11]:
# code to get the 40th element from mtg_archive_clean as a list
list(mtg_archive_clean.items())[1]

('2ED',
 {'baseSetSize': 302,
  'block': 'Core Set',
  'boosterV3': ['rare',
   'uncommon',
   'uncommon',
   'uncommon',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common',
   'common'],
  'cards': [{'artist': 'Richard Thomas',
    'borderColor': 'white',
    'colorIdentity': ['U'],
    'colors': ['U'],
    'convertedManaCost': 5.0,
    'edhrecRank': 11979,
    'flavorText': 'These spirits of the air are winsome and wild, and cannot be truly contained. Only marginally intelligent, they often substitute whimsy for strategy, delighting in mischief and mayhem.',
    'foreignData': [],
    'frameVersion': '1993',
    'hasFoil': False,
    'hasNonFoil': True,
    'isPaper': True,
    'isReprint': True,
    'layout': 'normal',
    'legalities': {'brawl': 'Legal',
     'commander': 'Legal',
     'duel': 'Legal',
     'future': 'Legal',
     'historic': 'Legal',
     'legacy': 'Legal',
     'modern': 'Legal',
     '

In [12]:
# code to see how many observations there are in mtg_archive_clean
len(list(mtg_archive_clean))

460

### Assessment

- need tcg player urls
- need a name list

## Clean

### Define
- write a loop that pulls out names and tcg player purchase url

### Code

In [48]:
# loop that pulls out names and tcg player purchase url
tcg_player_card_urls = []
set_names = mtg_archive_clean.keys()
tcg_names = []
card_errors = []
for i in set_names:
    set_cards = mtg_archive_clean[i].get('cards')
    set_length = np.count_nonzero(mtg_archive_clean[i]['cards'])
    for a in np.arange(set_length):
        tcg_player_card_urls.append(set_cards[a].get('purchaseUrls')['tcgplayer'])
        tcg_names.append(set_cards[a]['name'])

TypeError: 'NoneType' object is not subscriptable

In [53]:
mtg_archive_clean['10E']['cards'][0].get('purchaseUrls')

{'cardmarket': 'https://mtgjson.com/links/2b93f05911c57179',
 'mtgstocks': 'https://mtgjson.com/links/34f62da2f8c3ee65',
 'tcgplayer': 'https://mtgjson.com/links/f4690a893091ee2d'}

### Test

In [39]:
len(list(mtg_archive_clean['10E'].items()))

18

In [21]:
np.count_nonzero(tcg_player_card_urls), np.count_nonzero(tcg_player_card_urls)

(0, 0)

In [22]:
tcg_names[0], tcg_player_card_urls[0]

IndexError: list index out of range

### Define
- combine names with urls to make tcg_cards

### Code

In [None]:
# code to make a dataframe of the two tcg columns to make tcg_names dataframe
tcg_names = pd.DataFrame(data={'name': tcg_names})
tcg_names['url'] = tcg_player_card_urls

### Test

In [None]:
tcg_names.head()

### Define
- get all the names and rarities 

### Code

In [None]:
mtg_rarities = pd.DataFrame(name = [], 
                            rarity = [])

for i in mtg_archive_clean.keys():
    names = mtg_archive_clean[i]['name']
    rarities = mtg_archive_clean[i]['rarity']

# standard_sets

## Gather

In [16]:
# pull standard cards from mtgjson
standard_sets = requests.get('https://www.mtgjson.com/json/Standard.json')
standard_sets_clean = standard_sets.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# code to save standard_sets as a json file 
with open('standard_sets_clean.txt', 'w') as outfile:
    json.dump(standard_sets_clean, outfile)

In [None]:
# code to load standard_sets_clean
with open('standard_sets_clean.txt', encoding='utf-8') as file:
    standard_sets_clean = json.load(file)

## Assess

In [None]:
list(standard_cards_clean.items())[3]

In [None]:
standard_sets_clean['ELD']['cards'][1]['rarity']

### Assessment
- need names and rarity of the card in one dataset

## Clean

### Define
- use pandas to tidy the data by making it a dataframe with only two variables: name and rarity

### Code

In [None]:
# code to get rarity in a list by themselves
card_rarities = []
for i in standard_sets_clean.keys():
    current_set = standard_sets_clean[i]['cards']
    for a in np.arange(np.count_nonzero(current_set)):
        rarity = current_set[a]['rarity']
        card_rarities.append(rarity)

In [None]:
# code to get names in a list by themselves
card_names = []
for i in standard_sets_clean.keys():
    current_set = standard_sets_clean[i]['cards']
    for a in np.arange(np.count_nonzero(current_set)):
        name = current_set[a]['name']
        card_names.append(name)

In [None]:
# code to create dataframe of card_rarities and card_names
mtg_card_rarities = pd.DataFrame({'name': card_names, 'rarity': card_rarities})
mtg_card_rarities_clean = mtg_card_rarities.copy()
mtg_card_rarities_clean.to_csv('mtg_card_rarities_clean.csv', index=False)

### Test

In [None]:
card_rarities

In [None]:
card_names

In [None]:
np.count_nonzero(card_rarities), np.count_nonzero(card_names)

In [None]:
mtg_card_rarities_clean

In [None]:
mtg_card_rarities_clean = pd.read_csv('mtg_card_rarities_clean.csv')

### Define
- drop duplicates

### Code

In [None]:
# drop duplicates in mtg_card_rarities_clean
mtg_card_rarities_clean.drop_duplicates(inplace=True)

In [None]:
mtg_card_rarities_clean.to_csv('mtg_card_rarities_clean.csv', index=False)

### Test

In [None]:
mtg_card_rarities_clean.duplicated().sum()

In [None]:
mtg_card_rarities_clean = pd.read_csv('mtg_card_rarities_clean.csv')
print(mtg_card_rarities_clean)

# deck_type_urls

## Gather

In [None]:
# make a webcrawler to gather the data from mtgtop8.com on competitive decks
url = 'https://www.mtgtop8.com/format?f=ST'
url_standard_page_response = requests.get(url)

In [None]:
# save the response
with open('url_standard_page_response.html', mode='wb') as file:
    file.write(url_standard_page_response.content)

In [None]:
# open response as a BeautifulSoup object
with open('url_standard_page_response.html') as file:
    soup = BeautifulSoup(file, 'lxml')

In [None]:
url = {'url': soup.find_all(href=re.compile(r"archetype\?a"))}
deck_type_urls = pd.DataFrame(data=url)

In [None]:
# create column for deck type and clean the column
deck_type_urls.url = deck_type_urls.url.astype('str')
deck_type_urls['type'] = deck_type_urls.url.str.extract('(>.+<)')
deck_type_urls['type'] = deck_type_urls.type.str.replace('>', '')
deck_type_urls['type'] = deck_type_urls.type.str.replace('<', '')
deck_type_urls['type'] = deck_type_urls.type.str.strip()
url_stripper(deck_type_urls['type'], deck_type_urls['url'], '(>.+<)')

In [None]:
# extract url from the url column
deck_type_urls['url'] = deck_type_urls.url.str.extract('(archetype.+f\=ST)')

In [None]:
# add root url to the data
urls = []
types = []

for i in np.arange(deck_type_urls.shape[0]):
    urls.append(str('https://www.mtgtop8.com/') + deck_type_urls.url[i])
    types.append(deck_type_urls.type[i] + str('.html'))

In [None]:
deck_type_urls['url'] = urls
deck_type_urls['type'] = types
deck_type_urls['type'] = deck_type_urls.type.str.replace(' ', '_')
deck_type_urls['type'] = deck_type_urls.type.str.replace('/', '-')

In [None]:
# create web crawler to request html pages from deck_type_urls

# create new path and name it newpath
newpath = str(os.path.abspath(os.curdir)) + '\\type_html_files'

# check to make sure that there is no path that matches newpath and create a folder called html_files if there isn't
if not os.path.exists(newpath):
    os.makedirs(newpath)

# change current directory to newpath
os.chdir(newpath)

# scrape all html files from urls in deck_type_urls and put them in html_files folder
for i in np.arange(deck_type_urls.shape[0]):
    type_html = requests.get(deck_type_urls.url[i])
    with open(deck_type_urls.type[i], 'wb') as file:
        file.write(type_html.content)

# deck_urls

## Gather

In [None]:
# get a list of file names in html_files folder and name it dir_names
dir_names = []
cur_dir = os.path.abspath(os.curdir)

with os.scandir(cur_dir) as folder:
    for file in folder: 
        if file.is_file():
            dir_names.append(file.name)

In [None]:
# get a list of all urls in all files
deck_urls = []
for i in np.arange(np.count_nonzero(dir_names)):
    with open(dir_names[i]) as file:
        soup = BeautifulSoup(file, 'lxml')
    urls = soup.find_all(href=re.compile(r'event\?e\=.+\&d\=.+\&f\=ST'))
    for a in urls:
        deck_urls.append(a)

In [None]:
# get a list of all player names in all files
player_names = []
for i in np.arange(np.count_nonzero(dir_names)):
    with open(dir_names[i]) as file:
        soup = BeautifulSoup(file, 'lxml')
    urls = soup.find_all(href=re.compile(r'search\?player'))
    for a in urls:
        player_names.append(a)

In [None]:
# turn deck urls into a dataframe
deck_urls = {'url': deck_urls}
deck_urls = pd.DataFrame(data=deck_urls)

In [None]:
# get deck names and clean them up
deck_urls.url = deck_urls.url.astype('str')
deck_urls['name'] = deck_urls.url.str.extract('(>.+<)')
deck_urls.name = deck_urls.name.str.replace('>', '')
deck_urls.name = deck_urls.name.str.replace('<', '')
deck_urls.name = deck_urls.name.str.strip()
url_stripper(deck_urls['url'], deck_url['name'], '(>.+<)')

In [None]:
# extract url from the url column
deck_urls['url'] = deck_urls.url.str.extract('(event.+f\=ST)')

In [None]:
# add root url to the data
urls = []
names = []

for i in np.arange(deck_urls.shape[0]):
    urls.append(str('https://www.mtgtop8.com/') + deck_urls.url[i])
    names.append(deck_urls.name[i] + str('.html'))

In [None]:
# clean urls and name
deck_urls['url'] = urls
deck_urls['name'] = names
deck_urls['name'] = deck_urls.name.str.replace(' ', '_')
deck_urls['name'] = deck_urls.name.str.replace('/', '-')

In [None]:
# append player_names to deck_urls
deck_urls['player_name'] = player_names
deck_urls.player_name = deck_urls.player_name.str.extract('(>.+<)')
deck_urls.player_name = deck_urls.player_name.str.replace('>', '')
deck_urls.player_name = deck_urls.player_name.str.replace('<', '')
deck_urls.player_name = deck_urls.player_name.str.strip()
url_stripper(deck_urls['player_name'], deck_urls['player_name'], '(>.+<)')

## decks

In [None]:
# create a web crawler to request html pages from deck_type_urls

# create a new path and name it newpath
oldpath = str(os.path.abspath(os.curdir))
newpath = oldpath.replace('\\type_html_files', '\\deck_html_files')

# check to make sure that there is no path that matches newpath and create a folder called html_files if there isn't
if not os.path.exists(newpath):
    os.makedirs(newpath)

# change the current directory to newpath
os.chdir(newpath)

# scrape all html files from urls in deck_urls and put them in html_files folder
deck_errors = []

for i in np.arange(deck_urls.shape[0]):
    try:
        if filecmp.cmp(deck_urls.name[i]) == True:
            deck_html = requests.get(deck_urls.url[i])
            with open(deck_urls.name[i], 'wb') as file:
                file.write(deck_html.content)
        else:
            deck_html = requests.get(deck_urls.url[i])
            with open(deck_urls.name[i] + str(i))
    except Exception as e:
        print(str(i) + ' ' + str(e))
        deck_errors.append(i)

In [None]:
filecmp.cmp(deck_urls.name[1], deck_urls.name[2])

In [None]:
os.path.abspath(os.curdir)

In [None]:
os.chdir('C:\\Users\\muroc\\Documents\\MTG\\type_html_files')