In [1]:
# Load all packages necessary for analysis
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import timeit
import filecmp

In [2]:
# Load the cleaned deck table from the same CSV path this notebook writes to
# in its Clean section.
# NOTE(review): `decks_clean` is reassigned in the Gather section before this
# frame is used — confirm whether this load is still needed.
decks_clean = pd.read_csv(filepath_or_buffer='~/Documents/MTG/data/decks_clean.csv')

# decks_clean

## Gather

In [3]:
# Folder holding the saved deck pages. It is resolved relative to the user's
# home directory, matching the '~/Documents/MTG/...' paths used for the CSV
# files, instead of a hard-coded per-user absolute path.
curdir = os.path.expanduser('~/Documents/MTG/html_files')

# Collect the names of regular files only (sub-directories are skipped).
# NOTE(review): os.scandir yields entries in arbitrary order, and the deck
# lists are later attached to deck_urls by row position — confirm the two
# orders actually agree.
with os.scandir(curdir) as folder:
    dir_names = [entry.name for entry in folder if entry.is_file()]

In [4]:
# Parse every saved deck page and keep its <input name="c"> element, whose
# `value` attribute carries the card list.
decks = []
# Iterate over the file names directly; the previous index loop derived its
# length from np.count_nonzero, which counts truthy strings rather than items.
for file_name in dir_names:
    with open(os.path.join(curdir, file_name)) as file:
        soup = BeautifulSoup(file, 'lxml')
    # find() returns None when the page has no such element; the None is kept
    # so `decks` stays aligned one-to-one with `dir_names`.
    cards = soup.find('input', attrs={'name': 'c'})
    decks.append(cards)

In [5]:
# Load the url / name / player table for the decks.
deck_urls = pd.read_csv(filepath_or_buffer='~/Documents/MTG/data/deck_urls.csv')

In [6]:
# Wrap the parsed elements in a one-column frame named `deck_list`.
decks_dict = dict(deck_list=decks)
decks_clean = pd.DataFrame(data=decks_dict)

In [7]:
# Persist the raw deck lists (row index omitted).
decks_clean.to_csv(path_or_buf='~/Documents/MTG/data/deck_list.csv', index=False)

## Assess

In [8]:
# Preview the first five rows of the url table.
deck_urls.head(n=5)

Unnamed: 0,url,name,player
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals.html.txt,Robert_Lippmann
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath.html.txt,Clarence_Dews
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff.html.txt,Yohan_Dudognon
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun.html.txt,Aaron_Barich
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy.html.txt,rushi_00


In [9]:
# Preview the first five gathered deck lists.
decks_clean.head(n=5)

Unnamed: 0,deck_list
0,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


In [38]:
# Preview the last five rows.
# NOTE(review): this cell carries execution count 38, so it was run out of
# order; its saved output shows the merged, cleaned table rather than the
# one-column frame built in the Gather section. Re-run after Restart & Run All.
decks_clean.tail(n=5)

Unnamed: 0,url,name,player,deck_list
914,https://www.mtgtop8.com/event?e=22351&amp;d=35...,Weenie_White,Conanhawk,['nan']
915,https://www.mtgtop8.com/event?e=22391&amp;d=35...,Weenie_White,Björn_Krusche,['nan']
916,https://www.mtgtop8.com/event?e=22353&amp;d=35...,WW,Drew_Iafrate,['nan']
917,https://www.mtgtop8.com/event?e=22304&amp;d=35...,Weenie_White,Yellowhat,['nan']
918,https://www.mtgtop8.com/event?e=22304&amp;d=35...,Weenie_White,Cornanhawk,['nan']


In [11]:
# Show rows whose deck list is missing. At this stage the column still holds
# the parsed elements (or None), so a comparison with the literal text '[nan]'
# can never match; isna() tests for missing values directly.
decks_clean[decks_clean['deck_list'].isna()]

Unnamed: 0,deck_list


In [12]:
# Summarize row count, non-null counts and dtypes of the gathered deck lists.
decks_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 874 entries, 0 to 873
Data columns (total 1 columns):
deck_list    874 non-null object
dtypes: object(1)
memory usage: 6.9+ KB


### Assessment
- [done] deck_urls.csv and deck_list.csv describe the same decks and belong in a single table
- [done] .html and .txt tags are irrelevant in name column
- [done] `deck_list` may be type that is irrelevant
- [done] `deck_list` column is one huge object
- [done] irrelevant characters in `deck_list` column
- [done] deck_urls needs a copy to work with
- [done] repeated `https://www.mtgtop8.com/` prefix in the `url` column, because I must have run the function too often
- missing data at the end of the dataset
- [done] deck_list column is not the right data type
- duplicate observations

## Clean

#### Define
- append deck_urls.csv and deck_list.csv together

#### Code

In [13]:
# Attach the gathered deck lists as a new column. The column is selected
# explicitly, so the assignment no longer depends on `decks_clean` having
# exactly one column.
# NOTE(review): rows are matched by index position only; deck_urls rows with
# no matching position are left as NaN — confirm both tables share one order.
deck_urls['deck_list'] = decks_clean['deck_list']

In [14]:
# Persist the merged table (row index omitted).
deck_urls.to_csv(path_or_buf='~/Documents/MTG/data/decks_clean.csv', index=False)

#### Test

In [15]:
# Check that the new `deck_list` column is present.
deck_urls.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals.html.txt,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath.html.txt,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff.html.txt,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun.html.txt,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy.html.txt,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


#### Define
- create a copy of `deck_urls`

#### Code

In [16]:
# Work on an independent (deep) copy so `deck_urls` stays untouched by cleaning.
decks_clean = deck_urls.copy(deep=True)

#### Test

In [17]:
# Check that the copy has the same leading rows as `deck_urls`.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals.html.txt,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath.html.txt,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff.html.txt,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun.html.txt,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy.html.txt,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


#### Define
- change type of `deck_list` to str

#### Code

In [18]:
# Convert every deck_list entry to text so the .str accessors can be applied.
# Missing entries become the literal text 'nan' under this conversion.
decks_clean['deck_list'] = decks_clean['deck_list'].astype(str)

In [19]:
# Checkpoint: persist the current state of the cleaned table (row index omitted).
decks_clean.to_csv(path_or_buf='~/Documents/MTG/data/decks_clean.csv', index=False)

#### Test

In [20]:
# Check the Python type of the entry with index label 0.
type(decks_clean.loc[0, 'deck_list'])

str

#### Define
- drop the `.html.txt` suffix from the `name` column

#### Code

In [21]:
# Strip the '.html.txt' file suffix from deck names. The dots are escaped and
# the pattern is anchored with '$', so only a literal trailing suffix is
# removed (an unescaped '.' matches any character under regex matching).
# Bracket access is used because a column called `name` is easy to confuse
# with the `.name` attribute that pandas objects such as Series carry.
decks_clean['name'] = decks_clean['name'].str.replace(r'\.html\.txt$', '', regex=True)

In [22]:
# Checkpoint: persist the current state of the cleaned table (row index omitted).
decks_clean.to_csv(path_or_buf='~/Documents/MTG/data/decks_clean.csv', index=False)

#### Test

In [23]:
# Check that the deck names no longer end in '.html.txt'.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


#### Define
- get rid of all unnecessary characters in `deck_list`

#### Code

In [24]:
# Remove the serialized element's opening text so each entry starts at the
# quoted card list.
decks_clean['deck_list'] = decks_clean['deck_list'].str.replace(
    pat='<input name="c" type="hidden" value=', repl=''
)

In [25]:
# Remove what is left of the serialized element around the card list.
# Beautiful Soup writes an empty element as `<input ... value="..."/>`, so the
# text ends in `"/>`; stripping quotes alone would leave that on the last card.
# The closing `"/>` (or `">`) is removed first, then any remaining quotes.
decks_clean['deck_list'] = (
    decks_clean['deck_list']
    .str.replace(r'"\s*/?>\s*$', '', regex=True)
    .str.strip('"')
)

In [26]:
# Checkpoint: persist the current state of the cleaned table (row index omitted).
decks_clean.to_csv(path_or_buf='~/Documents/MTG/data/decks_clean.csv', index=False)

#### Test

In [27]:
# Check that deck_list now starts directly with the first card entry.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals,Robert_Lippmann,13 Forest||4 Llanowar Elves||3 Greenbelt Rampa...
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath,Clarence_Dews,10 Forest||7 Island||4 Llanowar Elves||4 Breed...
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff,Yohan_Dudognon,2 Essence Scatter||4 Glacial Fortress||7 Islan...
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun,Aaron_Barich,1 Planar Cleansing||2 Forest||2 Island||1 Plai...
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy,rushi_00,6 Forest||4 Swamp||4 Blood Crypt||4 Cauldron F...


#### Define
- split deck_list into multiple objects

#### Code

In [28]:
# Break each deck string into a list of "<count> <card name>" entries; the
# pipes are escaped because '|' is a regex metacharacter.
decks_clean['deck_list'] = decks_clean['deck_list'].str.split(pat=r'\|\|')

In [29]:
# Checkpoint: persist the current state of the cleaned table (row index omitted).
decks_clean.to_csv(path_or_buf='~/Documents/MTG/data/decks_clean.csv', index=False)

#### Test

In [30]:
# Check that deck_list entries are now lists of cards.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals,Robert_Lippmann,"[13 Forest, 4 Llanowar Elves, 3 Greenbelt Ramp..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath,Clarence_Dews,"[10 Forest, 7 Island, 4 Llanowar Elves, 4 Bree..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff,Yohan_Dudognon,"[2 Essence Scatter, 4 Glacial Fortress, 7 Isla..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun,Aaron_Barich,"[1 Planar Cleansing, 2 Forest, 2 Island, 1 Pla..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy,rushi_00,"[6 Forest, 4 Swamp, 4 Blood Crypt, 4 Cauldron ..."


#### Define
- remove the duplicated `https://www.mtgtop8.com/` prefix from the `url` column

#### Code

In [31]:
# Remove every copy of the site prefix (it was prepended more than once), then
# add it back exactly once. regex=False makes the match literal, so the dots
# in the host name cannot match arbitrary characters.
# NOTE(review): the resulting URLs still contain HTML-escaped '&amp;'
# separators — confirm whether they should be unescaped to '&'.
decks_clean['url'] = decks_clean['url'].str.replace('https://www.mtgtop8.com/', '', regex=False)
decks_clean['url'] = 'https://www.mtgtop8.com/' + decks_clean['url']

In [32]:
# Checkpoint: persist the current state of the cleaned table (row index omitted).
decks_clean.to_csv(path_or_buf='~/Documents/MTG/data/decks_clean.csv', index=False)

#### Test

In [33]:
# Check that the url with index label 0 carries the site prefix exactly once.
decks_clean.loc[0, 'url']

'https://www.mtgtop8.com/event?e=23476&amp;d=362227&amp;f=ST'

#### Define
- make `deck_list` column a str

#### Code

In [34]:
# Turn each list of cards into its text representation, which makes the
# entries hashable for counting.
decks_clean['deck_list'] = decks_clean['deck_list'].astype(str)

In [35]:
# Checkpoint: persist the current state of the cleaned table (row index omitted).
decks_clean.to_csv(path_or_buf='~/Documents/MTG/data/decks_clean.csv', index=False)

#### Test

In [36]:
# Check the Python type of the entry with index label 0.
type(decks_clean.loc[0, 'deck_list'])

str

#### Define
- figure out how many `['nan']` placeholder entries are in `deck_list`

#### Code

In [37]:
# Count how often each distinct deck list occurs; the "['nan']" entry gives
# the number of decks without a card list.
decks_clean['deck_list'].value_counts()

['nan']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     45
['2 Shock', '10 Forest', '9 Mountain', '4 Bonecrusher Giant', '2 Embercleave', '2 Redcap Melee', '4 Once Upon a Time', '4 Questing Beast', '3 Lovestruck Beast', '2 Kraul