In [2]:
# Load all packages necessary for analysis
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import re
import os
import timeit
import filecmp

# decks_clean

## Gather

In [62]:
curdir = 'C:/Users/muroc/Documents/MTG/html_files'

# Names of every regular file (sub-directories are skipped) directly inside curdir.
# NOTE(review): os.scandir yields entries in arbitrary order and these names are
# consumed by position further down — confirm the order matches deck_urls.csv.
with os.scandir(curdir) as entries:
    dir_names = [entry.name for entry in entries if entry.is_file()]

In [63]:
# Parse each saved page and keep its hidden <input name="c"> element, one per file,
# in the same order as dir_names.
decks = []
for file_name in dir_names:
    # Iterate the names directly: the previous np.count_nonzero(dir_names) bound
    # counted non-empty strings, which only coincidentally equals len(dir_names).
    with open(f'{curdir}/{file_name}') as file:
        soup = BeautifulSoup(file, 'lxml')
    # find() returns None when a page has no matching element; the None is kept
    # so that decks stays the same length as dir_names.
    cards = soup.find('input', attrs={'name': 'c'})
    decks.append(cards)

In [64]:
# Load the deck metadata table from disk.
deck_urls_path = '~/Documents/MTG/data/deck_urls.csv'
deck_urls = pd.read_csv(deck_urls_path)

In [65]:
# Wrap the parsed deck elements in a single-column DataFrame.
decks_dict = dict(deck_list=decks)
decks_clean = pd.DataFrame(data=decks_dict)

In [66]:
# Persist the raw deck lists without writing the row index.
decks_clean.to_csv(
    '~/Documents/MTG/data/deck_list.csv',
    index=False,
)

## Assess

In [82]:
# Preview the first five rows of deck_urls.
# NOTE(review): the saved output already contains a deck_list column, so this cell
# appears to have been executed after the Clean section — re-run in order to confirm.
deck_urls.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals.html.txt,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath.html.txt,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff.html.txt,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun.html.txt,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy.html.txt,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


In [83]:
# Preview the first five rows of decks_clean.
# NOTE(review): the saved output shows url/name/player columns and cleaned names, so
# this cell appears to have been executed after the Clean section — re-run in order.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


### Assessment
- [done] `deck_urls.csv` and `deck_list.csv` describe the same decks and should be combined into one table
- [done] the `.html` and `.txt` extensions in the `name` column are irrelevant
- [done] `deck_list` may have an unsuitable type (it should be `str`)
- each `deck_list` value is one long string rather than a list of cards
- irrelevant characters in the `deck_list` column
- [done] `deck_urls` needs a copy to work with
- the base URL is repeated in the `url` column (e.g. `https://www.mtgtop8.com/https://www.mtgtop8.co...`), probably because I ran the function more than once

## Clean

#### Define
- add the deck lists from deck_list.csv to the deck_urls.csv table as a new `deck_list` column

#### Code

In [68]:
# Attach the parsed deck lists to deck_urls as a new `deck_list` column.
# The column is selected explicitly instead of assigning the whole DataFrame, which
# only works while decks_clean has exactly one column. Rows are matched on the index,
# so a length mismatch would otherwise silently produce NaN or dropped decks.
if len(decks_clean) != len(deck_urls):
    raise ValueError(
        f'deck count mismatch: {len(decks_clean)} deck lists '
        f'vs {len(deck_urls)} rows in deck_urls'
    )
deck_urls['deck_list'] = decks_clean['deck_list']

In [70]:
# Save the combined table; the row index is written as the first CSV column.
decks_clean_path = '~/Documents/MTG/data/decks_clean.csv'
deck_urls.to_csv(decks_clean_path)

#### Test

In [69]:
# Check that the deck_list column is now present in deck_urls.
deck_urls.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals.html.txt,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath.html.txt,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff.html.txt,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun.html.txt,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy.html.txt,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


#### Define
- create a copy of `deck_urls`

#### Code

In [71]:
# Work on an independent deep copy so that deck_urls keeps the uncleaned data.
decks_clean = deck_urls.copy(deep=True)

#### Test

In [72]:
# Check that the copy has the same leading rows as deck_urls.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals.html.txt,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath.html.txt,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff.html.txt,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun.html.txt,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy.html.txt,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


#### Define
- change type of `deck_list` to str

#### Code

In [73]:
# Convert every deck_list entry to a plain Python string.
deck_list_as_text = decks_clean['deck_list'].astype('str')
decks_clean['deck_list'] = deck_list_as_text

In [74]:
# Checkpoint the table after the type conversion (row index included).
decks_clean_path = '~/Documents/MTG/data/decks_clean.csv'
decks_clean.to_csv(decks_clean_path)

#### Test

In [75]:
# The entry at row label 0 should now be a str.
type(decks_clean.loc[0, 'deck_list'])

str

#### Define
- drop the `.html.txt` suffix from the `name` column

#### Code

In [78]:
# Remove the '.html.txt' file suffix from the deck names.
# regex=False makes the pattern a literal on every pandas version; when treated as a
# regular expression (the default before pandas 2.0) each '.' matches any character.
decks_clean['name'] = decks_clean['name'].str.replace('.html.txt', '', regex=False)

In [81]:
# Checkpoint the table after cleaning the name column (row index included).
decks_clean_path = '~/Documents/MTG/data/decks_clean.csv'
decks_clean.to_csv(decks_clean_path)

#### Test

In [79]:
# Check that the file suffix is gone from the name column.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals,Robert_Lippmann,"<input name=""c"" type=""hidden"" value=""13 Forest..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath,Clarence_Dews,"<input name=""c"" type=""hidden"" value=""10 Forest..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff,Yohan_Dudognon,"<input name=""c"" type=""hidden"" value=""2 Essence..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun,Aaron_Barich,"<input name=""c"" type=""hidden"" value=""1 Planar ..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy,rushi_00,"<input name=""c"" type=""hidden"" value=""6 Forest|..."


#### Define
- get rid of all unnecessary characters in `deck_list`

#### Code

In [84]:
# Remove the opening part of the serialized <input> tag, up to the value attribute.
input_tag_prefix = '<input name="c" type="hidden" value='
decks_clean['deck_list'] = decks_clean['deck_list'].str.replace(input_tag_prefix, '')

In [87]:
# Remove the quotes around the attribute value together with the tag's closing '/>'.
# BeautifulSoup serializes the element as `... value="..."/>`, so stripping '"' alone
# removes only the leading quote and leaves `"/>` attached to the last card.
decks_clean['deck_list'] = (
    decks_clean['deck_list']
    .str.replace(r'"\s*/?>\s*$', '', regex=True)  # closing quote + tag end
    .str.strip('"')                               # remaining surrounding quotes
)

#### Test

In [88]:
# Check that deck_list no longer starts with the <input ...> markup.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals,Robert_Lippmann,13 Forest||4 Llanowar Elves||3 Greenbelt Rampa...
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath,Clarence_Dews,10 Forest||7 Island||4 Llanowar Elves||4 Breed...
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff,Yohan_Dudognon,2 Essence Scatter||4 Glacial Fortress||7 Islan...
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun,Aaron_Barich,1 Planar Cleansing||2 Forest||2 Island||1 Plai...
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy,rushi_00,6 Forest||4 Swamp||4 Blood Crypt||4 Cauldron F...


#### Define
- split each `deck_list` string on `||` into a list of card entries

#### Code

In [93]:
# Turn each '||'-delimited deck string into a list of card entries.
card_separator = r'\|\|'
decks_clean['deck_list'] = decks_clean['deck_list'].str.split(card_separator)

#### Test

In [94]:
# Check that deck_list now holds lists of card entries.
decks_clean.head(n=5)

Unnamed: 0,url,name,player,deck_list
0,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Elementals,Robert_Lippmann,"[13 Forest, 4 Llanowar Elves, 3 Greenbelt Ramp..."
1,https://www.mtgtop8.com/https://www.mtgtop8.co...,Omnath,Clarence_Dews,"[10 Forest, 7 Island, 4 Llanowar Elves, 4 Bree..."
2,https://www.mtgtop8.com/https://www.mtgtop8.co...,4colour_Good_Stuff,Yohan_Dudognon,"[2 Essence Scatter, 4 Glacial Fortress, 7 Isla..."
3,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Sun,Aaron_Barich,"[1 Planar Cleansing, 2 Forest, 2 Island, 1 Pla..."
4,https://www.mtgtop8.com/https://www.mtgtop8.co...,4c_Energy,rushi_00,"[6 Forest, 4 Swamp, 4 Blood Crypt, 4 Cauldron ..."
