# Capstone Week 3 part 1: Wikipedia Web Scraper

Let's start by scraping our columns from Wikipedia's table:

In [2]:
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse it. The `with` block closes the HTTP response once parsing is done.
webpage = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
try:
    with urllib.request.urlopen(webpage) as doc:
        #initializing the Soup object
        soup = BeautifulSoup(doc, 'html.parser')
except OSError as err:
    # URLError, HTTPError and socket timeouts are all OSError subclasses, so
    # this catches network failures only; programming errors are no longer
    # hidden behind a "cannot connect" message. Raising (instead of quit())
    # stops the run at this cell without tearing down the whole session.
    raise RuntimeError('Unable to connect to page, please try again') from err

# Only the leading <td> cells are read. The cut-off assumes the postal-code
# table is the first table on the page and holds 540 cells, 3 per row
# (postal code, borough, neighborhood) — TODO confirm against the live page,
# since the Wikipedia layout can change.
COLUMNS_PER_ROW = 3
TABLE_CELL_COUNT = 540
cells = [td.get_text().strip() for td in soup.find_all('td')[:TABLE_CELL_COUNT]]

# Fail here, with a clear message, rather than later with mismatched column
# lengths when the table is assembled.
if len(cells) % COLUMNS_PER_ROW != 0:
    raise ValueError(
        'Expected a multiple of %d table cells, found %d'
        % (COLUMNS_PER_ROW, len(cells))
    )

#loads items into their columns
# Cells arrive row by row, so every third cell belongs to the same column.
PostalCode = cells[0::COLUMNS_PER_ROW]
Borough = cells[1::COLUMNS_PER_ROW]
Neighborhood = cells[2::COLUMNS_PER_ROW]



Now that the columns are built, let's go ahead and build the DataFrame.

In [4]:
# Assemble the three scraped lists into one table, one column per list.
# Dict insertion order fixes the column order.
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Now we clean it up. It is important to note that some of the cleaning tasks have already been
done for us by Wikipedia, because the assignment appears to be out of date. This means that the
final shape may not match what the course wants.

In [5]:
#removes incomplete data
# Keep only rows whose borough is assigned and renumber them from 0. A new
# frame is built (no inplace=True), so `df` is left untouched and this cell
# gives the same result every time it is re-run.
df_assigned = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)

#add commas
# Swap every '/' for ',' with a vectorised, literal (non-regex) replacement.
# This is a one-for-one character swap, so the spaces around the separator
# are kept. Writing through `.values` is avoided on purpose: it only works
# when that array is a writable view, which pandas Copy-on-Write does not
# provide.
df_clean = df_assigned.assign(
    Neighborhood=df_assigned['Neighborhood'].str.replace('/', ',', regex=False)
)
df_clean.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District , Ryerson"


In [110]:
# Dimensions of the cleaned table as a (rows, columns) tuple.
df_clean.shape

(103, 3)

# End of part 1, start of part 2



In [11]:
import pandas as pd

# Load the per-postal-code coordinates and rename the key column to
# 'PostalCode', the name the join on this column expects.
geodata = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Postal Code': 'PostalCode'}
)
geodata.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now we are ready to join the tables


In [17]:
# Index both tables by postal code, attach the coordinates to each
# neighborhood row (DataFrame.join is a left join by default, so every row of
# df_clean is kept), then turn the postal code back into an ordinary column.
complete_set = (
    df_clean
    .set_index('PostalCode')
    .join(geodata.set_index('PostalCode'))
    .reset_index()
)
complete_set.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


# End of part 2, start of part 3