In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests # library to handle requests

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  24.42 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  35.71 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  52.40 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  31.97 MB/s
vincent-0.4.4- 100% |###################

In [2]:
# Download and Explore Dataset
# Fetch the Wikipedia list of Canadian postal codes starting with "M" and parse the HTML.
html_data = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  # fail instead of hanging the kernel if the server never answers
)
# Stop on a 4xx/5xx response rather than parsing an error page as if it held the table
html_data.raise_for_status()
soup = BeautifulSoup(html_data.content, 'lxml')

In [3]:
# Add headers to table
import pandas as pd

# Locate the table by its CSS classes in the parsed page
table_soup = soup.find('table', class_='wikitable sortable')
if table_soup is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the find_all() call below
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")

# Column names are the stripped text of every <th> cell in the table
header_list = [header.text.strip() for header in table_soup.find_all('th')]

# Empty dataframe that has one column per header
table_df = pd.DataFrame(columns=header_list)
table_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood


In [4]:
# Add postcode rows to the dataset.
# Every <tr> of the table is visited; the first three <td> cells of a row are
# stored under the first three entries of header_list.
records = []
for tr in table_soup.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')[:3]]
    if len(cells) < 3:
        # Rows without <td> cells (e.g. the header row) and rows too short to
        # provide all three fields are skipped
        continue
    # These names must not be `pd`: that would overwrite the pandas alias and
    # break every later pd.* call in the notebook
    postcode, borough, neighbourhood = cells
    records.append({
        header_list[0]: postcode,
        header_list[1]: borough,
        header_list[2]: neighbourhood,
    })

# Build the frame in one step: DataFrame.append inside the loop copies the
# whole frame on every row and no longer exists in pandas 2.0. Rebuilding from
# `records` also means re-running this cell does not duplicate rows.
table_df = pd.DataFrame(records, columns=header_list)

table_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Clean up data
# 1. Only process the cells that have an assigned borough; rows whose borough
#    is "Not assigned" are ignored.
# "Not assigned" is first turned into NaN, then every row with NaN in the
# "Borough" column is dropped. The result is built as a new frame and assigned
# back: calling replace(..., inplace=True) on a selected column is a chained
# in-place update that is not guaranteed to write through to table_df.
table_df = (
    table_df
    .assign(Borough=table_df['Borough'].replace("Not assigned", np.nan))
    .dropna(subset=["Borough"])
)
table_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [None]:
#2. More than one neighborhood can exist in one postal code area. These rows are combined into one row with the neighborhoods separated by a comma.
def _join_unique(values):
    """Join the distinct items of `values` with commas, in first-appearance order.

    Joining a set() instead would give an order that can change between
    kernel restarts, because string hashing is randomised per process.
    """
    seen = set()
    ordered = []
    for value in values:
        if value not in seen:
            seen.add(value)
            ordered.append(value)
    return ','.join(ordered)


# One row per postcode; every other column is collapsed with _join_unique
table_df = table_df.groupby("Postcode").agg(_join_unique).reset_index()
table_df.head()

In [None]:
#3. A row that has a borough but a "Not assigned" neighbourhood takes the borough name as its neighbourhood.
# Boolean mask of the rows whose neighbourhood is still the placeholder text
unassigned = table_df['Neighbourhood'] == "Not assigned"
table_df.loc[unassigned, 'Neighbourhood'] = table_df.loc[unassigned, 'Borough']
table_df.head()

In [None]:
#4. Use the .shape attribute to show the (rows, columns) dimensions of the dataframe
table_df.shape

In [None]:
# The dataframe now holds postcode, borough and neighbourhood. To use the
# Foursquare location data, latitude and longitude coordinates are needed too,
# so they are loaded here from a CSV file.

import pandas as pd
filename = "https://cocl.us/Geospatial_data"
headers = ["Postcode", "Latitude", "Longitude"]  # not passed to read_csv below

# Read the CSV and rename its 'Postal Code' column to 'Postcode', the column
# name used in table_df
df = pd.read_csv(filename).rename(columns={'Postal Code': 'Postcode'})
df.head()

In [None]:
# Add Latitude and Longitude
# Left join on 'Postcode': every row of table_df is kept, and the columns of df
# are attached where the postcode matches.
tmp = table_df.merge(df, how='left', on='Postcode')
tmp.head()