# Applied Data Science
## Capstone Project Notebook: Places that contain "York"

In [176]:
# @hidden_cell
import pandas as pd
# Lift pandas' display limits: None means no cap on the number of columns or
# rows rendered when a DataFrame is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None


import numpy as np
import json, requests
from bs4 import BeautifulSoup
import requests
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3
import folium


def __iter__(self):
    """Stand-in ``__iter__`` hook that always returns 0.

    It is bound onto the object-storage stream (see the ``hasattr`` check
    where the CSV is loaded) so that the stream exposes an ``__iter__``
    attribute; it does not perform any real iteration.
    """
    return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.

In [177]:
# NOTE(review): `client_7be38dd215fa47eabb3601eafc86c7be` is not created in any
# visible cell (presumably it came from the removed credentials cell) --
# confirm it exists before running this notebook on a fresh kernel.
body = client_7be38dd215fa47eabb3601eafc86c7be.get_object(
    Bucket='datascience-donotdelete-pr-az7itulpgktwpq',
    Key='Geospatial_Coordinates.csv',
)['Body']

# The stream may lack an __iter__ attribute; bind the module-level shim onto it
# so pandas accepts the body as a file-like object.
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

# Read the postal-code -> latitude/longitude lookup and preview the first rows.
df_geocodes = pd.read_csv(body)
df_geocodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [178]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests has no default timeout; without one a stalled connection would hang
# this cell indefinitely.
table_url = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of handing an error page to the parser.
table_url.raise_for_status()
html_doc = table_url.text

### Get location data from Wikipedia

In [179]:
# Parse the downloaded page and locate the postal-code table.
soup = BeautifulSoup(html_doc, 'html.parser')
locations_table = soup.find("table", attrs={"class": "wikitable sortable"})
if locations_table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError('No table with class "wikitable sortable" found in the downloaded page')

# Prefer the <tbody> element when the markup has one; otherwise search the
# table element itself so a missing <tbody> does not crash the cell.
table_body = locations_table.tbody
if table_body is None:
    table_body = locations_table
locations_table_data = table_body.find_all("tr")
if not locations_table_data:
    raise ValueError("The postal-code table contains no rows")

# Column headings come from the <th> cells of the first row.
# strip() with no argument removes newlines AND surrounding spaces, so values
# compare cleanly later (e.g. against 'Not assigned' or when merging on
# 'Postal Code'); strip('\n') would leave stray spaces in place.
headings = [th.text.strip() for th in locations_table_data[0].find_all("th")]

# Every remaining row becomes a list holding the text of its <td> cells.
locations_data = locations_table_data[1:]
location_rows = [
    [td.text.strip() for td in table_row.find_all("td")]
    for table_row in locations_data
]

### Obtain Data from Scraped Table

In [180]:
# Assemble the scraped rows into a DataFrame whose columns are the table
# headings, then preview the first three rows.
df = pd.DataFrame(data=location_rows, columns=headings)
df.head(n=3)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods


### Drop borough if Not assigned
#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [181]:
# Keep only the rows whose borough is assigned. The original index labels are
# retained, and .copy() makes the result an independent frame so that later
# column assignments do not act on a view of the unfiltered data.
df = df.loc[df["Borough"] != 'Not assigned'].copy()
df.head(n=3)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### If a row has a Borough but no Neighbourhood, use the Borough as the Neighbourhood

In [182]:
# Where the neighbourhood is 'Not assigned', fall back to the row's borough
# name; every other neighbourhood value is left unchanged.
# (The former bare `df[df.Neighbourhood == 'Not assigned']` expression was
# removed: its result was never displayed or stored.)
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned',
    df['Borough'],
)
df.head(3)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Find duplicate Postal Codes (to replace/add Neighbourhoods if any exist)

In [183]:
# Select the rows whose postal code has already appeared earlier in the frame
# (the first occurrence is not flagged) and order them by postal code.
duplicatePostalCode = (
    df.loc[df['Postal Code'].duplicated()]
    .sort_values(by=['Postal Code'])
)
duplicatePostalCode.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood


### Linking the Postal Codes to Latitude/Longitude

In [208]:
# Attach latitude/longitude to every location by joining on the postal code.
# This is an inner join (the pandas default), so a postal code missing from
# either frame is left out of the result.
df_with_geo_codes = df.merge(df_geocodes, how='inner', on='Postal Code')

In [209]:
# Preview the first ten merged rows.
df_with_geo_codes.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Displaying Boroughs that have the word "York" in them

In [206]:
# Keep only the rows whose borough name contains "York". na=False treats a
# missing borough as a non-match instead of producing an invalid boolean mask.
df_with_york = df_with_geo_codes[
    df_with_geo_codes['Borough'].str.contains("York", na=False)
]

# Initial map centre. These coordinates equal the first merged row shown
# earlier (postal code M3A, Parkwoods).
latitude = 43.753259
longitude = -79.329656
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one circle per matching row; clicking it shows the neighbourhood name.
for lat, lng, neighbourhood in zip(
    df_with_york['Latitude'],
    df_with_york['Longitude'],
    df_with_york['Neighbourhood'],
):
    # parse_html=True belongs on the Popup (it escapes the label text).
    # CircleMarker has no such parameter, so the stray parse_html=False that
    # used to be passed to it was removed.
    folium.CircleMarker(
        [lat, lng],
        radius=20,
        popup=folium.Popup(neighbourhood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)
map_toronto

In [205]:
# Report the (rows, columns) dimensions of the subset filtered on "York" boroughs.
df_with_york.shape

(34, 5)