# Segmenting and Clustering Neighborhoods in Toronto - Part III

### Importing Libraries

In [2]:
# Setup cell: imports every library used by the notebook.
# NOTE: the two `!conda install` lines below modify the kernel's environment when this cell runs.
from bs4 import BeautifulSoup
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None removes the display limit, so frames are shown with every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.\nAll set.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

   

### Set the data path and start BeautifulSoup

In [3]:
# Source page: the Wikipedia list of Canadian postal codes beginning with "M".
data_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(data_url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page in later cells.
page.raise_for_status()
# Parse the HTML with the standard-library backed parser.
soup = BeautifulSoup(page.text, 'html.parser')

### Filling a list with HTML values, while formatting to string

In [4]:
# Number of leading <tr> elements to keep; the first one is used as the header
# by the following cell.
# NOTE(review): 289 is tied to the page layout at scrape time - confirm it still matches.
N_TABLE_ROWS = 289

# Collect the <tr> elements once instead of re-scanning the whole document on
# every iteration.
table_rows = soup.find_all('tr')

# Each row's text is stripped, its newlines turned into commas, and then split
# on commas, giving one list of cell strings per row.
# Slicing (rather than indexing up to a fixed count) avoids an IndexError when
# the page has fewer rows than expected.
dados = [
    row.text.strip().replace('\n', ',').split(',')
    for row in table_rows[:N_TABLE_ROWS]
]

### Setting first row as header

In [5]:
# Build a frame from the scraped rows, then promote its first row to column labels.
raw_table = pd.DataFrame(dados)
new_header = raw_table.iloc[0]

# Everything after the first row is data; label its columns with that first row.
df_wiki = raw_table[1:]
df_wiki.columns = new_header

### Replacing "Not assigned" Neighbourhood values with Borough values

In [6]:
# Rows whose Neighbourhood is the 'Not assigned' placeholder take their Borough value instead.
is_unassigned = df_wiki['Neighbourhood'] == 'Not assigned'
df_wiki['Neighbourhood'] = np.where(
    is_unassigned,
    df_wiki['Borough'],
    df_wiki['Neighbourhood'],
)

### Concatenating Neighbourhoods that share the same Postcode and Borough, separated by commas, and removing "Not assigned" Boroughs


#### Also sorting values by Postcode

In [7]:
# One row per (Postcode, Borough): float64 columns are summed, any other
# column has its values joined with ', '.
# Cells that are exactly 'Not assigned' become NaN and rows containing a NaN
# are dropped; the result is ordered by Postcode.
df_final = (
    df_wiki
    .groupby(['Postcode', 'Borough'], as_index=False)
    .agg(lambda x: x.sum() if x.dtype == 'float64' else ', '.join(x))
    .replace('Not assigned', np.nan)
    # `axis` is passed by keyword: recent pandas no longer accepts it positionally.
    .dropna(axis=0)
    .sort_values('Postcode')
    # Reset last so the index runs 0..n-1 in the final (sorted) row order.
    .reset_index(drop=True)
)

## Data Frame

In [8]:
# Bare last expression: the notebook renders the cleaned frame as this cell's output.
df_final

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Shape of Data Frame

In [9]:
# (rows, columns) of the cleaned frame.
df_final.shape

(103, 3)

# Adding Latitude and Longitude info from CSV file
### Read the CSV file and sort its values by Postal Code

In [10]:
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
# The rows are then ordered by the 'Postal Code' column; sort_values keeps each
# row's original index label.
lat_long = pd.read_csv('https://cocl.us/Geospatial_data').sort_values('Postal Code')

### Add Latitude and Longitude to main Data Frame

In [11]:
# Look coordinates up by postal code rather than by row position, so every row
# receives the coordinates of its own postcode no matter how either frame is
# ordered or indexed. Postcodes missing from lat_long come out as NaN.
# NOTE(review): assumes 'Postal Code' values in lat_long are unique - confirm.
coords_by_postcode = lat_long.set_index('Postal Code')
df_final['Latitude'] = df_final['Postcode'].map(coords_by_postcode['Latitude'])
df_final['Longitude'] = df_final['Postcode'].map(coords_by_postcode['Longitude'])
df_final

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Shape of Data Frame

In [12]:
# (rows, columns) after the Latitude and Longitude columns were added.
df_final.shape

(103, 5)

In [15]:
# Resolve the city's coordinates; they are used to centre the map.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Plotting

In [47]:
# Base map centred on the geocoded city coordinates.
df_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every marker.
# NOTE(review): parse_html is forwarded to CircleMarker as an extra keyword;
# confirm whether CircleMarker uses it or it can be dropped.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row, with a "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    df_final['Latitude'],
    df_final['Longitude'],
    df_final['Borough'],
    df_final['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(df_map)

df_map