## Segmenting and Clustering Neighborhoods in Toronto

#### Assignment Notebook

In [3]:
import numpy as np 
import pandas as pd 

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
import lxml.html as lh # to parse the relevant fields
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

In [19]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 18.7MB/s 
[?25hCollecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [5]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. requests applies no timeout by default, so set one
# explicitly to keep the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an
# error page as if it were the postal-code table.
page.raise_for_status()

# Parse the raw HTML bytes into an lxml element tree
doc = lh.fromstring(page.content)

# Collect every table row (<tr>..</tr>) found anywhere in the document
tr_elements = doc.xpath('//tr')

In [6]:
# Read the header row: the first <tr> holds the column titles

tr_elements = doc.xpath('//tr')

# One (title, values) pair per column; the value lists start out empty
col = []

# Number the header cells from 1, echo each title, and register its column
for i, t in enumerate(tr_elements[0], start=1):
    name = t.text_content()
    print(i, name)
    col.append((name, []))

1 Postcode
2 Borough
3 Neighbourhood



In [7]:
# The first <tr> is the header row, so the data starts at the second row.
# Every data row must have exactly one cell per header column.
n_columns = len(col)

for j in range(1, len(tr_elements)):
    # T is the j'th row
    T = tr_elements[j]

    # A row with a different number of cells is not from our table: stop here
    if len(T) != n_columns:
        break

    # i is the column index of cell t within the row
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # The first column is always kept as text; in the remaining columns,
        # values that are purely numeric are converted to integers
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the original text
                pass
        # Append the value to the list of the i'th column
        col[i][1].append(data)

In [10]:
# Build the dataframe: each (title, values) pair in col becomes one column
Dict = dict(col)
df = pd.DataFrame(data=Dict)

In [11]:
# Clean up: drop the trailing newline from the last header, then strip
# newline characters from every cell value
df = (
    df.rename(columns={'Neighbourhood\n': 'Neighbourhood'})
      .replace({'\n': ''}, regex=True)
)

In [12]:
# Show the column labels to check the header names after the cleanup
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [13]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]

In [14]:
# Merge postcodes that have more than one neighbourhood into a single row:
# take the max() borough per postcode and comma-join the neighbourhood names,
# then turn the Postcode group keys back into an ordinary column
df = (
    df.groupby('Postcode')
      .agg({
          'Borough': lambda boroughs: boroughs.max(),
          'Neighbourhood': lambda names: ', '.join(names),
      })
      .reset_index()
)

In [15]:
# Preview the first rows of the table (one row per postcode after grouping)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
# Show the rows that have a borough but whose neighbourhood still reads 'Not assigned'
is_unassigned = df['Neighbourhood'].str.contains("Not assigned")
df[is_unassigned]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [17]:
# Where a postcode has a borough but no neighbourhood, reuse the borough name
# as the neighbourhood. Rows are selected by value rather than by a hard-coded
# row label, so the fix still targets the right rows if the source table
# gains or loses entries.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

In [18]:
# Number of rows in the cleaned table
len(df)

103

In [None]:
# Obtain the latitude and the longitude coordinates of each neighborhood
# using the Geocoder Python package.
#
# NOTE: this attempt is disabled (kept commented out for reference only)
# because the geocoder lookup did not return any coordinates.

#import geocoder # import geocoder

#t = df.shape[0]

#for i in range(0,t):
    # initialize your variable to None
    #lat_lng_coords = None
    #postalcode=df.loc[i,'Postcode']
    #print (postalcode)
    # loop until you get the coordinates
    
    #while(lat_lng_coords is None):
        #g = geocoder.google('{},Toronto, Ontario'.format(postalcode))
        #lat_lng_coords = g.latlng
    
        #df.loc[i,'Latitude'] = lat_lng_coords[0]
        #df.loc[i,'Longitude'] = lat_lng_coords[1]
# DOESN'T RETURN ANYTHING

In [23]:
# Geocoder doesn't appear to return anything, so fall back to downloading the coordinates CSV instead

!wget -O geospatial_coord.csv http://cocl.us/Geospatial_data

--2018-10-25 07:48:59--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2018-10-25 07:48:59--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-10-25 07:49:00--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.27.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2018-10-25 07:49:00--  https://ibm.ent.box.com/shared/

In [31]:
# Load the downloaded coordinates file and preview its first rows
coord_csv = 'geospatial_coord.csv'
df_coord = pd.read_csv(coord_csv)
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [39]:
# Give the postal-code column the name 'Postcode'
df_coord = df_coord.rename(columns={"Postal Code": "Postcode"})

In [41]:
# Join the coordinates onto the neighbourhood table by Postcode
# (inner join: only postcodes present in both tables are kept)
df_final = df.merge(df_coord, how='inner', on='Postcode')

In [42]:
# Preview the first rows of the merged table
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
