<h1 align=center><font size = 5>Segmenting and Clustering Neighborhoods in Toronto</font></h1> 

<h3 align=left><font size = 5>1. Import libraries</font></h3> 

In [None]:
# Imports and display configuration used throughout the notebook.
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Show every column and every row when a dataframe is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# NOTE: the install below is active, so it runs every time this cell is executed.
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe
# from bs4 import BeautifulSoup
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# NOTE: this install is also active and pins folium to version 0.5.0.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Second import of requests (already imported above); harmless but redundant.
import requests # Requests allows you to send organic, grass-fed HTTP/1.1 requests, without the need for manual labor.

print('Libraries imported.')

In [None]:
!conda install lxml --yes

In [None]:
!conda update -c conda-forge -y pandas

In [None]:
# Install BeautifulSoup into the kernel's environment. The explicit %conda magic
# does not depend on IPython automagic being enabled, and --yes keeps the
# install non-interactive like the other install cells in this notebook.
%conda install -c anaconda beautifulsoup4 --yes

In [None]:
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

<h2 align=left><font size = 5>2. Scraping table from Wikipedia page </font></h2> 
<h3 align=left><font size = 3>Use the BeautifulSoup package or <b>any other way</b> you are comfortable with </font></h3> 
<h3 align=left><font size = 3>I am using pd.read_html </font></h3> 



In [38]:
# Fetch every table on the Wikipedia page whose text matches 'Neighborhood'
# and keep the first match as the working dataframe.
matching_tables = pd.read_html(
    r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    match='Neighborhood',
)
df = pd.DataFrame(matching_tables[0])
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


<h3 align=left><font size = 3> - The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood <br>
- Ignore cells with a borough that is Not assigned. </font></h3> 

In [39]:
# The dataframe consists of three columns, relabelled positionally:
# PostalCode, Borough, and Neighborhood.
column_names = ["PostalCode", "Borough", "Neighborhood"]
df.columns = column_names

In [40]:
# Ignore cells with a borough that is 'Not assigned': keep only the other rows.
df = df.loc[df['Borough'] != 'Not assigned']

In [41]:
# Preview the first five rows of the current dataframe.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


<h3 align=left><font size = 3> Merge cells with the same postal code. <br>
More than one neighborhood can exist in one postal code area; separate them with ","  </font></h3> 

In [42]:
# Merge rows that share a postal code (and borough) into one row.
# The source table lists several neighborhoods in one cell separated by " / ".
# That separator is normalised to ", " *before* grouping, and grouped rows are
# joined with the same ", " delimiter, so every neighborhood list ends up in a
# single, consistent format regardless of how it was split in the source.
# regex=False makes the replacement a literal match on every pandas version.
df = (
    df.assign(
        Neighborhood=df["Neighborhood"].str.replace(" / ", ", ", regex=False)
    )
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg({"Neighborhood": ", ".join})
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h3 align=left><font size = 3> If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.  </font></h3> 

In [43]:
# If a cell has a borough but a 'Not assigned' neighborhood, then the
# neighborhood will be the same as the borough.
# A boolean mask is used so that every row is checked, including the first
# one, and columns are addressed by name rather than by position.
not_assigned = df["Neighborhood"] == "Not assigned"
df.loc[not_assigned, "Neighborhood"] = df.loc[not_assigned, "Borough"]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h3 align=left><font size = 3> Show final result - with 15 random rows</font></h3> 

In [44]:
# Show 15 randomly chosen rows. The fixed random_state makes the selection
# reproducible, so re-running the notebook displays the same rows each time.
df_random = df.sample(n=15, random_state=42)
df_random

Unnamed: 0,PostalCode,Borough,Neighborhood
80,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
20,M2L,North York,"York Mills, Silver Hills"
28,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North"
69,M5W,Downtown Toronto,Stn A PO Boxes
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
35,M4B,East York,"Parkview Hill, Woodbine Gardens"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
4,M1H,Scarborough,Cedarbrae
29,M3J,North York,"Northwood Park, York University"
27,M3C,North York,Don Mills


<h3 align=left><font size = 3> Use the .shape attribute to print the number of rows of your dataframe.  <br>
Total rows: 103</font></h3> 

In [45]:
# .shape is (rows, columns); display only the row count.
n_rows, _n_cols = df.shape
n_rows

103

<h3 align=left><font size = 3> Save Dataframe to CSV file</font></h3> 


In [46]:
# Persist the cleaned table next to the notebook, without the index column.
output_path = 'Toronto.csv'
df.to_csv(output_path, index=False)

In [47]:
# Coordinates are read from a local CSV (downloaded from https://cocl.us/Geospatial_data).
coordinate = pd.read_csv('Geospatial_Coordinates.csv')
# Left-join on the postal code, then discard the join key that the CSV
# contributes under its own column name.
df = pd.merge(
    df,
    coordinate,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
).drop(columns='Postal Code')
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
