# Segmenting and Clustering Neighborhoods in Toronto

#### Import all necessary libraries for the data analysis

In [48]:
#improt libraries
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup as bs

#### Import the web link

In [49]:
#get the web link
link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source=requests.get(link).text
web=bs(source,'lxml')

#### Screp the table from the page and load

In [50]:
#extrect the table from the webpage
table1=web.find('tbody')
data_table=[]
for data in table1.find_all('tr'):
    data_table.append(data)

#### Load the data to a data frame

In [51]:
#loading to dataframe
data_list=[]
for data in data_table:
    v=data.text.split('\n')
    v.remove('')
    v.remove('')
    data_list.append(v)
toro_data=pd.DataFrame(data_list)
toro_data.rename(columns=toro_data.iloc[0,:],inplace=True)
toro_data.drop(toro_data[toro_data['Borough'] == 'Borough'].index, inplace = True)
toro_data.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


#### Clean the Data
 Ignore cells with a borough that is Not assigned.
 If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [52]:
#Drop invalied data
toro_data.drop(toro_data[toro_data['Borough'] == 'Not assigned'].index, inplace = True)
for index,data in enumerate(toro_data['Borough']):
    if toro_data.iloc[index,2]=='Not assigned':
        toro_data.iloc[index,2].replace('Not assigned',data)
toro_data.reset_index(inplace=True)
toro_data.drop('index',axis=1,inplace=True)
toro_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [53]:
#Define the list to String
def ListToString(list):
    string=""
    for word in list:
        string=string+word+','
        
    re=''
    for i in range(len(string)):
        if i!=(len(string)-1):
            re=re+string[i]
    return re


#### Group all neighborhood can exist in one postal code area.

In [54]:
#Clusting the neighbourhood
postcode=toro_data['Postcode'].unique()
toro_post=pd.DataFrame({'Postcode':[],'Borough':[],'Neighbourhood':[]})



for pc in postcode:
    Bor=[]
    Nei=[]
    for index,data in enumerate(toro_data['Postcode']):
        if pc==data:
            Bor=toro_data.iloc[index,1]
            Nei.append(toro_data.iloc[index,2])
                    
    Nei_string=ListToString(Nei)
    df=pd.DataFrame({'Postcode':[pc],'Borough':Bor,'Neighbourhood':[Nei_string]})
    toro_post=pd.concat([toro_post,df],ignore_index=True)


toro_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [55]:
toro_post.shape

(103, 3)

In [56]:
#loading the coodinates csv file
geo_df=pd.read_csv("https://cocl.us/Geospatial_data")
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [57]:
toro_post['Latitude']=np.nan
toro_post['Longitude']=np.nan
for index,data in enumerate(toro_post['Postcode']):
    for i,ps in enumerate(geo_df['Postal Code']):
        if data==ps:
            toro_post['Latitude'][index]=geo_df.iloc[i,1]
            toro_post['Longitude'][index]=geo_df.iloc[i,2]
            
    
toro_post

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toro_post['Latitude'][index]=geo_df.iloc[i,1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toro_post['Longitude'][index]=geo_df.iloc[i,2]


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout...",43.636258,-79.498509


#### Import the libraries for map

In [59]:
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

#### Get the Toronto geo data

In [60]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


#### Plot the map

In [67]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toro_post['Latitude'], toro_post['Longitude'], toro_post['Borough'], toro_post['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='Red',
        fill=True,
        fill_color='#158',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork