# __Neighbourhood segmentation and clustering in Toronto__   

# 1. Get a Dataframe from a table
## _1.1. Creating a Dataframe from Toronto Post Codes table in Wikipedia_
__We use the BeautifulSoup library__

First, [install and] import dependencies:

In [1]:
#!pip install beautifulsoup4
#!pip install lxml
#!pip show beautifulsoup4

In [2]:
from bs4 import BeautifulSoup
import requests
import lxml
#import html5lib
import pandas as pd
# Display floats with 8 decimal places so coordinates are not rounded on screen.
# The fully qualified key is required: on pandas >= 1.4 the bare pattern
# 'precision' also matches 'styler.format.precision' and raises OptionError.
pd.set_option('display.precision', 8)

__The site we want to parse is [here](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)__  

In [3]:
# Wikipedia page listing the Canadian postal codes in the "M" group
wikipedia_link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
# Download the page into a 'requests' response object.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the postcode table.
raw_page = requests.get(wikipedia_link, timeout=30)
raw_page.raise_for_status()

In [5]:
# Build the BeautifulSoup tree from the response text, parsed with 'lxml'
soup = BeautifulSoup(markup=raw_page.text, features='lxml')

In [6]:
# Take the first <table> element found in the parsed page
toronto = soup.find('table')
# To inspect the raw markup, uncomment: print(toronto.prettify())

The structure of the table is as follows:

< tr >    
< td >  
  Postcode  
 < /td >  
 < td >  
  Borough  
 < /td >  
 < td >  
  Neighbourhood  
 < /td >  
< /tr >  

Now we can create separate lists for each column and populate them in a loop

In [7]:
# One list per output column, filled row by row from the HTML table
postcode = []
borough = []
neighbourhood = []

for row in toronto.find_all('tr'):
    # Look the cells up once per row instead of three times
    cells = row.find_all('td')
    if len(cells) < 3:
        # Rows without three <td> cells (e.g. the first <tr>, the header) carry
        # no data. Checking *before* appending keeps the three lists the same
        # length, and unrelated errors are no longer hidden by a bare `except`.
        continue
    postcode.append(cells[0].text)
    borough.append(cells[1].text)
    neighbourhood.append(cells[2].text)

In [8]:
# Sanity check: print the length of each column list so they can be compared
for label, column in (('postcode: ', postcode),
                      ('borough: ', borough),
                      ('neighbourhood: ', neighbourhood)):
    print(label, len(column))

postcode:  288
borough:  288
neighbourhood:  288


Now we can create and preview Dataframe:

In [9]:
# Assemble the three scraped lists into one DataFrame
df = pd.DataFrame({'Postcode': postcode, 'Borough': borough, 'Neighbourhood': neighbourhood})
# Remove newline characters from the neighbourhood text.
# The result is assigned back rather than using replace(..., inplace=True) on
# df['Neighbourhood']: an in-place call on a column selection acts on an
# intermediate object and does not update df under pandas Copy-on-Write.
df['Neighbourhood'] = df['Neighbourhood'].replace(regex='\\n', value='')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## _1.2. Cleaning the table_

__a) Ignore cells with a borough that is Not assigned:__

In [10]:
# Keep only the rows whose borough is assigned, then renumber the index from 0
df = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


__b) Some postcodes have several neighbourhoods:__

In [11]:
# Number of distinct postcodes, then number of distinct neighbourhoods
print(df['Postcode'].nunique(dropna=False))
print(df['Neighbourhood'].nunique(dropna=False))

103
209


Let's combine them:

In [12]:
# Collapse to one row per postcode: keep the first borough of each group and
# join all of its neighbourhood names into one comma-separated string
df = (
    df.groupby('Postcode')
      .agg({
          'Borough': lambda boroughs: boroughs.iloc[0],
          'Neighbourhood': lambda names: ', '.join(str(name) for name in names),
      })
      .reset_index()
)

In [13]:
# Re-count the distinct values of both columns after grouping, then preview
for counted_column in ('Postcode', 'Neighbourhood'):
    print(df[counted_column].nunique(dropna=False))
df.head()

103
103


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


__c) Let's find 'Not assigned' neighbourhoods and assign them the borough name:__

In [14]:
# Where the neighbourhood is 'Not assigned', use the borough name of the same row.
# Series.mask aligns `other` on the index, so each row receives its own borough,
# and assigning the result back avoids an in-place call on a column selection
# (which does not update df under pandas Copy-on-Write).
df['Neighbourhood'] = df['Neighbourhood'].mask(df['Neighbourhood'] == 'Not assigned', df['Borough'])

In [15]:
# Dimensions of the cleaned frame as (rows, columns)
df.shape

(103, 3)

# 2. Getting coordinates from address (geocoding)

In [16]:
!pip install geocoder
#!pip install geopy
#from geopy.geocoders import Nominatim # import geocoder
from map_api import mapquest #import credentials from config file



In [17]:
import geocoder
import numpy as np

__Add and initialize two new columns__

In [18]:
# Create both coordinate columns, pre-filled with the placeholder string 'NA'
for coord_column in ('Latitude', 'Longitude'):
    df[coord_column] = 'NA'

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",,
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",,
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,,
4,M1H,Scarborough,Cedarbrae,,


__Extracting locations via Mapquest API__

In [19]:
# Instantiate the credentials object imported from map_api; its `key` attribute
# is what gets passed to the geocoder calls
mq = mapquest()
#mq.secret

In [20]:
# Geocode every postcode through MapQuest and store the coordinates in df.
for row in df.index:
    # A distinct name is used here so the loop does not overwrite the
    # `postcode` list that the DataFrame was built from.
    postal_code = df.loc[row, 'Postcode']
    location = geocoder.mapquest('{}, Toronto, Ontario'.format(postal_code), key=mq.key, maxRows=5)

    try:
        latitude = np.mean(np.asarray(location.lat))
        longitude = np.mean(np.asarray(location.lng))
    except (TypeError, ValueError):
        # No usable numeric coordinates for this postcode: keep the
        # placeholder values and carry on with the next row (best effort).
        continue

    # Single .loc[row, column] assignments write into df itself; the chained
    # form df.loc[row][column] = ... may silently write to a temporary copy.
    # Both values are set only after both were computed, so a row is never
    # left half-filled.
    df.loc[row, 'Latitude'] = latitude
    df.loc[row, 'Longitude'] = longitude

In [21]:
# Spot-check one postcode ('M1H') to see the latitude the geocoder returns for it
location = geocoder.mapquest(
    '{}, Toronto, Ontario'.format('M1H'),
    key=mq.key,
    maxRows=5,
)
location.lat

43.651893

In [22]:
# Preview the geocoded rows; in the recorded output several different postcodes
# share exactly the same point (43.651893, -79.381713)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.81302,-79.2432
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.79388,-79.12455
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76812,-79.19745
3,M1G,Scarborough,Woburn,43.651893,-79.381713
4,M1H,Scarborough,Cedarbrae,43.651893,-79.381713
5,M1J,Scarborough,Scarborough Village,43.651893,-79.381713
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7435,-79.26414
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.651893,-79.381713
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.71709,-79.24936
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.651893,-79.381713


__Getting coordinates from .csv file__  
Since the MapQuest coordinates do not seem precise here (several different postcodes were given exactly the same point), let's use a ready-made file of coordinates instead.

In [23]:
# Load the ready-made coordinates table from the CSV file next to the notebook
# (the preview shows the columns 'Postal Code', 'Latitude' and 'Longitude')
geo = pd.read_csv('Geospatial_Coordinates.csv')
geo.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.8066863,-79.1943534
1,M1C,43.7845351,-79.1604971
2,M1E,43.7635726,-79.1887115
3,M1G,43.7709921,-79.2169174
4,M1H,43.773136,-79.2394761


In [24]:
# Take only the descriptive columns of df (leaving out the API-filled
# Latitude/Longitude) and attach the CSV coordinates by matching
# df's 'Postcode' against geo's 'Postal Code'
df1 = (
    df.loc[:, ['Postcode', 'Borough', 'Neighbourhood']]
      .join(geo.set_index('Postal Code'), on='Postcode')
)
df1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8066863,-79.1943534
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
5,M1J,Scarborough,Scarborough Village,43.7447342,-79.2394761
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279292,-79.2620294
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111117,-79.2845772
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.2394761
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.2648481
