# Segmenting & Clustering Neighborhoods in Toronto, Canada
***

In [1]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

## 1. Scrape Wikipedia Page
***

In [2]:
# Download the Wikipedia article that lists Toronto ("M") postal codes.
wiki_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
raw_wiki_page = requests.get(wiki_link, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page as if it were the article.
raw_wiki_page.raise_for_status()
page = raw_wiki_page.text

In [3]:
# Parse the downloaded HTML with the lxml parser (BeautifulSoup is imported in the first cell).
soup = BeautifulSoup(page,'lxml')
# Tip: print(soup.prettify()) shows how the tags are nested in the document.

In [4]:
# The postal-code data we want sits in the table whose class attribute is "wikitable sortable".
My_table = soup.find('table',{'class':'wikitable sortable'})
# find() returns None when nothing matches; fail here with a clear message rather than
# with an AttributeError in a later cell.
if My_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the page")

In [5]:
# Collect every <td> cell of the table; the cells hold the values we need.
# find_all is the current name of the method; findAll is a deprecated legacy alias.
content = My_table.find_all('td')

In [6]:
# Whitespace-trimmed text of every table cell, used below to build the dataframe.
body = [cell.text.strip() for cell in content]

In [7]:
# Preview the first ten extracted cell values.
body[:10]

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A']

In [8]:
# `body` is a flat list laid out as repeating (postal code, borough, neighborhood) triples.
# Collapse it into one [postal code, borough, neighborhoods] row per postal code:
#   * rows whose borough is 'Not assigned' are dropped,
#   * neighborhoods that share a postal code are joined with ', ',
#   * a postal code with no named neighborhood uses its borough as the neighborhood.
grouped = {}  # postal code -> (borough, [neighborhoods]); dicts keep first-seen order
# Slicing + zip walks the list one complete triple at a time, so a trailing
# incomplete row is ignored instead of raising IndexError.
for code, borough, hood in zip(body[0::3], body[1::3], body[2::3]):
    if borough == 'Not assigned':
        continue
    # The first assigned row for a postal code decides its borough.
    _, hoods = grouped.setdefault(code, (borough, []))
    if hood != 'Not assigned':  # never let the placeholder leak into the joined string
        hoods.append(hood)

filtered_data = [
    [code, borough, ', '.join(hoods) if hoods else borough]
    for code, (borough, hoods) in grouped.items()
]

# Postal codes that contributed more than one neighborhood (kept under its original name).
repeat = [code for code, (_, hoods) in grouped.items() if len(hoods) > 1]

filtered_data[0:10]

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront, Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights, Lawrence Manor'],
 ['M7A', "Queen's Park", "Queen's Park"],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge, Malvern'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens, Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson, Garden District']]

In [9]:
# Assemble the cleaned rows into a dataframe with one named column per field.
col_names = ['PostalCode','Borough', 'Neighborhood']
neighborhoods = pd.DataFrame(filtered_data, columns=col_names)

neighborhoods.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [10]:
# Report how many postal-code rows the dataframe holds.
print("Number of rows in Neighborhoods dataframe = ", len(neighborhoods.index))

Number of rows in Neighborhoods dataframe =  103


# Part 2 of the Assignment
## 2. Obtain Coordinates for each Neighborhood
***

In [11]:
# Read the CSV file holding the geographical coordinates of each postal code,
# then preview its first three rows.
lat_lon = pd.read_csv("Geospatial_Coordinates.csv")
lat_lon.iloc[:3]

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


In [12]:
# Add placeholder coordinate columns to the dataframe.
# NaN (rather than an empty string) keeps the columns numeric, so the coordinates
# filled in later are stored as float64 instead of generic Python objects.
neighborhoods['Latitude'] = np.nan
neighborhoods['Longitude'] = np.nan
neighborhoods.head(3)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",,


In [13]:
# Index the coordinates by postal code and turn them into a dictionary:
# {postal code: [latitude, longitude]}.
ref = lat_lon.set_index('Postal Code').T.to_dict('list')

# Every postal code must have coordinates; report all offenders at once.
missing = sorted(set(neighborhoods['PostalCode']) - set(ref))
if missing:
    raise KeyError("No coordinates found for postal codes: {}".format(missing))

# Assign whole columns in one step. The previous per-row form
# `neighborhoods.Latitude[i] = ...` is chained assignment: pandas warns about it and,
# with Copy-on-Write enabled, it does not update the dataframe at all.
latitude_by_code = {code: coords[0] for code, coords in ref.items()}
longitude_by_code = {code: coords[1] for code, coords in ref.items()}
neighborhoods['Latitude'] = neighborhoods['PostalCode'].map(latitude_by_code)
neighborhoods['Longitude'] = neighborhoods['PostalCode'].map(longitude_by_code)

neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6543,-79.3606
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7185,-79.4648
4,M7A,Queen's Park,Queen's Park,43.6623,-79.3895
5,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
6,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
7,M3B,North York,Don Mills North,43.7459,-79.3522
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.7064,-79.3099
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6572,-79.3789
