# Segmenting and Clustering Neighborhoods in Toronto

## Import libraries

In [39]:
import pandas as pd
import numpy as np

## Parse HTML page into pandas Dataframes

In [40]:
dfs = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [41]:
# Select the desired DataFrame
df = dfs[0]

In [42]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


## Drop rows that have a Borough Not assigned

In [43]:
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df = df[df.Borough != "Not assigned"]

In [44]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


## If a Neighbourhood has a Not Assigned value, replace it with the value of Borough

In [45]:
df["Neighbourhood"] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


## Group By Postcode and join Neighbourhood values separated by comma

In [47]:
nb = df.groupby('Postcode')['Neighbourhood'].agg([('Neighbourhood', ', '.join)]).reset_index()
nb = nb.merge(df, on="Postcode")
nb.drop("Neighbourhood_y", axis=1, inplace=True)
nb.columns = ["Postcode", "Neighbourhood", "Borough"]
nb.drop_duplicates(inplace=True)

In [48]:
nb

Unnamed: 0,Postcode,Neighbourhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
2,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
5,M1E,"Guildwood, Morningside, West Hill",Scarborough
8,M1G,Woburn,Scarborough
9,M1H,Cedarbrae,Scarborough
...,...,...,...
195,M9N,Weston,York
196,M9P,Westmount,Etobicoke
197,M9R,"Kingsview Village, Martin Grove Gardens, Richv...",Etobicoke
201,M9V,"Albion Gardens, Beaumond Heights, Humbergate, ...",Etobicoke


In [51]:
nb.head(10)

Unnamed: 0,Postcode,Neighbourhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
2,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
5,M1E,"Guildwood, Morningside, West Hill",Scarborough
8,M1G,Woburn,Scarborough
9,M1H,Cedarbrae,Scarborough
10,M1J,Scarborough Village,Scarborough
11,M1K,"East Birchmount Park, Ionview, Kennedy Park",Scarborough
14,M1L,"Clairlea, Golden Mile, Oakridge",Scarborough
17,M1M,"Cliffcrest, Cliffside, Scarborough Village West",Scarborough
20,M1N,"Birch Cliff, Cliffside West",Scarborough


## Print shape of DataFrame

In [52]:
nb.shape

(103, 3)