# Segmenting and Clustering Neighborhoods in Toronto

In [143]:
import pandas as pd

In [144]:
# Raise pandas' display limits (rows, columns, column width) for notebook output.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 100)

## Read the HTML table from the Wikipedia page using pandas

In [145]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of DataFrames, one per matching <table>; the attrs
# filter restricts the match to tables carrying the "wikitable" CSS class.
results = pd.read_html(
    io=url,
    attrs={'class': 'wikitable'},
)

## Get the first dataframe from the resulting list and explore the first few rows

In [146]:
# The first matched table holds the Postcode / Borough / Neighborhood listing.
tor_boroughs = results[0]
print('Shape all records', tor_boroughs.shape)
# Preview the first five rows.
tor_boroughs.head(n=5)

Shape all records (287, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Drop rows whose borough is "Not assigned"

In [147]:
# Keep only the rows whose borough is something other than 'Not assigned',
# then renumber the index from zero so it is contiguous again.
tor_boroughs_drop_na = (
    tor_boroughs
    .loc[tor_boroughs['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
print('Shape after dropping na', tor_boroughs_drop_na.shape)
tor_boroughs_drop_na.head(n=10)

Shape after dropping na (210, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Not assigned
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


## Combine neighborhoods where more than one neighborhood exists in one postal code area

In [148]:
# Distinct postal codes, in order of first appearance in the cleaned table.
postcodes = tor_boroughs_drop_na.loc[:, 'Postcode'].unique()

In [149]:
# Aggregate once per postal code instead of re-filtering the whole frame for
# every code. sort=False keeps the groups in order of first appearance, and
# the reindex below pins both lists to the exact order of `postcodes`, so the
# three sequences stay aligned row for row.
rows_by_postcode = tor_boroughs_drop_na.groupby('Postcode', sort=False)

# If a postal code lists more than one borough, take the alphabetically first
# one (the same rule as sorting the values and taking element 0).
boroughs = rows_by_postcode['Borough'].min().reindex(postcodes).tolist()

# Join each postal code's distinct neighborhoods, alphabetically sorted,
# into a single comma-separated string.
neighborhoods = (
    rows_by_postcode['Neighborhood']
    .agg(lambda names: ', '.join(sorted(names.unique())))
    .reindex(postcodes)
    .tolist()
)

In [150]:
# Assemble one row per postal code from the three parallel sequences.
tor_boroughs_combined = pd.DataFrame(
    data={
        'Postcode': postcodes,
        'Borough': boroughs,
        'Neighborhood': neighborhoods,
    }
)

In [151]:
# Report the size of the combined table and preview its first ten rows.
print('Shape after combining neighborhoods', tor_boroughs_combined.shape)
tor_boroughs_combined.head(n=10)

Shape after combining neighborhoods (103, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Downtown Toronto,Queen's Park
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.

In [152]:
# Where a postal code has a borough but its neighborhood is 'Not assigned',
# use the borough name as the neighborhood. The boolean mask is computed once
# and reused through .loc for both the assignment target and the source
# values, so the two sides are guaranteed to address the same rows.
neighborhood_missing = tor_boroughs_combined['Neighborhood'] == 'Not assigned'
tor_boroughs_combined.loc[neighborhood_missing, 'Neighborhood'] = (
    tor_boroughs_combined.loc[neighborhood_missing, 'Borough']
)

In [153]:
# Spot-check the replacement on the rows whose borough is Queen's Park.
tor_boroughs_combined.loc[tor_boroughs_combined['Borough'] == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighborhood
4,M7A,Queen's Park,Queen's Park


In [154]:
# Display the (rows, columns) shape of the combined table.
tor_boroughs_combined.shape

(103, 3)