# Segmenting and Clustering Neighborhoods in Toronto
## Week 3 Peer Graded Assignment

In [80]:
import pandas as pd
import html5lib
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans 

### Read Given Wiki Page, Search For all Tables and condition to get the table with our required fields

In [20]:
tables= pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
headings=['Postcode','Borough','Neighbourhood']
for table in tables:
    current_headings = table.columns.values[:4]
    if len(current_headings) != len(headings):
        continue
    if all(current_headings == headings):
        break
table.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Convert the table to csv and Call it to a data frame

In [21]:
table[headings].to_csv('data.csv', sep=',', header=True, index=False)

In [22]:
df=pd.read_csv('data.csv')
df.shape

(288, 3)

In [23]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Removing rows with value of Borough = 'Not assigned'

In [24]:
df = df[df.Borough != 'Not assigned']

In [25]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Merging rows with same Postcode

In [26]:
aggregation_functions = {'Borough': 'first', 'Neighbourhood': ','.join}
df2 = df.groupby(df['Postcode']).aggregate(aggregation_functions).reset_index()

In [27]:
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Copying Values 'Borough' to 'Neighbourhood',for rows which having 'Neighbourhood'=='Not Assigned'

In [28]:
df2.loc[df2['Neighbourhood']=='Not assigned','Neighbourhood']=df2.Borough

#### M7A was the only one remaining 'Not assigned' Neighbourhood column. Checking it's current State

In [29]:
df2[df2.Postcode == 'M7A']

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [30]:
df2.shape

(103, 3)

In [31]:
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## Adding Geo Location to the Existing Data Frame

In [32]:
geo_loc=pd.read_csv('https://cocl.us/Geospatial_data')

In [33]:
geo_loc=geo_loc.rename(columns={'Postal Code':'Postcode'})

In [73]:
df_geo=pd.merge(left=df2,right=geo_loc, on='Postcode')
temp_df=df_geo
df_geo.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<p>As K-Means Algorithm Cannot be done in Categorical Values, We have to drop all categorical Values</p>

In [74]:
temp_df.drop(['Postcode','Borough','Neighbourhood'],axis=1,inplace=True)

In [75]:
temp_df.head()

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476


In [78]:

X = temp_df.values[:,1:]
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

array([[ 2.09777597],
       [ 2.44798852],
       [ 2.15613628],
       [ 1.86437197],
       [ 1.6310228 ],
       [ 1.6310228 ],
       [ 1.39772948],
       [ 1.16449306],
       [ 1.6310228 ],
       [ 1.3685726 ],
       [ 1.28110403],
       [ 1.04789553],
       [ 1.39772948],
       [ 0.96045696],
       [ 1.16449306],
       [ 0.81474393],
       [ 1.98106673],
       [ 0.3486083 ],
       [ 0.523382  ],
       [ 0.1156253 ],
       [ 0.23210904],
       [-0.1173008 ],
       [-0.1173008 ],
       [-0.0299605 ],
       [-0.46658445],
       [ 0.69818881],
       [ 0.465121  ],
       [ 0.58164715],
       [-0.46658445],
       [-0.9320953 ],
       [-0.6993678 ],
       [-1.13568453],
       [-1.01935285],
       [-1.28107896],
       [ 0.84388426],
       [ 0.90216906],
       [ 0.81474393],
       [ 1.07704414],
       [ 0.3486083 ],
       [ 0.49425098],
       [ 0.61078127],
       [ 0.465121  ],
       [ 0.84388426],
       [ 0.58164715],
       [ 0.08650566],
       [ 0

In [81]:
num_clusters = 3

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_dataset)
labels = k_means.labels_

print(labels)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 2 2 2
 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 0 0 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [82]:
df_geo["labels"]=labels

In [84]:
df_geo

Unnamed: 0,Latitude,Longitude,labels
0,43.806686,-79.194353,2
1,43.784535,-79.160497,2
2,43.763573,-79.188711,2
3,43.770992,-79.216917,2
4,43.773136,-79.239476,2
5,43.744734,-79.239476,2
6,43.727929,-79.262029,2
7,43.711112,-79.284577,2
8,43.716316,-79.239476,2
9,43.692657,-79.264848,2


In [86]:
df_geo.groupby('labels').mean()

Unnamed: 0_level_0,Latitude,Longitude
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43.687098,-79.519158
1,43.695659,-79.389651
2,43.746157,-79.265587
