# Clustering Toronto

### Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Wikipedia API endpoint

In [2]:
# MediaWiki Action API query asking for the revision content of the
# "List_of_postal_codes_of_Canada:_M" page as JSON (formatversion=2).
# Adjacent string literals are joined by the parser into a single string.
url = (
    'https://en.wikipedia.org/w/api.php'
    '?action=query'
    '&prop=revisions'
    '&rvprop=content'
    '&format=json'
    '&formatversion=2'
    '&titles=List_of_postal_codes_of_Canada:_M'
)

In [3]:
# Fetch the page JSON. The timeout keeps the cell from hanging indefinitely on
# a stalled connection, and raise_for_status() surfaces an HTTP error right
# here instead of as a confusing JSON-decode or KeyError further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [4]:
# Pull the raw page text out of the API response, then keep only the part that
# sits after the first sortable-wikitable header marker and before the first
# "{{col-begin}}" template.
content = (
    results["query"]["pages"][0]["revisions"][0]["content"]
    .split("class=\"wikitable sortable\"\n! ")[1]
    .split("{{col-begin}}")[0]
)

### Clean the raw table text

In [5]:
# One list entry per "\n"-separated line of the extracted table text.
rows = content.split("\n")

In [6]:
# Discard the '|-' row-separator lines and break every remaining line into its
# '||'-delimited cells.
newrows = [line.split("||") for line in rows if line != '|-']


#### Building lists 

In [7]:
borough = []
neighborhood = []
code = []

# Walk every parsed row after the header (newrows[0]) rather than a hard-coded
# range(1, 289): a fixed bound raises IndexError if the table shrinks and
# silently drops rows if it grows.
for cells in newrows[1:]:
    # Lines that did not split into three cells are not data rows
    # (presumably the table terminator or blank lines) -- skip them.
    if len(cells) < 3:
        continue
    # The first cell is split on spaces and its second token is taken as the
    # postal code (assumes the cell looks like "| M1A " -- TODO confirm).
    code.append(cells[0].split(' ')[1])
    borough.append(cells[1])
    neighborhood.append(cells[2])
    

#### Building a DataFrame

In [8]:
# Combine the three parallel lists into one table, one column per list.
df = pd.DataFrame(
    {
        'PostalCode': code,
        'Borough': borough,
        'Neighborhood': neighborhood,
    }
)

#### Stripping whitespace, wiki-link brackets and pipe separators

In [9]:
# Trim surrounding whitespace from every string cell; non-string values pass
# through unchanged. Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated since pandas 2.1, and gives the same
# result on older releases.
cleanDf = df.apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)

In [10]:
# str.strip treats its argument as a set of characters, so '[[]]' removes any
# run of leading/trailing '[' and ']' (presumably the [[...]] wiki-link
# brackets); astype(str) then coerces every entry to a plain string.
cleanDf['Borough'] = (
    cleanDf['Borough']
    .str.strip('[[]]')
    .astype(str)
)

In [11]:
# str.strip treats its argument as a set of characters, so '[[]]' removes any
# run of leading/trailing '[' and ']' (presumably the [[...]] wiki-link
# brackets); astype(str) then coerces every entry to a plain string.
cleanDf['Neighborhood'] = (
    cleanDf['Neighborhood']
    .str.strip('[[]]')
    .astype(str)
)

In [12]:
# Turn every literal "|" into ", " in both text columns.
# The previous row-by-row form assigned through chained indexing
# (cleanDf['Borough'][i] = ...), which raises SettingWithCopyWarning and has
# no effect at all when pandas Copy-on-Write is active. Assigning the whole
# column back is unambiguous. regex=False makes "|" a plain character rather
# than the regex alternation operator.
for column in ('Borough', 'Neighborhood'):
    cleanDf[column] = cleanDf[column].str.replace("|", ", ", regex=False)

#### Taking a second to check

In [13]:
# Preview the cleaned frame; head() shows the first five rows by default.
cleanDf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront (Toronto), Harbourfront"


### Group by PostalCode, then set Neighborhood to the Borough value where Neighborhood is "Not assigned"

In [49]:
# Collapse to one row per postal code: keep the first Borough seen for the
# code and join all of that code's Neighborhood strings with ", ".
newDf = (
    cleanDf
    .groupby('PostalCode')
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
    .reset_index()
)

In [50]:
# Trim surrounding whitespace from every string cell of the grouped frame;
# non-string values pass through unchanged. Column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated since pandas 2.1, and
# gives the same result on older releases.
newDf = newDf.apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)

In [51]:
# Wherever Neighborhood is exactly 'Not assigned', copy that row's Borough
# into it. The boolean mask selects the same rows on both sides of the
# assignment, so values stay aligned by index.
unassigned = newDf['Neighborhood'] == 'Not assigned'
newDf.loc[unassigned, 'Neighborhood'] = newDf.loc[unassigned, 'Borough']

In [52]:
# Show only the first ten rows: display.max_rows is set to None at the top of
# the notebook, so a bare `newDf` would render every row of the frame.
newDf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M1B,"Scarborough, Toronto, Scarborough","Rouge, Toronto, Rouge, Malvern, Toronto, Malvern"
2,M1C,"Scarborough, Toronto, Scarborough","Highland Creek (Toronto), Highland Creek, Roug..."
3,M1E,"Scarborough, Toronto, Scarborough","Guildwood, Morningside, Toronto, Morningside, ..."
4,M1G,"Scarborough, Toronto, Scarborough","Woburn, Toronto, Woburn"
5,M1H,"Scarborough, Toronto, Scarborough",Cedarbrae
6,M1J,"Scarborough, Toronto, Scarborough",Scarborough Village
7,M1K,"Scarborough, Toronto, Scarborough","East Birchmount Park, Ionview, Kennedy Park, T..."
8,M1L,"Scarborough, Toronto, Scarborough","Clairlea, Golden Mile, Toronto, Golden Mile, O..."
9,M1M,"Scarborough, Toronto, Scarborough","Cliffcrest, Cliffside, Toronto, Cliffside, Sca..."


In [53]:
# (number of rows, number of columns) of the grouped frame.
newDf.shape

(180, 3)

In [54]:
# len() of a DataFrame is its row count, i.e. the same value as shape[0].
print('Total number of rows: ', len(newDf))

Total number of rows:  180


## Import Geospatial Data

In [55]:
# Load the coordinate table straight from the remote CSV (needs network
# access). Its columns are 'Postal Code', 'Latitude' and 'Longitude'.
geo = pd.read_csv('http://cocl.us/Geospatial_data')

In [60]:
# Preview the first five rows of the coordinate table.
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge Geospatial data and Dataframe

In [57]:
# Left join on the postal code: every row of newDf is kept, and codes with no
# match in geo end up with NaN latitude/longitude.
df3 = newDf.merge(
    geo,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)

In [58]:
# The join key from geo duplicates 'PostalCode', so remove it. Reassigning the
# result (rather than inplace=True) keeps the step an explicit, chainable
# expression instead of a hidden mutation.
df3 = df3.drop(columns='Postal Code')

In [59]:
# Show only the first ten rows: display.max_rows is set to None at the top of
# the notebook, so a bare `df3` would render every row of the merged frame.
df3.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1A,Not assigned,Not assigned,,
1,M1B,"Scarborough, Toronto, Scarborough","Rouge, Toronto, Rouge, Malvern, Toronto, Malvern",43.806686,-79.194353
2,M1C,"Scarborough, Toronto, Scarborough","Highland Creek (Toronto), Highland Creek, Roug...",43.784535,-79.160497
3,M1E,"Scarborough, Toronto, Scarborough","Guildwood, Morningside, Toronto, Morningside, ...",43.763573,-79.188711
4,M1G,"Scarborough, Toronto, Scarborough","Woburn, Toronto, Woburn",43.770992,-79.216917
5,M1H,"Scarborough, Toronto, Scarborough",Cedarbrae,43.773136,-79.239476
6,M1J,"Scarborough, Toronto, Scarborough",Scarborough Village,43.744734,-79.239476
7,M1K,"Scarborough, Toronto, Scarborough","East Birchmount Park, Ionview, Kennedy Park, T...",43.727929,-79.262029
8,M1L,"Scarborough, Toronto, Scarborough","Clairlea, Golden Mile, Toronto, Golden Mile, O...",43.711112,-79.284577
9,M1M,"Scarborough, Toronto, Scarborough","Cliffcrest, Cliffside, Toronto, Cliffside, Sca...",43.716316,-79.239476
