# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

#### Getting import packages and functions

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#### Here we download the wikipedia data set using pandas

In [9]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [10]:
data = pd.read_html(url)

#### And convert it to a pandas data frame

In [11]:
t_df = pd.DataFrame(data[0])
t_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Here we eliminate all rows with a "Not assigned" Burough

In [45]:
df = t_df[t_df.Borough != "Not assigned"]
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### And here we see that there are no cases of a row where the Burough is named but the Neighborhood is not

In [46]:
df[df.Neighborhood == "Not assigned"]

Unnamed: 0,Postal Code,Borough,Neighborhood


#### Now we check our shape...

In [47]:
df.shape

(103, 3)

#### Our resulting data set has 3 columns and 103 observations!

## Part 2

#### importing geocoder to get coordinates

In [48]:
#!conda install -c conda-forge geocoder=1.38.1 --yes
#import geocoder # import geocoder

#### set up empty Latitude and Longitude columns in df

In [49]:
df["Latitude"] = np.NaN
df["Longitude"] = np.NaN
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


#### find the coordinates for each Postal Code

In [50]:
#for i in df["Postal Code"]:
    # initialize your variable to None
    #lat_lng_coords = None

    # loop until you get the coordinates
    #while(lat_lng_coords is None):
      #g = geocoder.google('{}, Toronto, Ontario'.format(i))
      #lat_lng_coords = g.latlng
        
    #latitude = lat_lng_coords[0]
    #longitude = lat_lng_coords[1]
    
    #df.loc[df["Postal Code"]==i,"Latitude"] = latitude
    #df.loc[df["Postal Code"]==i,"Longitude"] = longitude

#### geocoder not working (never finishes running, even on a single area code) so we dowload the csv

In [51]:
#geo_data = pd.read_csv("Geospatial_Coordinates.csv")
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_c3184cd62a4c4b3a920f9b334b501109 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='idJWCKU3OzH7JulMLNlXo4IF5cXmOJOlxKqEWJYPLZis',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_c3184cd62a4c4b3a920f9b334b501109.get_object(Bucket='courseracapstoneapplieddatascienc-donotdelete-pr-whqbcsawpi7ucd',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

geo_data = pd.read_csv(body)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [52]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


#### We drop the Latitude and Longitude from our df and merge the df and geo_data by the postal code.

In [53]:
df = df.drop(['Latitude', 'Longitude'], axis=1)
df_final = pd.merge(df, geo_data, on='Postal Code')

In [54]:
df_final[df_final["Postal Code"]=="M5G"]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


#### and we have recreate the section 2 dataframe