# Toronto Postal Codes

### Import Libraries

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import os # read API credentials from environment variables
from getpass import getpass # prompt for credentials that are not set in the environment

#!conda install -c conda-forge geocoder --yes
import geocoder

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#!conda install -c conda-forge folium --yes
import folium

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries loaded.')

## Question #1

### 1.1 Retrieve data

In [None]:
# pd.read_html returns one DataFrame per HTML table found on the page;
# the first table is taken as the postal-code listing.
list_of_dfs = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)
toronto_df = list_of_dfs[0]

toronto_df.head()

### 1.2 Remove Not Assigned Boroughs

In [None]:
# Keep only the rows whose Borough has been assigned. Filtering into a new,
# re-indexed frame (rather than dropping in place) leaves the scraped table
# held in list_of_dfs untouched.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].reset_index(drop=True)
toronto_df.head()

### 1.3 Check Not Assigned Neighbourhood

In [None]:
# List any remaining rows whose Neighbourhood is still the 'Not assigned' placeholder.
toronto_df[toronto_df['Neighbourhood'] == 'Not assigned']

There are no rows with a **Not assigned** Neighbourhood.

### 1.4 Final result. Dataframe Shape

In [None]:
# (number of rows, number of columns) of the cleaned frame
toronto_df.shape

The dataframe has **103 rows** and **3 columns**

## Question #2

### Populate the Latitude and Longitude of postal code dataframe

In [None]:
def get_location_remote(row, max_attempts=10):
    """Geocode one postal-code row through ``geocoder.google``.

    Intended for ``DataFrame.apply(..., axis=1)``: the row is looked up as
    ``'<Postal Code>, Toronto, Ontario'`` and the resulting coordinates are
    written back onto it.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide a ``'Postal Code'`` entry; it is modified in place.
    max_attempts : int, optional
        Upper bound on the number of lookups. The service can keep returning
        no result, so the retry loop is bounded instead of spinning forever.

    Returns
    -------
    The same ``row`` with ``'Latitude'`` and ``'Longitude'`` set.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained within ``max_attempts`` lookups.
    """
    query = '{}, Toronto, Ontario'.format(row['Postal Code'])

    lat_lng_coords = None
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            break
    else:
        # reached only when the loop finished without a successful lookup
        raise RuntimeError(
            'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
        )

    row['Latitude'] = lat_lng_coords[0]
    row['Longitude'] = lat_lng_coords[1]

    return row

# Use the CSV of pre-computed coordinates, keyed by 'Postal Code'.
toronto_geo = pd.read_csv('https://cocl.us/Geospatial_data')

# Drop coordinate columns left over from a previous run of this cell first;
# otherwise re-running would merge again and produce suffixed duplicates
# (Latitude_x / Latitude_y) instead of 'Latitude' and 'Longitude'.
toronto_df = (
    toronto_df
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(toronto_geo, on='Postal Code')
)
toronto_df.head()

# Alternative: geocode every postal code remotely instead of using the CSV.
#toronto_df = toronto_df.apply(get_location_remote, axis=1)
#toronto_df.head()

## Question #3

Quickly examine the resulting dataframe.

In [None]:
# Preview the first rows of the frame after the coordinates were merged in.
toronto_df.head()

In [None]:
# Summary line: distinct Borough values and total number of rows.
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
    len(pd.unique(toronto_df['Borough'])), len(toronto_df.index)))

#### Use geopy library to get the latitude and longitude values of Toronto City.


In [None]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() gives None when the address cannot be resolved; fail with a clear
# message here rather than with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address {!r}'.format(address))

# latitude / longitude are reused by the map cells further down
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

#### Create a map of Toronto with neighborhoods superimposed on top.


In [None]:
# Base map centred on the Toronto coordinates stored in latitude / longitude.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row, with a "<neighbourhood>, <borough>" popup.
marker_fields = toronto_df[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_newyork)

map_newyork

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.


#### Define Foursquare Credentials and Version


In [None]:
# Foursquare credentials are taken from the environment so that no secret is
# stored in (or printed by) the notebook. When a variable is not set, the value
# is requested interactively without echoing it.
# NOTE(review): any key pair that was ever hard-coded in this cell should be
# treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare CLIENT_ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare CLIENT_SECRET: ')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

# Confirm loading without revealing the values themselves.
print('Foursquare credentials loaded.')

### Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto


In [None]:
def getNearbyVenues(post_codes, boroughs, neighbourhoods, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare returns around each postal code.

    The five positional iterables are walked in parallel; for every position
    one request is sent to the ``venues/explore`` endpoint using the
    module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.

    Parameters
    ----------
    post_codes, boroughs, neighbourhoods, latitudes, longitudes : iterable
        Parallel sequences describing each location to query.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postal Code', 'Borough',
        'Neighbourhood', 'Postal Code Latitude', 'Postal Code Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was returned at all.
        'Venue Category' is missing (None) for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url = 'https://api.foursquare.com/v2/venues/explore'
    columns = ['Postal Code',
               'Borough',
               'Neighbourhood',
               'Postal Code Latitude',
               'Postal Code Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for post_code, borough, neighborhood, lat, lng in zip(post_codes, boroughs, neighbourhoods, latitudes, longitudes):
        print(post_code)

        params = dict(
            client_id=CLIENT_ID,
            client_secret=CLIENT_SECRET,
            v=VERSION,
            ll='{}, {}'.format(lat, lng),
            limit=LIMIT,
            radius=radius
        )

        # A timeout keeps one stalled request from hanging the whole run, and
        # raise_for_status turns an HTTP error (bad credentials, quota, ...)
        # into an explicit exception instead of a KeyError further down.
        response = requests.get(url=url, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or [{}]
        for item in groups[0].get('items', []):
            venue = item['venue']
            # some venues come back with an empty category list
            categories = venue.get('categories') or []
            rows.append((
                post_code,
                borough,
                neighborhood,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works for an empty result,
    # where assigning .columns afterwards would fail with a length mismatch
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [None]:
# Fetch the venues around every postal code, overriding the default radius with 2000.
toronto_venues = getNearbyVenues(
    toronto_df['Postal Code'],
    toronto_df['Borough'],
    toronto_df['Neighbourhood'],
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    radius=2000,
)


#### Let's check the size of the resulting dataframe


In [None]:
# Print the (rows, columns) of the venue table, then preview its first rows.
print(toronto_venues.shape)
toronto_venues.head()

Let's check how many venues were returned for each postal code


In [None]:
# Per postal code, count the non-null entries of every other column.
toronto_venues.groupby('Postal Code').count()

#### Let's find out how many unique categories can be curated from all the returned venues


In [None]:
# Number of distinct values found in the 'Venue Category' column.
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

### Analyze Each Neighborhood


In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postal Code'] = toronto_venues['Postal Code'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

And let's examine the new dataframe size.

In [None]:
# (venue rows, 1 postal-code column + one column per category)
toronto_onehot.shape

#### Next, let's group rows by postal code and take the mean of the frequency of occurrence of each category


In [None]:
# Mean of every indicator column per postal code, i.e. the share of that
# postal code's venues falling in each category. as_index=False keeps
# 'Postal Code' as an ordinary first column.
toronto_grouped = toronto_onehot.groupby('Postal Code', as_index=False).mean()
toronto_grouped

#### Let's confirm the new size

In [None]:
# (one row per postal code, 1 postal-code column + one column per category)
toronto_grouped.shape

#### Let's print each neighborhood along with the top 5 most common venues


In [None]:
num_top_venues = 5

# For every postal code, print its categories ranked by mean frequency.
for _, grouped_row in toronto_grouped.iterrows():
    hood = grouped_row['Postal Code']
    print("----"+hood+"----")
    # everything after the leading 'Postal Code' entry is a category frequency
    freq_table = (
        grouped_row.iloc[1:]
        .astype(float)
        .round(2)
        .rename_axis('venue')
        .reset_index(name='freq')
    )
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

#### Let's put that into a _pandas_ dataframe
First, let's write a function to sort the venues in descending order.



In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (it holds the identifier, not a
    frequency); the remaining entries are ordered from largest to smallest and
    the index labels of the first ``num_top_venues`` of them are returned as
    an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.


In [None]:
num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare except around the list lookup,
    # so unrelated errors are no longer silently swallowed
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per postal code
toronto_venues_sorted = pd.DataFrame(columns=columns)
toronto_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

# fill the ranking columns row by row with the top category names
for ind in np.arange(toronto_grouped.shape[0]):
    toronto_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

toronto_venues_sorted.head()

### Cluster Neighborhoods


Run _k_-means to cluster the neighborhoods into 5 clusters.


In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.


In [None]:
# add clustering labels
toronto_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge toronto_merged with toronto_df to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_venues_sorted.set_index('Postal Code'), on='Postal Code')

toronto_merged.head(10) # check the last columns!

Finally, let's visualize the resulting clusters


In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.


#### Cluster 1


In [None]:
# Rows labelled 0, restricted to column 1 plus columns 5 onwards
# (presumably Borough and the cluster/venue columns - verify against the column order).
toronto_merged[toronto_merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

#### Cluster 2


In [None]:
# Rows labelled 1, restricted to column 1 plus columns 5 onwards
# (presumably Borough and the cluster/venue columns - verify against the column order).
toronto_merged[toronto_merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

#### Cluster 3


In [None]:
# Rows labelled 2, restricted to column 1 plus columns 5 onwards
# (presumably Borough and the cluster/venue columns - verify against the column order).
toronto_merged[toronto_merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

#### Cluster 4


In [None]:
# Rows labelled 3, restricted to column 1 plus columns 5 onwards
# (presumably Borough and the cluster/venue columns - verify against the column order).
toronto_merged[toronto_merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

#### Cluster 5

In [None]:
# Rows labelled 4, restricted to column 1 plus columns 5 onwards
# (presumably Borough and the cluster/venue columns - verify against the column order).
toronto_merged[toronto_merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]