### Preliminary stuff from Q1 and Q2

In [1]:
import pandas as pd
import numpy as np

Read data from HTML

In [2]:
# Parse the HTML tables found on the Wikipedia postal-code page
df_list = pd.read_html(io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

Select the first table

In [3]:
# Keep only the first table parsed from the page; the cells below expect it to
# carry 'Postcode', 'Borough' and 'Neighborhood' columns.
df = df_list[0]

Create the shown dataframe

In [4]:
# Discard rows whose borough is the placeholder 'Not assigned'
df = df.loc[df['Borough'].ne('Not assigned')]

In [5]:
# Collapse to one row per (Postcode, Borough), joining the neighborhood names with ', '
df = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

In [6]:
# Where the neighborhood is the placeholder 'Not assigned' (or missing), fall back
# to the borough name. np.nan is spelled in lower case because the np.NAN alias
# was removed in NumPy 2.0.
df['Neighborhood'] = (
    df['Neighborhood']
    .replace('Not assigned', np.nan)
    .fillna(value=df['Borough'])
)

In [7]:
postcode_list = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M', 'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A']
# Row label of the first row matching each postcode, in the order listed above.
# The array is built in one pass so it keeps the dtype of df's index labels
# (appending to an empty np.array([]) stores them as floats) and avoids the
# reallocation np.append performs on every iteration.
# A postcode that is absent from df raises IndexError.
index_list = np.array(
    [df.index[df['Postcode'] == postcode][0] for postcode in postcode_list]
)

### Q2

Load data from csv

In [8]:
# Download the CSV that is merged onto df by 'Postal Code' further down
df_ll = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')

Merge the two datasets on postal code

In [9]:
# Align the key column name with the one used by df_ll ('Postal Code')
df = df.rename(columns={'Postcode': 'Postal Code'})

In [10]:
# Left join: keep every row of df and attach the df_ll columns by 'Postal Code'
df = pd.merge(df, df_ll, how='left', on='Postal Code')

### Q3

Select data from Toronto only

In [11]:
# List the distinct borough names to decide which ones to keep
pd.unique(df['Borough'])

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke', "Queen's Park"], dtype=object)

In [12]:
# Boroughs kept for the Toronto-only analysis (the ones with 'Toronto' in their name)
Toronto_list = [
    'East Toronto',
    'Central Toronto',
    'Downtown Toronto',
    'West Toronto',
]

In [13]:
# Keep only the rows whose borough is one of the Toronto boroughs
df = df[df['Borough'].isin(Toronto_list)]

Show the Toronto map

In [14]:
import folium

In [15]:
# Mean coordinates of the selected rows; used as the centre of the maps below
avg_latitude, avg_longitude = (
    df[column].mean() for column in ('Latitude', 'Longitude')
)

In [16]:
# create map of Toronto centred on the mean latitude/longitude of the selected rows
map_toronto = folium.Map(location=[avg_latitude, avg_longitude], zoom_start=11.5)

# add one circle marker per row, with a "neighborhood, borough" popup
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# last expression of the cell: renders the map
map_toronto

Perform clustering

In [17]:
from sklearn.cluster import KMeans

In [18]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [19]:
# set number of clusters
kclusters = 4

# cluster on geographic position only
latlon = df[['Latitude', 'Longitude']]

# run k-means clustering; n_init is pinned because scikit-learn changed its
# default from 10 to 'auto' in version 1.4, which would otherwise make the
# resulting labels depend on the installed version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(latlon)

# check cluster labels generated for the first 20 rows in the dataframe
kmeans.labels_[0:20]

array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
      dtype=int32)

Show how they cluster

In [20]:
# add clustering labels as the first column; any copy left by an earlier run of
# this cell is dropped first, because DataFrame.insert raises ValueError when the
# column already exists
df = df.drop(columns='Cluster Labels', errors='ignore')
df.insert(0, 'Cluster Labels', kmeans.labels_)

In [21]:
# create map centred on the mean latitude/longitude of the selected rows
map_clusters = folium.Map(location=[avg_latitude, avg_longitude], zoom_start=11.5)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to kclusters - 1, so they index the palette directly
    # (no reliance on negative-index wraparound for cluster 0)
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

# last expression of the cell: renders the map
map_clusters