# Beginning of Submission 1

### Import required libraries

In [1]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json 
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge geopy --yes 
#!conda install -c conda-forge folium=0.5.0 --yes
from geopy.geocoders import Nominatim
import folium

### Import libraries for scraping with BeautifulSoup

In [1]:
from bs4 import BeautifulSoup 
import requests
import urllib.request
from urllib.request import urlopen
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
soup = BeautifulSoup(urlopen(url), 'html.parser')
#read pages and pages of documentation to realize one just needs to extract the data by their tags
#soup 

In [3]:
#Grab all the headers of the tables by searching for the table tag
table_headers = soup.find_all('table')[0].find('tr') 
header_list = []
for header in table_headers: 
    try:
        header_list.append(header.get_text()) 
    except: 
        continue

#Then grab everything after the headers, aka the data
data = soup.find_all('table')[0].find_all('tr')[1:] 
data_list = []
for i in data: 
    sub_data = [] 
    for sub_i in i: 
        try:
            sub_data.append(sub_i.get_text()) 
        except: 
            continue
    data_list.append(sub_data) 

#Strip the newline characters from the lists
header_new = [] 
for j in header_list:
    header_new.append(j.replace('\n',''))
    
data_new = []
data_new = [[spot.replace('\n','') for spot in rows] for rows in data_list]

#convert to csv
df = pd.DataFrame(data_new, columns = header_new) 
df.to_csv('neighborhoods.csv') 
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Preprocessing before we move onto geolocation

In [10]:
#Drop the cells that don't have an assigned borough
indexUnassigned = df[df['Borough'] == 'Not assigned'].index
df.drop(indexUnassigned,axis=0,inplace=True)

# Combining neighborhoods with shared postal codes ONLY WORKS ONCE PER RUN AND WILL ERROR 
df_combine = df.groupby(['Postal Code','Borough'], sort=False).agg(', '.join)
df_combine.reset_index(inplace=True)

# Replace'not assigned' neightborhoods with the names of their borough
df_combine['Neighbourhood'] = np.where(df_combine['Neighbourhood'] == 'Not assigned',
                                       df_combine['Borough'], df_combine['Neighbourhood'])

df = df_combine
df
df.reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,<class 'float'>,<class 'float'>
1,M4A,North York,Victoria Village,<class 'float'>,<class 'float'>
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",<class 'float'>,<class 'float'>
3,M6A,North York,"Lawrence Manor, Lawrence Heights",<class 'float'>,<class 'float'>
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",<class 'float'>,<class 'float'>
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",<class 'float'>,<class 'float'>
6,M1B,Scarborough,"Malvern, Rouge",<class 'float'>,<class 'float'>
7,M3B,North York,Don Mills,<class 'float'>,<class 'float'>
8,M4B,East York,"Parkview Hill, Woodbine Gardens",<class 'float'>,<class 'float'>
9,M5B,Downtown Toronto,"Garden District, Ryerson",<class 'float'>,<class 'float'>


In [7]:
df.shape

(103, 3)

# End of Submission 1

# Start of Submission 2

In [11]:
#Instantiate geolocator
#!pip install pgeocode
pd.options.mode.chained_assignment = None
import pgeocode
geolocator = pgeocode.Nominatim('ca')

#Cycle through the postal codes so the geolocator can find the lats and longs
df['Latitude'] = pd.Series(float, index=df.index)
df['Longitude'] = pd.Series(float, index=df.index)
for index in range(0,len(df)):
    location = geolocator.query_postal_code(df.iloc[index]['Postal Code'])
    #print(location.latitude)
    #print(location.longitude)
    df.iloc[index]['Latitude'] = location.latitude
    df.iloc[index]['Longitude'] = location.longitude
  


In [12]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6662,-79.5282
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
7,M3B,North York,Don Mills,43.745,-79.359
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783


# End of Submission 2

# Start of Submission 3

### Per the instructions, only work with the boroughs that contain Toronto in them

In [13]:
df_toronto = df[df['Borough'].str.contains('Toronto')]
df_toronto.reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
3,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
4,M4E,East Toronto,The Beaches,43.6784,-79.2941
5,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754
6,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386
7,M6G,Downtown Toronto,Christie,43.6683,-79.4205
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6496,-79.3833
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.6655,-79.4378


### Move forward with Folium to map the neighborhoods pre-cluster

In [14]:
map_pre = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

#Make and points and label them by neighborhood and borough
for lat, long, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], 
    df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, long],
    radius=5,
    popup=label,
    color='green',
    fill=True,
    fill_color='#5556bc',
    fill_opacity=0.5,
    parse_html=False).add_to(map_pre)
map_pre

### Run KMeans clustering to form 5 neighborhood clusters.

In [15]:
k = 5
clusters = df_toronto.drop(['Postal Code', 'Borough', 'Neighbourhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(clusters)
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)
df_toronto

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
15,0,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
19,4,M4E,East Toronto,The Beaches,43.6784,-79.2941
20,0,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754
24,0,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386
25,3,M6G,Downtown Toronto,Christie,43.6683,-79.4205
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6496,-79.3833
31,3,M6H,West Toronto,"Dufferin, Dovercourt Village",43.6655,-79.4378


In [17]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, long, neighborhood, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'],
    df_toronto['Neighbourhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# End of Submission 3