# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### First, import the libraries

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

### Scrape the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [None]:
from bs4 import BeautifulSoup
import urllib.request

# Download the Wikipedia page given in the section heading above.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The context manager closes the HTTP response even if read()/decode() raises.
with urllib.request.urlopen(url) as req:
    article = req.read().decode()  # bytes.decode() defaults to UTF-8

# Keep a local copy of the raw HTML. The text was decoded as UTF-8, so write it
# back as UTF-8 explicitly instead of relying on the platform default encoding,
# which can raise UnicodeEncodeError for characters it cannot represent.
with open('ISO_3166-1_alpha-2.html', 'w', encoding='utf-8') as fo:
    fo.write(article)

# Parse the page so the tables can be located in the following cells.
soup = BeautifulSoup(article, 'html.parser')

### Obtain the data in the table of postal codes and transform it into a pandas dataframe `df`

In [None]:
# Locate the postal-code table: the first sortable table whose first three
# header cells are Postcode / Borough / Neighbourhood. On success, `table` and
# `headings` stay bound to that table for the cells below.
tables = soup.find_all('table', class_='sortable')
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == ['Postcode', 'Borough', 'Neighbourhood']:
        break
else:
    # Without this guard the loop would fall through silently, leaving `table`
    # pointing at the last (wrong) table, or undefined when there are none.
    raise ValueError(
        "No sortable table with 'Postcode', 'Borough', 'Neighbourhood' "
        "headers was found on the page"
    )

In [None]:
# Collect one [postcode, borough, neighbourhood] record per table row and build
# the dataframe in a single call; growing a DataFrame row by row with
# df.loc[i] = ... is far slower and needs a manual counter.
rows = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        # Rows without <td> cells (e.g. a header row) carry no data.
        continue
    postcode, borough, neighbourhood = [td.text.strip() for td in tds[:3]]

    # Ignore cells with a borough that is Not assigned.
    if 'Not assigned' in borough:
        continue
    # If a cell has a borough but a Not assigned neighbourhood,
    # then the neighbourhood will be the same as the borough.
    if 'Not assigned' in neighbourhood:
        neighbourhood = borough
    rows.append([postcode, borough, neighbourhood])

# Only the first three cells of each row are kept, so only the first three
# headings are used as column names.
df = pd.DataFrame(rows, columns=headings[:3])

In [None]:
from functools import reduce

def to_set(x):
    """Join the distinct values of ``x`` into one comma-separated string.

    Duplicates are removed and the remaining values keep the order in which
    they first appear. Joining a ``set`` instead would produce an order that
    can change between interpreter runs (string hashing is randomised), making
    the notebook output non-reproducible.

    Parameters
    ----------
    x : iterable of str
        Values to combine, e.g. the neighbourhood names of one postcode group.

    Returns
    -------
    str
        The unique values separated by ``', '``; an empty string for empty input.
    """
    # dict keys are unique and preserve insertion order.
    return ', '.join(dict.fromkeys(x))

# Collapse the table to one row per postcode: keep the first borough seen for
# the postcode and combine its neighbourhoods into a single string, then turn
# the Postcode group key back into a regular column.
grouped = (
    df.groupby("Postcode")
    .agg({'Borough': 'first', 'Neighbourhood': to_set})
    .reset_index()
)

# Copy of the scraped table without its neighbourhood column.
df2 = df.drop(columns='Neighbourhood')

# Display the grouped result with the columns in a fixed order.
grouped[['Postcode','Borough','Neighbourhood']]

### In the last cell of your notebook, use the `.shape` attribute to print the number of rows of your dataframe.

In [None]:
# Display the dimensions of the grouped dataframe as (rows, columns).
grouped.shape

### Part 2: Get the latitude and longitude coordinates of each neighborhood.

In [None]:
#!conda install -c conda-forge geocoder  #install geocoder for the 1st run
import geocoder # import geocoder

In [None]:
# Comment:
# I tried to use geocoder, but the API does not seem to be working,
# so I chose to use the CSV file instead.
#
#nbh = 'Malvern, Rouge'
#postal_code = 'M1B'
#g = geocoder.google(nbh)
#lat_lng_coords = g.latlng

### Import lat_lng_coords CSV, and show the complete dataframe

In [None]:
# Load the coordinates CSV from the course link.
df_ll = pd.read_csv("https://cocl.us/Geospatial_data")

# Rename the key column so it matches the "Postcode" column of `grouped`
# (index=str additionally converts the row labels to strings).
d2 = df_ll.rename(index=str, columns={"Postal Code": "Postcode"})

# Inner-join the coordinates with the grouped postcode table on "Postcode" and
# keep the columns in presentation order.
g2 = d2.merge(
    grouped[['Postcode','Borough','Neighbourhood']],
    how='inner',
    on='Postcode',
)[['Postcode','Borough','Neighbourhood', 'Latitude', 'Longitude']]
g2
