# Assignment: Segmenting and Clustering Neighborhoods in Toronto

# 1. Install required libraries

In [2]:
!pip install BeautifulSoup4
!pip install lxml
!pip install tabulate


Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 8.9MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.8.2 soupsieve-1.9.5
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 9.5MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Collecting tabulate
[?25l  Downloading https://files.pythonhos

# Import required libraries

In [3]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


# Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [4]:
# Download the Wikipedia page. The timeout and raise_for_status() make a
# stalled connection or an HTTP error fail here instead of being parsed as
# if it were the postal-code page.
res = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
res.raise_for_status()

# Collect every <table> element on the page and hand the markup to pandas.
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')

# read_html returns a list of DataFrames; later cells use df[0].
# The markup is wrapped in StringIO because newer pandas deprecates passing
# literal HTML text directly to read_html.
df = pd.read_html(StringIO(str(table)))


# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [5]:
# Keep only the rows of the first parsed table whose Borough is assigned.
postal_table = df[0]
has_borough = postal_table['Borough'] != 'Not assigned'
df2 = postal_table[has_borough]


# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [6]:
# Relabel the columns by position with the names used by the later cells.
column_names = ['PostalCode', 'Borough', 'Neighbourhood']
df2.columns = column_names


# Combine into one row with the neighborhoods separated with a comma

In [7]:
# Collapse to one row per postal code: keep the first borough seen and join
# the neighbourhood names of the group with ", ".
aggregations = {'Borough': 'first', 'Neighbourhood': ', '.join}
rows_by_postal_code = df2.groupby('PostalCode')
df2 = rows_by_postal_code.agg(aggregations).reset_index()


# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [8]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# Columns are addressed by name rather than by tuple position, so the result
# stays correct even if this cell is re-run after more columns have been
# appended to df2.
neighbourhood_missing = df2['Neighbourhood'] == 'Not assigned'
df2['Neighbourhood'] = df2['Neighbourhood'].mask(neighbourhood_missing, df2['Borough'])


# Print the dataframe

In [9]:
# Render the frame as a psql-style text table, then print it.
table_text = tabulate(df2, headers='keys', tablefmt='psql')
print(table_text)


+-----+--------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------+
|     | PostalCode   | Borough          | Neighbourhood                                                                                                                          |
|-----+--------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------|
|   0 | M1B          | Scarborough      | Rouge, Malvern                                                                                                                         |
|   1 | M1C          | Scarborough      | Highland Creek, Rouge Hill, Port Union                                                                                                 |
|   2 | M1E          | Scarborough      | Guildwood, Morningside, West Hill                              

# Use the .shape attribute to print the number of rows and columns

In [21]:
# Bare last expression: the notebook displays the (rows, columns) tuple of df2.
df2.shape

(103, 5)

# 2. Use the CSV file to create the dataframe with Latitude and Longitude

In [22]:
# Load the postal-code -> coordinates lookup table.
df3 = pd.read_csv("http://cocl.us/Geospatial_data")

# Join on the postal code instead of copying columns by row position, so a
# different row order or a missing code in the CSV cannot attach coordinates
# to the wrong neighbourhood.
# NOTE(review): assumes the first CSV column holds the postal code; confirm.
coordinates = df3.rename(columns={df3.columns[0]: 'PostalCode'})[
    ['PostalCode', 'Latitude', 'Longitude']
]

# Dropping any previous Latitude/Longitude first keeps this cell re-runnable
# (otherwise a second run would produce suffixed duplicate columns).
df2 = (
    df2.drop(columns=['Latitude', 'Longitude'], errors='ignore')
       .merge(coordinates, on='PostalCode', how='left', validate='many_to_one')
)

# Fail loudly if any postal code did not find coordinates.
assert df2[['Latitude', 'Longitude']].notna().all().all(), \
    "some postal codes have no matching coordinates"

print(tabulate(df2, headers='keys',floatfmt=".6f") )


     PostalCode    Borough           Neighbourhood                                                                                                                             Latitude    Longitude
---  ------------  ----------------  --------------------------------------------------------------------------------------------------------------------------------------  ----------  -----------
  0  M1B           Scarborough       Rouge, Malvern                                                                                                                           43.806686   -79.194353
  1  M1C           Scarborough       Highland Creek, Rouge Hill, Port Union                                                                                                   43.784535   -79.160497
  2  M1E           Scarborough       Guildwood, Morningside, West Hill                                                                                                        43.763573   -79.188711
  3  M1G       