# Assignment: Segmenting and Clustering Neighborhoods in Toronto

# Install required libraries

In [None]:
!pip install BeautifulSoup4
!pip install lxml
!pip install tabulate


# Import required libraries

In [5]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


# Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [6]:
# Download the Wikipedia page that lists the "M" postal codes.
# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')

# `df` is the list of DataFrames returned by read_html (one per parsed table).
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))


# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [7]:
# Take the first scraped table and keep only the rows whose Borough value
# is something other than the 'Not assigned' placeholder.
postal_table = df[0]
has_borough = postal_table['Borough'] != 'Not assigned'
df2 = postal_table[has_borough]


# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [8]:
# Relabel the three columns, in their existing order.
column_names = ['PostalCode', 'Borough', 'Neighbourhood']
df2.columns = column_names


# Combine rows that share a postal code into one row, with the neighborhoods separated by a comma

In [9]:
# Collapse the rows of each postal code into a single row:
#   - Borough: value from the first row of the group
#   - Neighbourhood: all values of the group joined with ', '
aggregations = {'Borough': 'first', 'Neighbourhood': ', '.join}
rows_by_postal_code = df2.groupby('PostalCode')
df2 = rows_by_postal_code.agg(aggregations).reset_index()


# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [10]:
# Where the neighbourhood is the 'Not assigned' placeholder, fall back to the
# row's borough; every other neighbourhood value is kept as-is.
# Series.where keeps values where the condition is True and takes the
# replacement elsewhere, so this is a vectorized, column-name-based version of
# a per-row check (no reliance on tuple positions from itertuples()).
neighbourhood_is_assigned = df2['Neighbourhood'] != 'Not assigned'
df2['Neighbourhood'] = df2['Neighbourhood'].where(neighbourhood_is_assigned, df2['Borough'])


# Print the dataframe

In [11]:
# Render the dataframe as a psql-style text table, using the column names
# as headers, then print it.
table_text = tabulate(df2, headers='keys', tablefmt='psql')
print(table_text)


+-----+--------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------+
|     | PostalCode   | Borough          | Neighbourhood                                                                                                                          |
|-----+--------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------|
|   0 | M1B          | Scarborough      | Rouge, Malvern                                                                                                                         |
|   1 | M1C          | Scarborough      | Highland Creek, Rouge Hill, Port Union                                                                                                 |
|   2 | M1E          | Scarborough      | Guildwood, Morningside, West Hill                              

# Use the .shape attribute to show the number of rows and columns

In [12]:
# Display the dataframe's dimensions as a (rows, columns) tuple.
df2.shape

(103, 3)