# Clustering and segmentation for neighborhoods in Toronto

### Needed libraries for the assignment:

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Getting the table from the Wiki page:

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
website = response.text

soup = BeautifulSoup(website,'lxml')

# Getting the table. find() returns None when nothing matches, so fail with a
# clear message rather than an AttributeError on the next line.
tablePostalCodes = soup.find('table',{'class':'wikitable sortable'})
if tablePostalCodes is None:
    raise ValueError('No table with class "wikitable sortable" found at ' + url)

# Getting a list with each row of the table, skipping the first row (the
# heading of the table). Slicing is used instead of `del rows[0]` so that an
# empty table yields an empty list rather than raising IndexError.
detailPostalCodes = tablePostalCodes.find_all('tr')[1:]




### Building a summarized list of lists of Postal codes of Canada

In [3]:
# Build one summarized row [postcode, borough, neighbourhoods] per postal code.
#
# Rows are grouped in a dict keyed by postcode, so every neighbourhood of a
# postal code ends up in the same cell even if its rows are not adjacent in the
# table. Dicts keep insertion order, so the result follows the order in which
# each postcode first appears in the table.
groupedCodes = {}

# Reading each row from detailPostalCodes and grouping it by postcode.
for postalCode in detailPostalCodes:
    detailPostalCode = postalCode.find_all('td')
    # Strip every cell: stray whitespace/newlines around the text would
    # otherwise break the 'Not assigned' comparisons and the grouping key.
    postcode = detailPostalCode[0].text.strip()
    borough = detailPostalCode[1].text.strip()
    neighbourhood = detailPostalCode[2].text.strip()

    # Only taking into account the rows with a borough.
    if borough == 'Not assigned':
        continue

    # Assigning the borough to the neighbourhood when there is no neighbourhood.
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    # The borough of the first row seen for a postcode is the one that is kept.
    if postcode not in groupedCodes:
        groupedCodes[postcode] = (borough, [])
    groupedCodes[postcode][1].append(neighbourhood)

# Flatten the groups into a list of lists. When there are many neighbourhoods
# for the same postcode they are joined into one comma-separated cell. If no
# row had an assigned borough the list is simply empty (no placeholder row).
summarizedCodesCanada = [
    [postcode, borough, ', '.join(neighbourhoods)]
    for postcode, (borough, neighbourhoods) in groupedCodes.items()
]


### Building the dataframe from the list

In [12]:
# Column labels for the three fields stored in each summarized row.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Turn the list of [postcode, borough, neighbourhoods] rows into a dataframe.
dfSummarizedCodesCanada = pd.DataFrame(
    data=summarizedCodesCanada,
    columns=column_names,
)

# Show the first 5 rows as the cell output.
dfSummarizedCodesCanada.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [13]:
# Print the shape of the dataframe as a (rows, columns) tuple.
dataframeShape = dfSummarizedCodesCanada.shape
print(dataframeShape)


(103, 3)
