# Segmentation of Toronto city data

### Libraries
Let's start by importing required libraries

In [8]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

With the help of `beautifulsoup4` package let's scrap the following link <a>https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M</a>

In [25]:
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
extracting_data = requests.get(link).text
data = BeautifulSoup(extracting_data, 'html.parser')

In [28]:
# To see the scrapped code, just uncomment the following code.

#data

Converting the contents in the table to dataframe.

In [95]:
column_name = ['PostalCode', 'Borough', 'Neighborhood']
toronto_df = pd.DataFrame(columns = column_name)
content = data.find('div', class_='mw-parser-output')
table = content.table.tbody
postcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i = i + 1
        elif i == 1:
            borough = td.text
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
    toronto_df = toronto_df.append({'PostalCode': postcode, 'Borough': borough, 'Neighborhood': neighborhood}, ignore_index = True)

In [96]:
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df = toronto_df[toronto_df.Borough != 0]
toronto_df.reset_index(drop = True, inplace = True)
i = 0
for i in range(0, toronto_df.shape[0]):
    if toronto_df.iloc[i][2] == 'Not assigned':
        toronto_df.iloc[i][2] = toronto_df.iloc[i][1]
        i = i + 1

Displaying the extracted data in dataframe.

In [97]:
df = toronto_df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M1B\n,Scarborough\n,"Malvern, Rouge"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
4,M1G\n,Scarborough\n,Woburn
...,...,...,...
175,M9V\n,Etobicoke\n,"South Steeles, Silverstone, Humbergate, Jamest..."
176,M9W\n,Etobicoke\n,"Northwest, West Humber - Clairville"
177,M9X\n,Not assigned\n,Not assigned\n
178,M9Y\n,Not assigned\n,Not assigned\n


In [99]:
df.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,180,180,180
unique,180,11,100
top,M5L\n,Not assigned\n,Not assigned\n
freq,1,77,77


Let's remove the data where the `Borough` value which are `Not Assigned`.

In [100]:
df = df.dropna()
empty = 'Not assigned'
df = df[(df.PostalCode!= empty) & (df.Borough != empty) & (df.Neighborhood != empty)]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M1B\n,Scarborough\n,"Malvern, Rouge"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
4,M1G\n,Scarborough\n,Woburn


In [101]:
def neighborhood_list(grouped):
    return ', '.join(sorted(grouped['Neighborhood'].tolist()))

grp = df.groupby(['PostalCode', 'Borough'])
df_1 = grp.apply(neighborhood_list).reset_index(name="Neighborhood")

In [102]:
df_1.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,180,180,180
unique,180,11,100
top,M5L\n,Not assigned\n,Not assigned\n
freq,1,77,77


In [103]:
print(df_1.shape)
df_1.head()

(180, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M1B\n,Scarborough\n,"Malvern, Rouge"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
4,M1G\n,Scarborough\n,Woburn


Let's save the extracted data into a csv file and name it as `toronto.csv`

In [104]:
df_1.to_csv('toronto.csv', index=False)