# Segmentation And Clustering

## Import libraries

In [64]:
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen

## Scrape the data from Wikipedia

In [66]:
# Download the Wikipedia page listing the "M" postal codes and parse the
# first table with class "wikitable" into a DataFrame, one row per <tr>.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Use a context manager so the HTTP response is closed once it has been parsed.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')
table_row = soup.find("table", {"class": "wikitable"}).find("tbody").find_all('tr')

# Collect the rows in a plain list and build the DataFrame once.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# copied the whole frame on every iteration.
columns = ['PostalCode', 'Borough', 'Neighborhood']
rows = []
for tr in table_row[1:]:  # table_row[0] is the header row (<th> cells)
    cells = tr.find_all("td")
    rows.append({
        'PostalCode': cells[0].text.strip(),
        'Borough': cells[1].text.strip(),
        'Neighborhood': cells[2].text.strip(),
    })
df = pd.DataFrame(rows, columns=columns)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [67]:
# Keep only the rows whose Borough value is something other than 'Not assigned'
df = df.loc[df['Borough'].ne('Not assigned')]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [86]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row,
# joining their neighborhood names into one comma-separated string.
data = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(lambda names: ', '.join(names.astype(str)))
    .reset_index()
)
location_data = pd.DataFrame(data)
location_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [88]:
# (rows, columns) of df, i.e. the filtered but still ungrouped rows.
# NOTE(review): the grouped result lives in location_data, not df — confirm
# which frame's shape was meant to be reported here.
df.shape

(211, 3)

# PART 2

In [93]:
# Import the geospatial coordinates dataframe from a local CSV file
geospatial_data = pd.read_csv('Geospatial_Coordinates.csv')
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [72]:
# (rows, columns) of the geospatial coordinates table
geospatial_data.shape

(103, 3)

In [91]:
# Attach coordinates to each postal code by joining on the postal-code key.
# A positional pd.concat(axis=1) pairs rows purely by index, so it silently
# mismatches coordinates if the two frames differ in order or length.
# A left merge keeps every row of location_data; because the key columns have
# different names, both 'PostalCode' and 'Postal Code' remain in the result,
# preserving the original six-column layout.
final_dataframe = location_data.merge(
    geospatial_data,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)
final_dataframe.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [92]:
# (rows, columns) of the combined location + coordinates table
final_dataframe.shape

(103, 6)