# Segmenting and Clustering 

### Neighborhoods in Toronto:

#### Parsing data from Wikipedia page, and creating a dataframe.

#### Import libraries:

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

#### Url for wiki page:

In [2]:
# Wikipedia page listing Canadian postal codes that start with "M".
# NOTE(review): this is a live page fetched at run time; its table layout can
# change between runs, so the parsing cells below may need adjusting.
url_w = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#### Using BeautifulSoup to parse the html wiki page, and find table within it:

In [3]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops us from parsing an HTTP error page as
# if it were the real article.
response = requests.get(url_w, timeout=30)
response.raise_for_status()
shtml = response.text

# Parse the HTML with Python's built-in parser.
soup = BeautifulSoup(shtml, 'html.parser')

In [4]:
# Take the first <table> element of the page. find() returns None when no
# table exists, so fail here with a clear message instead of later with an
# AttributeError on `table.find_all`.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")
print("... table data parsed ...")

... table data parsed ...


#### Define "table_rows" and find all "tr" tags:

In [5]:
# Collect every <tr> element found inside the selected table.
table_rows = table.find_all('tr')

#### Define the "rows_list" list, and use a loop to append all the rows to it:

In [6]:
# Accumulator for the parsed table rows (one list of cell texts per row).
rows_list = list()

In [7]:
# Start from an empty list so that re-running this cell does not append the
# same rows a second time (the list itself is created in an earlier cell).
rows_list.clear()

for tr in table_rows:
    # Only <td> cells are collected, so a row without any <td> yields [].
    td = tr.find_all('td')
    row = [cell.text for cell in td]
    rows_list.append(row)
    print(row)

[]
['M1A', 'Not assigned', 'Not assigned\n']
['M2A', 'Not assigned', 'Not assigned\n']
['M3A', 'North York', 'Parkwoods\n']
['M4A', 'North York', 'Victoria Village\n']
['M5A', 'Downtown Toronto', 'Harbourfront\n']
['M5A', 'Downtown Toronto', 'Regent Park\n']
['M6A', 'North York', 'Lawrence Heights\n']
['M6A', 'North York', 'Lawrence Manor\n']
['M7A', "Queen's Park", 'Not assigned\n']
['M8A', 'Not assigned', 'Not assigned\n']
['M9A', 'Etobicoke', 'Islington Avenue\n']
['M1B', 'Scarborough', 'Rouge\n']
['M1B', 'Scarborough', 'Malvern\n']
['M2B', 'Not assigned', 'Not assigned\n']
['M3B', 'North York', 'Don Mills North\n']
['M4B', 'East York', 'Woodbine Gardens\n']
['M4B', 'East York', 'Parkview Hill\n']
['M5B', 'Downtown Toronto', 'Ryerson\n']
['M5B', 'Downtown Toronto', 'Garden District\n']
['M6B', 'North York', 'Glencairn\n']
['M7B', 'Not assigned', 'Not assigned\n']
['M8B', 'Not assigned', 'Not assigned\n']
['M9B', 'Etobicoke', 'Cloverdale\n']
['M9B', 'Etobicoke', 'Islington\n']
['M9B'

In [8]:
# Peek at the first five parsed rows.
rows_list[:5]

[[],
 ['M1A', 'Not assigned', 'Not assigned\n'],
 ['M2A', 'Not assigned', 'Not assigned\n'],
 ['M3A', 'North York', 'Parkwoods\n'],
 ['M4A', 'North York', 'Victoria Village\n']]

#### Add list content to dataframe (df_neigh):

In [9]:
# Build the dataframe from the list of rows; columns get default integer
# labels at this point and are renamed in a later cell.
df_neigh = pd.DataFrame(data=rows_list)
df_neigh.head()

Unnamed: 0,0,1,2
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


#### Rename the columns with proper names, drop the empty first row, and remove all the rows where "Borough" is "Not assigned":

In [10]:
# Give the three scraped columns meaningful names.
df_neigh.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Remove rows that carry no data at all (the first parsed row is empty
# because it has no <td> cells). Unlike an in-place drop of label 0, this
# does not raise when the cell is run a second time.
df_neigh = df_neigh.dropna(how='all')

# Keep only postal codes that have a borough assigned. The explicit copy
# makes df_neigh an independent frame, so later column assignments do not
# trigger a SettingWithCopyWarning.
df_neigh = df_neigh[df_neigh['Borough'] != 'Not assigned'].copy()
df_neigh.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n
6,M5A,Downtown Toronto,Regent Park\n
7,M6A,North York,Lawrence Heights\n


#### During parsing of html, the new line character (\n) got captured too, so we need to remove it from the values in "Neighborhood" column:

In [11]:
# Strip the trailing newline captured from the HTML. The vectorised .str
# accessor leaves missing values untouched, whereas calling rstrip through
# map() would raise on a non-string entry.
df_neigh['Neighborhood'] = df_neigh['Neighborhood'].str.rstrip('\n')
df_neigh.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


#### In the next step, the idea is to group the dataframe according to Postal Code and combine the Neighborhoods that share the same Postal Code (and Borough), separated by the ',' character. We also want to reset the index of our dataset.

In [12]:
# One row per postal code: within each group, the distinct values of every
# remaining column are joined into a single comma-separated string.
# reset_index() then turns PostalCode back into a regular column.
df_tor = (
    df_neigh.astype(str)
    .groupby('PostalCode')
    .agg(lambda values: ','.join(values.unique()))
    .reset_index()
)
df_tor.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Where "Neighborhood" is "Not assigned", we want to use the value from the "Borough" column of the same row instead. One way to do this is to mark the "Not assigned" values in "Neighborhood" as NaN, and then fill those NaN values with the values from the "Borough" column.

In [13]:
# Replace "Not assigned" neighborhoods with the borough of the same row.
# The result is assigned back to the column explicitly: calling an inplace
# method on `df_tor['Neighborhood']` is chained assignment, which warns in
# recent pandas and does not update df_tor at all under Copy-on-Write.
unassigned = df_tor['Neighborhood'] == 'Not assigned'
df_tor['Neighborhood'] = df_tor['Neighborhood'].mask(unassigned, df_tor['Borough'])
df_tor.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


#### Just to keep things safe, we are going to save dataset to .csv file:

In [14]:
# Persist the cleaned postal-code table to the working directory.
# NOTE(review): no `index` argument is passed, so the row index is written
# as an extra unnamed first column — confirm whether readers of this file
# expect that before changing it.
df_tor.to_csv('Toronto_PostalCodes.csv')

#### For testing, and observation purposes sorting dataset by 'Neighborhood':

In [15]:
# Sorted view for eyeballing the data; sort_values returns a new frame, so
# df_tor keeps its original order.
toronto_set = df_tor.sort_values('Neighborhood')
toronto_set.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
58,M5H,Downtown Toronto,"Adelaide,King,Richmond"
12,M1S,Scarborough,Agincourt
14,M1V,Scarborough,"Agincourt North,L'Amoreaux East,Milliken,Steel..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
89,M8W,Etobicoke,"Alderwood,Long Branch"
28,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights"
19,M2K,North York,Bayview Village
62,M5M,North York,"Bedford Park,Lawrence Manor East"
56,M5E,Downtown Toronto,Berczy Park
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Size of the set:

In [16]:
# (rows, columns) of the sorted set.
toronto_set.shape

(103, 3)

## Importing Geospatial data for Neighborhoods:

##### After downloading Geospatial Coordinates data we are going to read it with pandas:

In [17]:
# Data source for Geospatial_Coordinates.csv (downloaded beforehand, presumably from this link): https://cocl.us/Geospatial_data

In [18]:
# Load the coordinates file from the working directory and preview the
# first five rows.
df_cord = pd.read_csv('Geospatial_Coordinates.csv')
df_cord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Listing the earlier set, for comparison purposes:

In [19]:
# First five rows of the postal-code table, to compare against df_cord.
df_tor.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Let's look at the size of the two sets:

In [20]:
# (rows, columns) of the coordinates table.
df_cord.shape

(103, 3)

In [21]:
# (rows, columns) of the postal-code table.
df_tor.shape

(103, 3)

#### Next we are going to merge two datasets into one:

In [22]:
# Join the coordinates onto the postal-code table by matching the postal code
# itself. A positional concat (axis=1) pairs rows by index only, so any
# difference in row order or row count between the two frames would silently
# attach coordinates to the wrong postal code.
# - how='left' keeps every row of df_tor; codes without coordinates get NaN.
# - validate='one_to_one' raises if a postal code is duplicated on either side.
# Both key columns ('PostalCode' and 'Postal Code') are kept in the result so
# the match can still be inspected.
df_toronto = df_tor.merge(
    df_cord,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
    validate='one_to_one',
)

#### Using .head() and .tail() to observe if PostalCode values match after merge:

In [23]:
# First ten rows: 'PostalCode' and 'Postal Code' should show the same value.
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",M1N,43.692657,-79.264848


In [24]:
# Last ten rows: same check at the other end of the table.
df_toronto.tail(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
93,M9A,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
94,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar...",M9B,43.650943,-79.554724
95,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B...",M9C,43.643515,-79.577201
96,M9L,North York,Humber Summit,M9L,43.756303,-79.565963
97,M9M,North York,"Emery,Humberlea",M9M,43.724766,-79.532242
98,M9N,York,Weston,M9N,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,M9P,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",M9R,43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",M9V,43.739416,-79.588437
102,M9W,Etobicoke,Northwest,M9W,43.706748,-79.594054


#### Now we can drop "Postal Code" column from our set:

In [25]:
# Remove the duplicate key column. drop() returns a new frame, and
# errors='ignore' means re-running this cell after the column is already gone
# does not raise a KeyError (the in-place drop did).
df_toronto = df_toronto.drop(columns='Postal Code', errors='ignore')
df_toronto.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


#### As a safety measure, we are going to save full set to .csv file:

In [26]:
# Persist the merged table (postal code, borough, neighborhood, coordinates)
# to the working directory.
# NOTE(review): the row index is written as an extra unnamed first column,
# since no `index` argument is passed — confirm that is intended.
df_toronto.to_csv('Toronto_FullSet.csv')