# Segmenting and Clustering Neighborhoods in Toronto Part 1

### Import libraries

In [278]:
import pandas as pd
import numpy as np
import requests

### Scraping the Wikipedia data

In [279]:
# Download the Wikipedia page listing the "M" postal codes and parse every
# HTML table found on it.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from blocking the kernel forever if the server stalls.
wiki_page = requests.get(wiki_url, timeout=30)
# Stop here on an HTTP error status instead of feeding an error page to the parser.
wiki_page.raise_for_status()
# pd.read_html returns a list with one DataFrame per table in the markup.
wiki_data = pd.read_html(wiki_page.text)
wiki_data

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z      Not assigned   
 176         M6Z      Not assigned   
 177         M7Z      Not assigned   
 178         M8Z         Etobicoke   
 179         M9Z      Not assigned   
 
                                          Neighbourhood  
 0                                         Not assigned  
 1                                         Not assigned  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                            Regent Park, Harbourfront  
 ..                                                 ...  
 175                                       Not assigned  
 176                                       Not assigned  
 177                

In [280]:
# Number of tables pd.read_html parsed from the page (one list entry per table).
len(wiki_data)

3

In [281]:
# Confirm the container type returned by pd.read_html.
type(wiki_data)

list

### Obtain dataframe with three columns

In [282]:
# Only the first parsed table is needed; the other tables are ignored.
wiki_df = wiki_data[0]
# A cell displays only its last expression, so one trailing reference is enough.
wiki_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [283]:
# Column labels of the selected table as a plain Python list.
wiki_df.columns.tolist()

['Postal Code', 'Borough', 'Neighbourhood']

### Drop the Borough and Neighbourhood which are not assigned

In [284]:
# Keep only the rows where both the Borough and the Neighbourhood are assigned,
# matching what the section heading describes.
assigned_mask = (wiki_df['Borough'] != 'Not assigned') & (wiki_df['Neighbourhood'] != 'Not assigned')
# .copy() makes df an independent frame rather than a slice of wiki_df, so later
# modifications cannot raise SettingWithCopyWarning or touch the raw table.
df = wiki_df[assigned_mask].copy()
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [285]:
# Sanity check: total number of "Not assigned" matches left in the Neighbourhood column.
df['Neighbourhood'].str.count("Not assigned").sum()

0

### Sort by postal code, reset the index values and drop the old index

In [286]:
# Order the rows by postal code (a sort only; no grouping or aggregation happens here).
df = df.sort_values(by='Postal Code')
df



Unnamed: 0,Postal Code,Borough,Neighbourhood
9,M1B,Scarborough,"Malvern, Rouge"
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
27,M1E,Scarborough,"Guildwood, Morningside, West Hill"
36,M1G,Scarborough,Woburn
45,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
107,M9P,Etobicoke,Westmount
116,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
143,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [287]:
# Replace the old row labels with a fresh 0..n-1 index. drop=True discards the
# old index directly instead of inserting it as an 'index' column that would
# then have to be removed, and reassigning avoids in-place mutation.
df = df.reset_index(drop=True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


### Obtain the shape of dataframe

In [288]:
# (rows, columns) of the dataframe after filtering, sorting and re-indexing.
df.shape

(103, 3)