# Segmenting and Clustering Neighborhoods in Toronto

In [11]:
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

## Scraping data from Wikipedia page with BeautifulSoup package and putting results into a Pandas datafram

In [134]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "lxml")
#print(soup.prettify())
soup.title.string
right_table=soup.find('table', class_='wikitable sortable')
right_table

A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

        
df=pd.DataFrame(A,columns=['Postal Code'])
df['Borough_Code']=B
df['Neighborhood']=C

## Result of scraping as new Pandas dataframe

In [135]:
df

Unnamed: 0,Postal Code,Borough_Code,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


## Dropping rows where Borough is "Not assigned"
## First checking how many rows have Borough "Not assigned" -- here 77 out of 180

In [138]:
df.groupby(['Borough_Code']).size()

Borough_Code
Central Toronto
      9
Downtown Toronto
    19
East Toronto
         5
East York
            5
Etobicoke
           12
Mississauga
          1
North York
          24
Not assigned
        77
Scarborough
         17
West Toronto
         6
York
                 5
dtype: int64

In [139]:
df = df[~df.Borough_Code.str.contains("Not assigned")]

In [140]:
df

Unnamed: 0,Postal Code,Borough_Code,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


## Use the agg() function from Panda’s dataFrame objects to combine lines with same Postal Code and concatenate the neighborhood string values together

In [141]:
df.groupby(['Postal Code','Borough_Code'])['Neighborhood'].apply(','.join).reset_index()
df

Unnamed: 0,Postal Code,Borough_Code,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


## Checking for any remaining "Not assigned" values in Neighborhood

In [142]:
df.loc[df.Neighborhood == 'Not assigned']

Unnamed: 0,Postal Code,Borough_Code,Neighborhood


In [143]:
df.shape

(103, 3)