# Segmenting and Clustering Neighborhoods in Toronto (part 2)

**Applied Data Science Capstone (week 3)**

In [72]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

The Wikipedia page listing the Toronto postal codes (those starting with M) will be downloaded.

In [73]:
# Source page: list of Canadian postal codes beginning with "M".
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
wikipedia_page = requests.get(wikipedia_link, timeout=30)

# Fail loudly on a 4xx/5xx response instead of parsing an error page as the table.
wikipedia_page.raise_for_status()

The dataframe will be created.
- Only the cells that have an assigned borough will be processed.
- If a cell has a borough but a 'Not assigned' neighborhood, the neighborhood will be the same as the borough.

In [74]:
# Define the dataframe columns (one scraped table row = one value per column).
column_names = ['PostalCode', 'Borough', 'Neighborhood']

bs = BeautifulSoup(wikipedia_page.text, 'html.parser')

# Only the first 867 <td> cells (289 groups of three) are read, as before.
# NOTE(review): presumably this is where the postal-code table ended when the
# page was scraped -- confirm against the current page layout.
MAX_TABLE_CELLS = 867
CELLS_PER_ROW = len(column_names)

# strip() removes the trailing '\n' of a cell without cutting off the last
# character when a cell happens to contain no newline at all.
cell_texts = [cell.getText().strip() for cell in bs.select("td")[:MAX_TABLE_CELLS]]

# Collect the rows in a plain list and build the dataframe once; growing a
# dataframe row by row is quadratic and DataFrame.append no longer exists in
# pandas 2.x.
records = []
for start in range(0, len(cell_texts) - CELLS_PER_ROW + 1, CELLS_PER_ROW):
    postal_code, borough, neighborhood = cell_texts[start:start + CELLS_PER_ROW]

    # Ignore cells with a borough that is Not assigned.
    if borough == 'Not assigned':
        continue

    # If the neighborhood is not assigned, it will be the same as the borough.
    if neighborhood == 'Not assigned':
        neighborhood = borough

    records.append({
        'PostalCode': postal_code,
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

# Passing `columns` keeps the expected layout even when no row was collected.
df = pd.DataFrame(records, columns=column_names)

A new dataframe will be created.
- More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma.
- Latitude and Longitude columns will be added. 

In [75]:
# Distinct postal codes, ordered by how many rows each one has in `df`
# (this fixes the row order of the combined dataframe).
PostalCodeList = df['PostalCode'].value_counts().index

num_df = df.shape[0]
num_PostalCode = len(PostalCodeList)

# Collapse all rows of a postal code into one in a single pass:
#   - Borough: the value of the last row seen for that postal code;
#   - Neighborhood: all names joined with ', ', most recently seen first
#     (the same order the previous row-by-row prepending produced).
# This avoids chained-indexing assignment (df_new['col'][row] = ...), which
# raises SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
combined = df.groupby('PostalCode', sort=False).agg({
    'Borough': 'last',
    'Neighborhood': lambda names: ', '.join(reversed(names.tolist())),
})

# Restore the value_counts ordering and a clean 0..n-1 integer index.
df_new = (
    combined
    .reindex(PostalCodeList)
    .rename_axis('PostalCode')
    .reset_index()[column_names]
)

# Adding new columns (placeholders, filled in from the coordinates file later)
df_new['Latitude'] = '0'
df_new['Longitude'] = '0'

The Latitude and Longitude columns will be filled in from the `Geospatial_Coordinates.csv` file.

In [77]:
# The coordinates file is expected next to the notebook (current working directory).
CurrentDirectory = os.getcwd()
CSVdata_path = os.path.join(CurrentDirectory, "Geospatial_Coordinates.csv")
csv_input = pd.read_csv(filepath_or_buffer=CSVdata_path)

# Lookup table keyed by postal code. If a code is listed more than once the
# last entry wins, matching the overwrite order of a top-to-bottom scan.
coordinates = (
    csv_input
    .drop_duplicates(subset='Postal Code', keep='last')
    .set_index('Postal Code')
)

# Vectorised lookup over the *whole* CSV, assigned column-wise rather than
# through chained indexing (df_new['col'][row] = ...), which is unreliable.
# Postal codes missing from the CSV end up as NaN.
df_new['Latitude'] = df_new['PostalCode'].map(coordinates['Latitude'])
df_new['Longitude'] = df_new['PostalCode'].map(coordinates['Longitude'])

df_new.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M8Y,Etobicoke,"Sunnylea, Royal York South East, The Queensway...",43.6363,-79.4985
1,M9V,Etobicoke,"Thistletown, South Steeles, Silverstone, Mount...",43.7394,-79.5884
2,M5V,Downtown Toronto,"South Niagara, Railway Lands, King and Spadina...",43.6289,-79.3944
3,M4V,Central Toronto,"Summerhill West, South Hill, Rathnelly, Forest...",43.6864,-79.4
4,M8Z,Etobicoke,"South of Bloor, Royal York South West, The Que...",43.6288,-79.521
5,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov...",43.6509,-79.5547
6,M6M,York,"Silverthorn, Mount Dennis, Keelesdale, Del Ray",43.6911,-79.476
7,M9C,Etobicoke,"Old Burnhamthorpe, Markland Wood, Eringate, Bl...",43.6435,-79.5772
8,M9R,Etobicoke,"St. Phillips, Richview Gardens, Martin Grove G...",43.6889,-79.5547
9,M1V,Scarborough,"Steeles East, Milliken, L'Amoreaux East, Aginc...",43.8153,-79.2846
