## __Segmenting_and_Clustering_Neighborhoods_in_Toronto_Assignment_Notebook2__

## Part 1: Extract data from Wikipedia, data wrangling and cleaning

In [11]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Scrape the Wikipedia page and extract the Toronto postal-code table.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from silently parsing an HTTP error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

table = soup.find('table', attrs={'class': "wikitable sortable"})
if table is None:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise ValueError('No "wikitable sortable" table found at ' + url)
table_row = table.find_all('tr')

# Collect the non-empty <td> texts of every row; rows that yield no text
# (e.g. a row without <td> cells) are skipped.
res = []
for tr in table_row:
    cell_texts = [cell.text.strip() for cell in tr.find_all('td')]
    row = [text for text in cell_texts if text]
    if row:
        res.append(row)

df = pd.DataFrame(res, columns=["PostalCode", "Borough", "Neighborhood"])

# Keep only the rows whose Borough is assigned (original index labels are preserved).
df = df[df['Borough'] != 'Not assigned'].copy()

# Combine rows sharing a PostalCode/Borough pair into one row, with the
# neighborhoods joined by a comma.
df_clean = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()

# Where a row has a borough but a "Not assigned" neighborhood, use the borough
# as the neighborhood. A single .loc assignment writes into df_clean itself;
# chained indexing (df_clean['Neighborhood'][n] = ...) may write to a temporary
# copy and is ignored under pandas copy-on-write.
unassigned = df_clean['Neighborhood'] == 'Not assigned'
df_clean.loc[unassigned, 'Neighborhood'] = df_clean.loc[unassigned, 'Borough']

# Print the number of rows in the dataframe
print('Total number of rows in the dataframe is', df_clean.shape[0])

Total number of rows in the dataframe is 103


## Part 2: Merging the dataframe based on postal code

#### Change the directory to access the coordinate file

In [21]:
import os

# Folder expected to hold the coordinate file read by the next cell.
# Set the CAPSTONE_PROJECT_DIR environment variable to point somewhere else;
# the default is the path this notebook was originally written against.
project_dir = os.environ.get("CAPSTONE_PROJECT_DIR", "/Users/kean/Desktop/Capstone_Project")

# Only change directory when the folder exists, so the notebook can still run
# from its own folder on machines that do not have this path.
if os.path.isdir(project_dir):
    os.chdir(project_dir)

#### Merge the dataframe by the PostalCode

In [22]:
# Load the coordinate table and mirror its 'Postal Code' column under the
# 'PostalCode' name used by df_clean, so both frames share a join key.
df_coordinate = pd.read_csv('Geospatial_Coordinates.csv').assign(
    PostalCode=lambda coords: coords['Postal Code']
)

# Join on the shared key (inner join by default), then discard the now
# redundant 'Postal Code' column that came along from the coordinate table.
df_merge = df_clean.merge(df_coordinate, on='PostalCode').drop(columns='Postal Code')
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
