# Peer Graded Assignment   
## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Import necessary modules
from io import StringIO

import requests
import lxml.html as lh
from bs4 import BeautifulSoup
import pandas as pd

## Extract the table from the website using BeautifulSoup and load its content into a DataFrame.  
### Comments in the body of the code explain how the DataFrame is cleaned and formatted in line with the assignment instructions.

In [2]:
# Download the Wikipedia page of Canadian postal codes starting with "M" and
# isolate the first <table> element on the page.
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
res = requests.get(website_url, timeout=30)  # bounded wait instead of hanging forever
res.raise_for_status()  # stop on an HTTP error rather than parsing an error page
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]

# Parse the table into a dataframe.
# - StringIO: passing literal HTML text to read_html is deprecated in recent pandas.
# - header=0: use the table's first row as the column headings, whether or not
#   pandas already recognised it as a header row (no manual row promotion needed).
df = pd.read_html(StringIO(str(table)), header=0)[0]

# Remove rows with a 'Not assigned' Borough. The .copy() makes the result an
# independent frame so the assignment below cannot trigger SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

# Where the Neighbourhood is 'Not assigned', fall back to the Borough name.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']
df = df.reset_index(drop=True)

# Combine the neighbourhoods that share a postal code into one comma-separated
# string (keeping the first Borough seen for that code), then rename the columns
# to the headings described in the assignment instructions.
df2 = (
    df.groupby('Postcode')
      .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
      .reset_index()
      .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
)

# Select the columns explicitly so their order is guaranteed to be
# PostalCode, Borough, Neighborhood.
df2_1 = df2[['PostalCode', 'Borough', 'Neighborhood']]
df2_1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [3]:
# Report the (rows, columns) dimensions of df2_1, the cleaned postal-code table
df2_1.shape

(103, 3)

In [4]:
# Read the CSV file with the latitude and longitude coordinates for each postal code
# into dataframe df3, then preview its first rows
url = 'https://cocl.us/Geospatial_data'
df3 = pd.read_csv(url)
df3.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:
# Rename df3's 'Postal Code' column to 'PostalCode' so it matches the key column
# of df2_1. Renaming by name (rather than overwriting all column labels by
# position) cannot mislabel data if the CSV column order ever changes, and it is
# a harmless no-op when this cell is re-run.
df3 = df3.rename(columns={'Postal Code': 'PostalCode'})

# Merge df2_1 and df3 on PostalCode. pd.merge defaults to an inner join, so only
# postal codes present in both tables are kept.
df_merge = pd.merge(df2_1, df3, on='PostalCode')
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [6]:
# Report the (rows, columns) dimensions of the merged table; compare the row count
# with df2_1.shape to see whether the merge dropped any postal codes
df_merge.shape

(103, 5)