# Segmenting and Clustering Neighborhoods in Toronto
## Part 1
Explore and cluster the neighborhoods in Toronto

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get data from Wikipedia

In [7]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
uri = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.get(uri, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page.
response.raise_for_status()
data = response.text

# The page is HTML, so use an HTML parser; the 'xml' parser expects well-formed
# XML and additionally requires lxml to be installed.
soup = BeautifulSoup(data, 'html.parser')

In [8]:
# Take the first <table> element of the parsed page.
table = soup.find('table')
if table is None:
    # Without this guard the failure would be an AttributeError on None below.
    raise ValueError("No <table> element found in the downloaded page")

column_names=['PostalCode','Borough','Neighborhood']

# Collect the stripped text of every <td> in each <tr>; rows that do not have
# exactly one cell per column are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(column_names):
        rows.append(row_data)

# Build the dataframe once from the collected rows instead of appending to it
# row by row, which re-allocates the frame on every insertion.
df = pd.DataFrame(rows, columns=column_names)

In [9]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Ignore rows whose Borough is "Not assigned"

In [10]:
# Keep only the rows whose Borough holds a real value.
has_borough = df["Borough"] != "Not assigned"
df = df.loc[has_borough]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combine Neighborhoods that share the same PostalCode and Borough

In [11]:
# One row per (PostalCode, Borough) pair: the neighborhood names of each group
# are concatenated into a single comma-separated string, then the group keys
# are turned back into ordinary columns.
df = (
    df.groupby(['PostalCode', 'Borough'])
      .agg(', '.join)
      .reset_index()
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If Neighborhood="Not assigned", make the value the same as Borough

In [12]:
# Rows whose Neighborhood is the placeholder "Not assigned" take their Borough
# name instead. The assignment goes through df.loc with a boolean mask because
# writing to the row objects yielded by iterrows() is not guaranteed to update
# the dataframe itself.
unassigned = df["Neighborhood"] == "Not assigned"
df.loc[unassigned, "Neighborhood"] = df.loc[unassigned, "Borough"]

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Show the shape (number of rows and columns) of the dataframe

In [13]:
# Display the dataframe dimensions as a (rows, columns) tuple.
df.shape

(103, 3)

## Part 2
Get the neighborhoods' coordinates

### Load the coordinates from the csv file

In [15]:
import os
import types
from botocore.client import Config
import ibm_boto3

# Stand-in __iter__ implementation; it is bound onto the response body below
# only when that object does not already provide one.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY_ID environment variable so that
# no credential is stored in the notebook. Any key that was previously
# committed in this cell should be considered exposed and be revoked.
cos_api_key = os.environ.get('IBM_COS_API_KEY_ID')
if not cos_api_key:
    raise RuntimeError(
        "Set the IBM_COS_API_KEY_ID environment variable to your "
        "IBM Cloud Object Storage API key before running this cell."
    )

client_1ad8ab0496214129b0ade50eaccc559e = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=cos_api_key,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')

# Fetch the CSV object and keep its streaming body for pandas to read.
body = client_1ad8ab0496214129b0ade50eaccc559e.get_object(Bucket='coursera-donotdelete-pr-vc2achvd5gahjh',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )



In [16]:
# Parse the CSV stream fetched from object storage into a DataFrame
geo_df = pd.read_csv(filepath_or_buffer=body)
geo_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Rename "Postal Code" to "PostalCode" so the key column has the same name as
# in df. rename() returns a new frame that is bound back to geo_df rather than
# mutating the existing object with inplace=True.
geo_df = geo_df.rename(columns={"Postal Code": "PostalCode"})

### Merge the two tables

In [21]:
# Left-join the coordinates onto the neighborhood table using the shared
# "PostalCode" column, so every row of df is kept.
df_merged = pd.merge(df, geo_df, how="left", on="PostalCode")
# Keep the five columns of interest in a fixed order.
df_merged = df_merged.loc[:, ['PostalCode','Borough','Neighborhood','Latitude','Longitude']]
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
