This notebook presents a solution to the Neighborhood Segmentation project outlined in Week 3 of the IBM Applied Data Science Capstone Course on Coursera.

Created 5/21/19 by K Fullerton

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from bs4 import BeautifulSoup
import requests, json

# Web Scraping to Collect Neighborhood Data

## Create the page object

In [2]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url_to_scrape = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
page = requests.get(url_to_scrape, timeout=30)
# Fail here, with a clear HTTP error, rather than parsing an error page later.
page.raise_for_status()
page

<Response [200]>

`<Response [200]>` indicates that the page was requested successfully and the page object was created.

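## Create the lists in which to collect the scraped data
Three lists hold the postal codes, boroughs, and neighborhoods. They are combined into a Pandas dataframe in the next step, where the .head() command is used to check its format.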

In [3]:
# Empty containers, one per table column, to be filled while scraping.
code_list, borough_list, neighborhood_list = [], [], []

## Use Beautiful Soup to scrape the table data into the pandas dataframe

In [4]:
# Parse the downloaded HTML and walk the table rows it contains.
soup = BeautifulSoup(page.content, 'html.parser')

# Start from fresh lists so that re-running this cell does not append a second
# copy of every row to lists left over from a previous run.
code_list, borough_list, neighborhood_list = [], [], []

# NOTE(review): the 1:287 bounds are hard-coded for the page layout at the time
# of writing -- presumably row 0 is the header and rows from 287 onward belong
# to other tables on the page; confirm against the live page.
for tr in soup.find_all('tr')[1:287]:
    tds = tr.find_all('td')
    if len(tds) < 3:
        # No code/borough/neighborhood cells to read in this row; skip it
        # instead of failing with an IndexError.
        continue
    # Strip every field so that later exact comparisons (e.g. against
    # 'Not assigned') are not defeated by stray whitespace or newlines.
    code_list.append(tds[0].text.strip())
    borough_list.append(tds[1].text.strip())
    neighborhood_list.append(tds[2].text.strip())

# One (code, borough, neighborhood) tuple per scraped row.
zippedList = list(zip(code_list, borough_list, neighborhood_list))
toronto_data = pd.DataFrame(zippedList, columns=['PostalCode', 'Borough', 'Neighborhood'])

toronto_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


## Clean the dataframe by removing any postal codes that have not been assigned

In [5]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = toronto_data['Borough'].ne('Not assigned')
toronto_data = toronto_data.loc[has_borough]
toronto_data.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## Assign borough name for any not assigned neighborhood names

In [6]:
# Find the indices for rows where Neighborhood is Not assigned
missing_indices = toronto_data[toronto_data.Neighborhood == 'Not assigned']
# Iterate over those indices to replace the not assigned with the borough name
for i, row in missing_indices.iterrows():
    toronto_data.at[i, 'Neighborhood'] = row.Borough
toronto_data.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Check the data frame to make sure that there are no remaining "not assigned" values.

Note that this second check of the "missing indices" should produce an empty data frame: by the time we run it, every not-assigned neighborhood should have been replaced with its borough name.

In [7]:
# Re-select any rows whose neighborhood is still 'Not assigned' and print them.
still_unassigned = toronto_data['Neighborhood'] == 'Not assigned'
missing_indices = toronto_data.loc[still_unassigned]
print(missing_indices)

Empty DataFrame
Columns: [PostalCode, Borough, Neighborhood]
Index: []


## Merge Neighborhood Names for postal codes with multiple neighborhoods listed

In [8]:
def concat_string_values(group, sep=' '):
    """Join the neighborhood names of one group into a single string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; its ``Neighborhood`` attribute is iterated and is
        expected to yield strings.
    sep : str, optional
        Text placed between consecutive names. Defaults to a single space.
        Pass ``', '`` to keep multi-word names distinguishable.

    Returns
    -------
    str
        The names, in row order, joined by ``sep`` with no trailing
        separator; an empty string when the group has no rows.
    """
    # str.join places the separator only between names, so the result does not
    # end in a stray separator the way a "name + ' '" accumulation loop does.
    return sep.join(group.Neighborhood)

# Group the rows that share a postal code and borough, then collapse each
# group's neighborhood names into one string.
rows_by_postal_code = toronto_data.groupby(['PostalCode', 'Borough'])
grouped_data = rows_by_postal_code.apply(concat_string_values)

In [9]:
# Flatten the grouped result back into a plain table and give the column
# labelled 0 (the joined neighborhood names) a descriptive header.
cleaned_toronto_data = (
    grouped_data
    .reset_index()
    .rename(columns={0: 'Neighborhoods'})
)
cleaned_toronto_data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M1B,Scarborough,Rouge Malvern
1,M1C,Scarborough,Highland Creek Rouge Hill Port Union
2,M1E,Scarborough,Guildwood Morningside West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,East Birchmount Park Ionview Kennedy Park
7,M1L,Scarborough,Clairlea Golden Mile Oakridge
8,M1M,Scarborough,Cliffcrest Cliffside Scarborough Village West
9,M1N,Scarborough,Birch Cliff Cliffside West


# Collecting Geographic Data

# Clustering Analysis