In [1]:
import pandas as pd
import numpy as np

# This notebook is for the IBM Data Science Capstone Project by Kasey Chang started 2020-02-12 1436

In [2]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


In [3]:
#install beautifulsoup4 if needed
#!pip install beautifulsoup4

### Set webpage source and import that content into scraped 

### then filter with BeautifulSoup with html parser, find the first table with class wikitable, using [0]

In [4]:
source_html="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

from bs4 import BeautifulSoup
import requests

res = requests.get(source_html)

scraped = res.content

soup = BeautifulSoup(scraped,"html.parser")

table=soup.findAll("table",{"class":"wikitable"})[0]

### We use this table parser to look for the table headers TH and use them as column names, make sure to use strip to take out misc \n and such

### Then we look for the table contents by looking for TR and TD, also use strip to clean up the misc \n that can be found

In [5]:
            n_columns = 0
            n_rows=0
            column_names = []
    
            # Find number of rows and columns
            # we also find the column titles if we can
            for row in table.find_all('tr'):
                
                # Determine the number of rows in the table
                td_tags = row.find_all('td')
                if len(td_tags) > 0:
                    n_rows+=1
                    if n_columns == 0:
                        # Set the number of columns for our table
                        n_columns = len(td_tags)
                        
                # Handle column names if we find them
                th_tags = row.find_all('th') 
                if len(th_tags) > 0 and len(column_names) == 0:
                    for th in th_tags:
                        column_names.append(th.get_text().strip())
    
            # Safeguard on Column Titles
            if len(column_names) > 0 and len(column_names) != n_columns:
                raise Exception("Column titles do not match the number of columns")
    
            columns = column_names if len(column_names) > 0 else range(0,n_columns)
            df = pd.DataFrame(columns = columns,
                              index= range(0,n_rows))
            row_marker = 0
            for row in table.find_all('tr'):
                column_marker = 0
                columns = row.find_all('td')
                for column in columns:
                    df.iat[row_marker,column_marker] = column.get_text().strip()
                    column_marker += 1
                if len(columns) > 0:
                    row_marker += 1
                    
                       
                

In [6]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### We look for any cases where Neighbourhood is 'Not assigned' but we have a 'Borough' value

In [7]:
dfx=df.loc[(df['Neighbourhood']=='Not assigned') & (df['Borough'] != 'Not assigned')]
dfx

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M9A,Queen's Park,Not assigned


### We fix this special case

In [8]:
#dfx.index[0]
df.at[dfx.index[0],'Neighbourhood']=dfx.at[dfx.index[0],'Borough']


### And we verified that's fixed by looking at 9

In [9]:
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Queen's Park


### Now we can drop the rows that still have Borough == "Not assigned"

In [10]:
#df2.drop_duplicates(subset="Neighbourhood",keep=False,inplace=True)
df.drop(df[df.Borough=="Not assigned"].index,inplace=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### We can now group the neighbourhoods that share the same Borough and Postcode

In [11]:
df = df.groupby(['Postcode','Borough']).agg(','.join)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [12]:
df.reset_index()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [13]:
df.shape

(103, 1)