# Part 1 getting the City of Toronto data from Wikipedia

Install the beautiful soup module

In [13]:
!pip install bs4

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


Handle the imports

In [14]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Build the URL

In [15]:
# create the Wikipedia URL
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [None]:
Get the web page contents

In [16]:
wikidata = requests.get(url).text

Create a Beautiful Soup object containing the wikipedia data

In [17]:
soup = BeautifulSoup(wikidata,"html5lib")

create a list of the html tables in the BeautifulSoup object

In [18]:
table_list = soup.find_all('table')
table_index = -1
for index,table in enumerate(table_list):
    if ("Toronto" in str(table)):
        table_index = index

if (table_index == -1):
    print("Table not found")
else:
    print("The table index we require is ", table_index)

The table index we require is  0


create a dictionary to hold the burough and neighborhood data from the table identified above 

In [19]:
toronto_neighborhoods=[]        # empty dictionary
table=table_list[table_index]

Data Cleansing:Iterate through the table. Ignore cells where the row contains 'Not assigned'

In [20]:

for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3] # postal code contains 3 characters
        cell['Borough'] = (row.span.text).split('(')[0]    # get the borough
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        toronto_neighborhoods.append(cell)



Create a dataframe containing the Toronto Neighborhood data scraped from the Wikipedia table
Further cleanse the borough column 

In [21]:
# print(table_contents)
df=pd.DataFrame(toronto_neighborhoods)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest',
                                             'East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


Use the .shape method to print the number of rows in the dataframe

In [22]:
df.shape

(103, 3)

# Part 2 - Getting Latitude/Longitude coordinates for the Toronto postal codes

Use the provided csv (I had difficulty with the geocoder package) to create a dataframe with 5 columns: PostalCode, Borough, Neighborhood, Latitude, Longitude

<b>First get the csv file using wget</b>

In [28]:
!wget -O Geospatial_Coordinates.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv

--2021-07-14 20:43:47--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 198.23.119.245
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|198.23.119.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2788 (2.7K) [text/csv]
Saving to: ‘Geospatial_Coordinates.csv’


2021-07-14 20:43:47 (33.2 MB/s) - ‘Geospatial_Coordinates.csv’ saved [2788/2788]



<b>Use Pandas read_csv to read the provided data file</b>

In [39]:
coord_df = pd.read_csv("Geospatial_Coordinates.csv")
coord_df.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


<b>Rename "Postal Code" to have the same as name the column in the data frame from part 1</b>

In [None]:
coord_df.rename(columns={'Postal Code': 'PostalCode'})

<b> Merge the two dataframes on the postal code column </b>

In [85]:
dfToronto = df.merge(coord_df, left_on='PostalCode', right_on='Postal Code')

<b>Drop the redundant "Postal Code" column</b>


In [86]:
dfToronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,M7A,43.662301,-79.389494
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M8X,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,M7Y,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M8Y,43.636258,-79.498509



<b>Drop the redundant "Postal Code" column</b>


In [87]:
dfToronto.drop(columns=['Postal Code'])

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


<b>Display rows 0 through 11 and ensure they match the instructions</b>

In [88]:
dfToronto.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,M7A,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
7,M3B,North York,Don Mills North,M3B,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",M4B,43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937
