# Import Pandas and Numpy

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

# Parsing HTML Source Code to Form the Required Table Using lxml, requests, and Beautiful Soup

In [2]:
# Creating DataFrame for the table found in the URL below
from lxml import html
import requests
from bs4 import BeautifulSoup



# Getting a request from the URL and making a BeautifulSoup object for the HTML source code
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(page.text, 'html.parser')

# Creating BeautifulSoup objects for the table for the DataFrame
soup_table = soup.find("table", class_="wikitable sortable").find_all('tr')

soup_header = soup_table[0].find_all('th')

# Creating a table for the DataFrame
table = np.empty((len(soup_table), len(soup_header))).astype('str')


for row_index, row in enumerate(soup_table):
    for column_index, column in enumerate(soup_header):
        table[row_index] = list(text for text in soup_table[row_index].stripped_strings)
        
table = table.transpose()
        
# Creating the DataFrame
df = pd.DataFrame({table[0][0]: table[0][1:],
                   table[1][0]: table[1][1:],
                   table[2][0]: table[2][1:]})
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Removing Rows from Table with 'Not assigned' Values in 'Borough' Column

In [3]:
df.drop(df.loc[df['Borough']=='Not assigned'].index, inplace=True)
df = df.reset_index(drop=True)
df.head(11)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


# Showing that there are NO Duplicate 'Postal Code' Values, but there ARE Duplicate 'Neighborhood' Values, but NO 'Not assigned' 'Neighborhood' Values

In [4]:
# This code shows that there are no duplicates in Postal Code and so we do not need to group different neighborhoods of the same Postal Code
print('Number of rows: ' + str(df.shape[0]))
print('Number of unique Postal Code labels: ' + str(df.groupby('Postal Code').ngroups))

# However, there are duplicate Neighborhoods, so we need to be sure to group by both the Postal Code and Neighborhood when trying to group by Neighborhoods.
print('Number of unique Neighborhood labels: ' + str(df.groupby('Neighborhood').ngroups))

# This code shows that there are no 'Not assigned' values in the Neighborhood column and so we do not need to assign these to any value in the Borough column
df.loc[df['Neighborhood'] == 'Not assigned']

Number of rows: 103
Number of unique Postal Code labels: 103
Number of unique Neighborhood labels: 99


Unnamed: 0,Postal Code,Borough,Neighborhood


# Getting Latitude/Longitude Data from given csv File since geocoder was not working

In [5]:
# Read lat, lng data from csv file since geocoder was not functioning
lat_lng_df = pd.read_csv('http://cocl.us/Geospatial_data')

toronto_data = df.merge(lat_lng_df)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial",43.662301,-79.389494


# Keeping only the Data for Boroughs that contain 'Toronto' in their Name

In [8]:
# Keep only data for the Boroughs that end in 'Toronto'.
toronto_data = toronto_data[toronto_data['Borough'].str.contains("Toronto")]
toronto_data = toronto_data.reset_index(drop=True)
toronto_data.head(11)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


# Checking for Duplicate Neighborhoods

In [7]:
# This code shows that there are no duplicates in Postal Code and so we do not need to group different neighborhoods of the same Postal Code
print('Number of rows: ' + str(toronto_data.shape[0]))
print('Number of unique Postal Code labels: ' + str(toronto_data.groupby('Postal Code').ngroups))

# However, there are duplicate Neighborhoods, so we need to be sure to group by both the Postal Code and Neighborhood when trying to group by Neighborhoods.
print('Number of unique Neighborhood labels: ' + str(toronto_data.groupby('Neighborhood').ngroups))


Number of rows: 39
Number of unique Postal Code labels: 39
Number of unique Neighborhood labels: 39
