## Applied Data Science Capstone Project
### Week 3 - Segmenting and Clustering Neighborhoods in Toronto
### Kevin Spradlin
### July 12, 2021

## Step 1 - Webscrape Postal Codes, Boroughs, and Neighborhoods

In [4]:
!pip install bs4
!pip install html5lib
#!pip install requests



In [5]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import pandas as pd

In [6]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

data  = requests.get(url).text

#soup = BeautifulSoup(data,"html5lib")
soup = BeautifulSoup(data,"html.parser")


In [7]:
#print(soup.prettify())

#find all html tables in the web page
tables = soup.find_all('table') # in html table is represented by the tag <table>

len(tables)


3

In [8]:
# find the table with the postal codes
for index,table in enumerate(tables):
    if ("Not assigned" in str(table)):
        table_index = index
print(table_index)

#print(tables[table_index].prettify())


0


In [9]:
# extract the postal codes, boroughs, and neighborhoods into a list
table_contents = []

for row in tables[table_index].findAll("td"):
  cell = {}

  if row.span.text == "Not assigned":
    pass
  else:
    cell["PostalCode"] = row.p.text[:3]
    cell["Borough"] = (row.span.text).split('(')[0]
    cell["Neighborhood"] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
    table_contents.append(cell)

#print(table_contents)


In [10]:
# convert the list with postal codes, boroughs, and neighborhoods into a pandas dataframe
df = pd.DataFrame(table_contents)

#df.Borough.unique()

df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                     'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                     'EtobicokeNorthwest':'Etobicoke Northwest',
                                     'East YorkEast Toronto':'East York/East Toronto',
                                     'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

#df.Borough.unique()


In [11]:
# print the dimensions of the dataframe
df.shape

(103, 3)

### The dataframe has 103 rows (postal codes for boroughs and neighborhoods).

## Step 2 - Get the latitude and longitude of each postal code

In [2]:
!pip install geocoder
import geocoder


Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 768 kB/s 
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 9.1 MB/s 
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Building wheels for collected packages: future
  Building wheel for future (setup.py) ... [?25ldone
[?25h  Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491058 sha256=ee2574d27fb93307dbed5348e98135ecf65bf63e0f7fccb3641e357965012f8f
  Stored in directory: /home/kc/.cache/pip/wheels/8e/70/28/3d6ccd6e315f65f245da085482a2e1c7d14b90b30f239e2cf4
Successfully built future
Installing collected packages: future, ratelim, geocoder
Successfully installed future-0.18.2 geocoder-1.38.1 ratelim-0.1.6


In [13]:
# loop through the rows in the dataframe, putting the combined results into a new dataframe
full_table_contents = []


for index, row in df.iterrows():
  #print(row['PostalCode'])

  cell = {}
  cell["PostalCode"] = row["PostalCode"]  
  cell["Borough"] = row["Borough"]  
  cell["Neighborhood"] = row["Neighborhood"]  


  lat_long_coords = None

  # keep querying until you get coordinates for the postal code
  while lat_long_coords is None:
    geo_query = geocoder.google(f"{row['PostalCode']:s}, Toronto, Ontario")
    lat_long_coords = geo_query.latlng


  cell["Latitude"] = lat_long_coords[0]
  cell["Longitude"] =lat_long_coords[1]

  full_table_contents.append(cell)


full_df = pd.DataFrame(full_table_contents)

full_df.head()


KeyboardInterrupt: 

### Note - I let the above cell run for over a minute and it didn't complete.  So, I created a new code cell (below) to query one postal code.  It ran for two minutes and didn't complete.  I decided to switch to the csv file to get the latitudes and longitudes.

In [17]:
# note - tested geocoder with one postal code.  I didn't get a response after 2 minutes.
lat_long_coords = None

postal_code = "M5A"

while lat_long_coords is None:
  geo_query = geocoder.google("{postal_code:s}, Toronto, Ontario")
  lat_long_coords = geo_query.latlng


print(lat_long_coords)
  

In [22]:
# read the coordinates from the csv file into a dictionary.
lat_long_coords = {}

geodata = open('Geospatial_Coordinates.csv', 'r')

for curr_line in geodata:
  postal_code, latitude, longitude = curr_line.split(',')

  lat_long_coords[postal_code] = (latitude, longitude[:-1], )


geodata.close()


In [23]:
# loop through the rows in the dataframe, putting the combined postal code/borough/neighborhood/latitude and longitude information into a new dataframe
full_table_contents = []


for index, row in df.iterrows():
  cell = {}
  cell["PostalCode"] = row["PostalCode"]  
  cell["Borough"] = row["Borough"]  
  cell["Neighborhood"] = row["Neighborhood"]  

  if row["PostalCode"] in lat_long_coords:
    cell["Latitude"] = lat_long_coords[row["PostalCode"]][0]
    cell["Longitude"] = lat_long_coords[row["PostalCode"]][1]

  full_table_contents.append(cell)


full_df = pd.DataFrame(full_table_contents)

full_df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7532586,-79.3296565
1,M4A,North York,Victoria Village,43.7258823,-79.3155716
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6542599,-79.3606359
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.4647633
4,M7A,Queen's Park,Ontario Provincial Government,43.6623015,-79.3894938


In [24]:
full_df.shape

(103, 5)

### The new dataframe has 103 rows, or the same number as the original dataframe.  So no rows were left out.

## Step 3 - 