# Web Parsing for City of Toronto Supervised Injection Sites and Needle Drop Boxes

In [None]:
# Install the utils package
!pip install utils

Import Required Packages

In [106]:
# Import standard-library and 3rd party libraries
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pylab as plt

# Import local libraries
import utils

# Configure Notebook
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False

### Web parsing supervised injection sites

Get the HTML for the website url

In [107]:
# City of Toronto page that lists the supervised injection service sites.
url = "https://www.toronto.ca/community-people/health-wellness-care/health-programs-advice/supervised-injection-services/"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Raise an HTTPError on a 4xx/5xx reply instead of silently parsing an error page later on.
response.raise_for_status()

# Display the response object (shows the status code) as the cell output.
response

<Response [200]>

Parse the HTML object

In [108]:
# Build a navigable parse tree from the downloaded page using Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features="html.parser")

View the HTML entries in the table

In [109]:
# Collect every <td> element on the page; despite its name, `table` is a flat list of cells.
table = soup.find_all(name="td")

# Peek at the first two cells to see the raw markup.
print(table[:2])

[<td><strong>Fred Victor*</strong><br/>
139 Jarvis St.<br/>
(Queen Street East/Jarvis Street)<br/>
Phone: 416-644-3081</td>, <td>Monday, 8 a.m. to 10 p.m. <em>last call </em><br/>
Tuesday to Sunday, 7:30 a.m. to 7 p.m. <em>last call </em><em> </em></td>]


View one cell.

In [110]:
# Plain-text rendering of the first cell: text nodes joined by a space, outer whitespace trimmed.
first_cell_text = table[0].get_text(separator=" ").strip()
print(first_cell_text)

Fred Victor* 
139 Jarvis St. 
(Queen Street East/Jarvis Street) 
Phone: 416-644-3081


### Parse the strings

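We want to extract two things from each cell in the first column: the location name (the first line of the cell) and the street address (normally the second line, or the third line when the second line holds a program name such as "The Works" or "KeepSIX*").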

In [111]:
# Convert every <td> to human-readable text: the text nodes are joined with a space and the
# outer whitespace is trimmed. Line breaks inside a cell are kept so it can be split into lines.
text_cell = [cell.get_text(separator=" ").strip() for cell in table]

# The cells alternate between two columns; the even positions hold the name and location
# information. Split each of those cells into its individual lines.
location_info_cell = [text.split("\n") for text in text_cell[0::2]]

# Program names that can occupy the 2nd line of a cell, pushing the street address to the 3rd line.
NOISE_PREFIXES = ("The Works", "KeepSIX*")


def _extract_address(lines):
    """Return the street address found in the lines of one location cell.

    Parameters
    ----------
    lines : list of str
        The lines of a first-column cell; the first line is the location name.

    Returns
    -------
    str
        The text of the 2nd and 3rd lines up to the first "(", with any leading
        program name from NOISE_PREFIXES removed and outer whitespace trimmed.
        An empty string is returned when the cell has no line after the name.
    """
    # Slicing (instead of lines[1] + lines[2]) tolerates cells with fewer than three lines.
    candidate = "".join(lines[1:3])

    # The intersection is written in parentheses after the address; keep only what precedes it.
    candidate = candidate.split("(")[0].strip()

    # Remove a leading program name as a whole prefix. str.strip("The Works") would instead
    # strip any of those *characters* from both ends and could eat into the address itself
    # (for example a trailing "e" or "s").
    for prefix in NOISE_PREFIXES:
        if candidate.startswith(prefix):
            candidate = candidate[len(prefix):].strip()
    return candidate


# Raw 2nd + 3rd lines of each cell (two of the cells have the address on the 3rd line).
address_list = ["".join(lines[1:3]) for lines in location_info_cell]

# Street addresses with the intersection and program names removed.
address_list_cleaned = [_extract_address(lines) for lines in location_info_cell]

# The location name is the first line of the cell; drop the trailing "*" footnote marker.
final_loc_names = [lines[0].strip().strip("*").strip() for lines in location_info_cell]

# Create the DataFrame
injection_site_locations_df = pd.DataFrame({
    "Location Name": final_loc_names,
    "Address": address_list_cleaned
})

# View DataFrame
injection_site_locations_df.head()

Unnamed: 0,Location Name,Address
0,Fred Victor,139 Jarvis St.
1,Moss Park CTS,134 Sherbourne St.
2,Parkdale Queen West Community Health Centre,168 Bathurst St.
3,Parkdale Queen West Community Health Centre,1229 Queen St. W.
4,"Regent Park Bevel Up CTS Site, Regent Park Com...",465 Dundas St. E.


# Part 2: Web Parsing for Needle Drop Box Locations

Using the same method from above

In [112]:
# Download the needle drop box locations page and parse it with BeautifulSoup.
# NOTE: this rebinds `url`, `response` and `soup` from the injection-site section above,
# so re-running the earlier cells after this one requires re-fetching that page first.
url = "https://www.toronto.ca/community-people/health-wellness-care/health-programs-advice/harm-reduction-supplies-and-locations/needle-drop-box-locations-in-toronto/"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# Raise an HTTPError on a 4xx/5xx reply instead of parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

In [113]:
# Gather every <td> element of the page into a flat list of cells.
table_data = soup.find_all(name="td")

# Preview the first two cells.
print(table_data[:2])

[<td>The Works</td>, <td><p>277 Victoria St.</p>
<p>Victoria St. &amp; Dundas St. E.</p>
</td>]


View the properties of the HTML

In [114]:
# Count how many <td> cells were collected.
n_cells = len(table_data)
print(n_cells)

86


In [115]:
# Print the raw markup of the first three cells, each followed by a blank line.
for cell in table_data[:3]:
    print(cell, end="\n\n")

<td>The Works</td>

<td><p>277 Victoria St.</p>
<p>Victoria St. &amp; Dundas St. E.</p>
</td>

<td>Casey House</td>



Each entry is a single cell. The 86 cells form a table of 43 rows by 2 columns, so the list alternates between the Title (location name) column and the Description (address) column.

### Parse the strings

We want to parse and extract every other entry in table_data starting from index 0 for location name, and 1 for address.

In [116]:
# Render each cell as plain text (text nodes joined by a space, outer whitespace trimmed)
# and break it into its individual lines.
text_cell = [td.get_text(separator=" ").strip().split("\n") for td in table_data]

# Even positions are the name column: keep the first line of each of those cells.
location_name = [lines[0] for lines in text_cell[0::2]]

# Odd positions are the address column: keep the first line, turn non-breaking
# spaces into ordinary spaces and trim the result.
address = [lines[0].replace("\xa0", " ").strip() for lines in text_cell[1::2]]

# Assemble the two columns into a DataFrame.
needle_dropbox_df = pd.DataFrame({"Location Name": location_name, "Address": address})

# Show the first rows as the cell output.
needle_dropbox_df.head()

Unnamed: 0,Location Name,Address
0,The Works,277 Victoria St.
1,Casey House,119 Isabella St.
2,Dixon Hall,2714 Danforth Ave.
3,Eva's Satelite,25 Canterbury Place
4,Fred Victor,145 Queen St. E.


Save the DataFrames as CSV files.

In [117]:
# Write each DataFrame to its own CSV file, leaving out the index column.
for frame, csv_path in (
    (injection_site_locations_df, "injection_site_locations.csv"),
    (needle_dropbox_df, "needle_dropbox_locations.csv"),
):
    frame.to_csv(csv_path, index=False)