# Webscraping

In [32]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time

The 2023 (current year) CBI Shift information is available to download as csv from a publicly available dashboard. However, the dashboard format of the past year CBI data does not feature a convenient method to export the data. Therefore, I used webscraping to obtain the 2021 and 2022 shift details data. 

In [33]:
# The url for the 'past years' CBI data
url = 'https://maine.maps.arcgis.com/apps/dashboards/414d5af751e742ecb2e14796d6a1401a'
res = requests.get(url)

In [34]:
res

<Response [200]>

In [35]:
# The full source code for the website was not retrieved as much of the page was dynamically loaded Java script, not just html
print(res.content)

b'<!DOCTYPE html>\r\n<html>\r\n  <head>\r\n    <meta charset="utf-8" />\r\n    <meta http-equiv="X-UA-Compatible" content="IE=edge" />\r\n    <title>ArcGIS Dashboards</title>\r\n    <meta name="description" content="ArcGIS Dashboards" />\r\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\r\n    <meta http-equiv="Content-Security-Policy" content="" />\r\n    <link rel="icon" href="assets/images/arcgis-dashboards-16-57c7bdca757e2544acb67f661775b0ba.png" type="image/png" />\r\n\r\n    \n<link href="assets/arcgis-core/themes/dark/main.css" rel="stylesheet" type="text/css" data-theme="api-dark">\n<link href="assets/arcgis-core/themes/light/main.css" rel="stylesheet" type="text/css" data-theme="api-light">\r\n    <link rel="stylesheet" href="assets/vendor-2b39fb725719f7dbda5c70713940b9f6.css" />\r\n    <link rel="stylesheet" href="assets/app-3b56db0b8be782feb677676e31752c83.css" />\r\n    <script src="assets/before-amd-c876bbabc07a117e73a6b5e74c4dd779.js"></script>

In [36]:
soup = BeautifulSoup(res.content)

In [53]:
# I used selenium to interact with the webpage

url = 'https://maine.maps.arcgis.com/apps/dashboards/414d5af751e742ecb2e14796d6a1401a'

# Use a web driver (in this case, Chrome) to load the dynamic content
driver = webdriver.Chrome()
driver.get(url)

time.sleep(30)


In [43]:
# Get the page source after it has been fully loaded
page_source1 = driver.page_source

In [275]:
# Closed the browser
driver.quit()

Originally, I wanted to create a loop or loops that would pull the values I need from two tables on the webpage. Only 1 observation is displayed in the table at a time and there are buttons on the dashboard to access the next observation. 


**On the dashboard I would need to select a date range in this piece:**
<span class="text-ellipsis" title="Select Date or Date Range">      Select Date or Date Range    </span>  </div>\   <div class="font-light dashboard-secondary-text-color flex">      <span class="text-ellipsis" title="4/1/2021 - 10/31/2022">

    

**I would need to select the organization to be this:**
<span class="text-ellipsis" title="FOCW (Friends of the Cobbossee Watershed)">
        FOCW (Friends of the Cobbossee Watershed)
      </span>

**In the Shift DETAILS Table:**
<div class="caption">
        <p style="text-align:center"><span style="color:#004da8"><strong>SHIFT DETAILS</strong></span></p>

      </div>
<p style="text-align:center"><span style="color:#004da8"><strong>SHIFT DETAILS</strong></span></p>
<strong>SHIFT DETAILS</strong>

**From here:**
<table class="esri-widget__table" summary="List of attributes and values"><tbody><tr><th class="esri-feature-fields__field-header">AGENCY</th><td class="esri-feature-fields__field-data">Friends of the Cobbossee Watershed (FOCW)</td></tr><tr><th class="esri-feature-fields__field-header">SITE_NAME</th><td class="esri-feature-fields__field-data">East Winthrop Cobbosseecontee</td></tr><tr><th class="esri-feature-fields__field-header">DATE</th><td class="esri-feature-fields__field-data">09/25/2022</td></tr><tr><th class="esri-feature-fields__field-header">DATE1</th><td class="esri-feature-fields__field-data esri-feature-fields__field-data--date">September 25, 2022</td></tr><tr><th class="esri-feature-fields__field-header">DAY</th><td class="esri-feature-fields__field-data">Sun</td></tr><tr><th class="esri-feature-fields__field-header">INSPECTOR_ID</th><td class="esri-feature-fields__field-data">5648</td></tr><tr><th class="esri-feature-fields__field-header">PAY_VOL</th><td class="esri-feature-fields__field-data">Paid</td></tr><tr><th class="esri-feature-fields__field-header">START_SHIFT</th><td class="esri-feature-fields__field-data">07:00</td></tr><tr><th class="esri-feature-fields__field-header">END_SHIFT</th><td class="esri-feature-fields__field-data">17:00</td></tr><tr><th class="esri-feature-fields__field-header">INSPECTIONS</th><td class="esri-feature-fields__field-data">Yes</td></tr><tr><th class="esri-feature-fields__field-header">TOTALINSP</th><td class="esri-feature-fields__field-data">17</td></tr><tr><th class="esri-feature-fields__field-header">NUMINVASIVE</th><td class="esri-feature-fields__field-data">0</td></tr></tbody></table>

<th class="esri-feature-fields__field-header">AGENCY</th>
<td class="esri-feature-fields__field-data">Friends of the Cobbossee Watershed (FOCW)</td>


**I need to create a loop that will retrieve the following information:**

site_name: <th class="esri-feature-fields__field-header">SITE_NAME</th>
date: <th class="esri-feature-fields__field-header">DATE</th>
day: <th class="esri-feature-fields__field-header">DAY</th>
inspector_id: <th class="esri-feature-fields__field-header">INSPECTOR_ID</th>
paid: <th class="esri-feature-fields__field-header">PAY_VOL</th>
start_shift: <th class="esri-feature-fields__field-header">START_SHIFT</th>
end_shift: <th class="esri-feature-fields__field-header">END_SHIFT</th>
inspections: <th class="esri-feature-fields__field-header">INSPECTIONS</th>
total_insp: <th class="esri-feature-fields__field-header">TOTALINSP</th>
numinvasive: <th class="esri-feature-fields__field-header">NUMINVASIVE</th>

**And store it in a dictionary**

item = {'SITE_NAME': site_name, 'DATE1': date, 'DAY': day, 'START_SHIFT': start_shift, 'END_SHIFT': end_shift, 'INSPECTOR_ID': inspector_id, 'PAID': paid, 'TOTALINSP': total_insp, 'NUMINVASIVE': numinvasive}


In [None]:
url = 'your_url_here'
date_range = "4/1/2021 - 10/31/2022"
organization = "FOCW (Friends of the Cobbossee Watershed)"


scrape_page(url, start_date, end_date, organization)

In [None]:
# From Tim

url =
driver.get(url)
time.sleep(10)

td class 

In [51]:
from selenium.webdriver.common.by import By

In [66]:
# Can also load the page, tell it to wait, click the buttons i need then use the loop to keep clicking buttons

driver = webdriver.Chrome()
driver.get(url)
time.sleep(15)
print("Done sleeping")

# Manually click the date range and org

Done sleeping


In [55]:
# loop through each page
source = driver.page_source
soup = BeautifulSoup(source)

In [62]:
#soup

In [63]:
data = []

In [156]:
source = driver.page_source
soup = BeautifulSoup(source)

site_name = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='SITE_NAME').find_next('td').text.strip()
date = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DATE').find_next('td').text.strip()
day = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DAY').find_next('td').text.strip()
inspector_id = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTOR_ID').find_next('td').text.strip()
paid = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='PAY_VOL').find_next('td').text.strip()
start_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='START_SHIFT').find_next('td').text.strip()
end_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='END_SHIFT').find_next('td').text.strip()
inspections = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTIONS').find_next('td').text.strip()
total_insp = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='TOTALINSP').find_next('td').text.strip()
numinvasive = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='NUMINVASIVE').find_next('td').text.strip()

# Create a dictionary
item = {
    'SITE_NAME': site_name,
    'DATE': date,
    'DAY': day,
    'INSPECTOR_ID': inspector_id,
    'PAID': paid,
    'START_SHIFT': start_shift,
    'END_SHIFT': end_shift,
    'INSPECTIONS': inspections,
    'TOTALINSP': total_insp,
    'NUMINVASIVE': numinvasive
}

data.append(item)


In [152]:
df = pd.DataFrame(data)

In [154]:
df.shape

(79, 10)

In [153]:
df.tail()

Unnamed: 0,SITE_NAME,DATE,DAY,INSPECTOR_ID,PAID,START_SHIFT,END_SHIFT,INSPECTIONS,TOTALINSP,NUMINVASIVE
74,Launch Drive,08/07/2021,Sat,5105,Paid,07:00,13:00,Yes,60,
75,Wilson Pond Road,08/07/2021,Sat,5386,Paid,13:00,19:00,Yes,14,
76,East Winthrop,08/07/2021,Sat,3504,Paid,07:00,17:00,Yes,23,
77,Whippoorwill Road,08/07/2021,Sat,4771,Paid,13:00,19:00,Yes,75,
78,Augusta West Kampground,08/07/2021,Sat,4769,Paid,14:00,20:00,Yes,12,


In [162]:
sites = []

In [158]:
driver = webdriver.Chrome()
driver.get(url)
time.sleep(15)
print("Done sleeping")

# Manually click the date range and org

Done sleeping


In [163]:
# Once the window opends, I manually selected Date Range of 4/1/2021 - 8/15/2021 and Organization FOCW
# 490 observations within those filters

time.sleep(2)

#During the 2 sections I active an auto clicker to click the button on the table in the window

# Number of times I want this code to run
num_iterations=490

for _ in range(num_iterations):
    # Your scraping code
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')

    #Retrieve the features I need
    site_name = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='SITE_NAME').find_next('td').text.strip()
    date = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DATE').find_next('td').text.strip()
    day = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DAY').find_next('td').text.strip()
    inspector_id = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTOR_ID').find_next('td').text.strip()
    paid = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='PAY_VOL').find_next('td').text.strip()
    start_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='START_SHIFT').find_next('td').text.strip()
    end_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='END_SHIFT').find_next('td').text.strip()
    inspections = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTIONS').find_next('td').text.strip()
    total_insp = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='TOTALINSP').find_next('td').text.strip()
    numinvasive = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='NUMINVASIVE').find_next('td').text.strip()

    # Create a dictionary of these features
    item = {
        'SITE_NAME': site_name,
        'DATE': date,
        'DAY': day,
        'INSPECTOR_ID': inspector_id,
        'PAID': paid,
        'START_SHIFT': start_shift,
        'END_SHIFT': end_shift,
        'INSPECTIONS': inspections,
        'TOTALINSP': total_insp,
        'NUMINVASIVE': numinvasive
    }
    
    # Append this observation to the list of shifts
    sites.append(item)
    
    # Convert to a dataframe
    sites_df=pd.DataFrame(sites)
    
    # Use a printout to check the progress as the code runs
    print(sites_df.shape)

    # Wait for 3 seconds before the next iteration
    time.sleep(3)

# Print or process the collected data as needed
print(done)

(1, 10)
(2, 10)
(3, 10)
(4, 10)
(5, 10)
(6, 10)
(7, 10)
(8, 10)
(9, 10)
(10, 10)
(11, 10)
(12, 10)
(13, 10)
(14, 10)
(15, 10)
(16, 10)
(17, 10)
(18, 10)
(19, 10)
(20, 10)
(21, 10)
(22, 10)
(23, 10)
(24, 10)
(25, 10)
(26, 10)
(27, 10)
(28, 10)
(29, 10)
(30, 10)
(31, 10)
(32, 10)
(33, 10)
(34, 10)
(35, 10)
(36, 10)
(37, 10)
(38, 10)
(39, 10)
(40, 10)
(41, 10)
(42, 10)
(43, 10)
(44, 10)
(45, 10)
(46, 10)
(47, 10)
(48, 10)
(49, 10)
(50, 10)
(51, 10)
(52, 10)
(53, 10)
(54, 10)
(55, 10)
(56, 10)
(57, 10)
(58, 10)
(59, 10)
(60, 10)
(61, 10)
(62, 10)
(63, 10)
(64, 10)
(65, 10)
(66, 10)
(67, 10)
(68, 10)
(69, 10)
(70, 10)
(71, 10)
(72, 10)
(73, 10)
(74, 10)
(75, 10)
(76, 10)
(77, 10)
(78, 10)
(79, 10)
(80, 10)
(81, 10)
(82, 10)
(83, 10)
(84, 10)
(85, 10)
(86, 10)
(87, 10)
(88, 10)
(89, 10)
(90, 10)
(91, 10)
(92, 10)
(93, 10)
(94, 10)
(95, 10)
(96, 10)
(97, 10)
(98, 10)
(99, 10)
(100, 10)
(101, 10)
(102, 10)
(103, 10)
(104, 10)
(105, 10)
(106, 10)
(107, 10)
(108, 10)
(109, 10)
(110, 10)
(111, 10

NameError: name 'done' is not defined

In [164]:
sites_df.tail()

Unnamed: 0,SITE_NAME,DATE,DAY,INSPECTOR_ID,PAID,START_SHIFT,END_SHIFT,INSPECTIONS,TOTALINSP,NUMINVASIVE
485,Thorofare Rd,05/29/2021,Sat,4944,Paid,07:00,17:00,Yes,11,
486,Whippoorwill Road,05/28/2021,Fri,4174,Paid,12:00,18:00,Yes,13,
487,Augusta West Kampground,05/28/2021,Fri,4769,Paid,12:00,18:00,Yes,1,
488,East Winthrop,05/28/2021,Fri,4485,Paid,12:00,18:00,Yes,2,
489,Launch Drive,05/28/2021,Fri,4771,Paid,12:00,18:00,Yes,33,


In [166]:
sites_df.drop_duplicates()

Unnamed: 0,SITE_NAME,DATE,DAY,INSPECTOR_ID,PAID,START_SHIFT,END_SHIFT,INSPECTIONS,TOTALINSP,NUMINVASIVE
0,Augusta West Kampground,08/15/2021,Sun,4769,Paid,08:00,18:00,Yes,13,
1,Launch Drive,08/15/2021,Sun,3796,Paid,07:00,17:00,Yes,95,
2,Old Kents Hill Road,08/15/2021,Sun,5006,Paid,13:00,19:00,Yes,4,
3,Wilson Pond Road,08/15/2021,Sun,4174,Paid,13:00,19:00,Yes,15,
4,Rt 41 North Basin Maranacook,08/15/2021,Sun,5353,Paid,13:00,19:00,Yes,32,
...,...,...,...,...,...,...,...,...,...,...
485,Thorofare Rd,05/29/2021,Sat,4944,Paid,07:00,17:00,Yes,11,
486,Whippoorwill Road,05/28/2021,Fri,4174,Paid,12:00,18:00,Yes,13,
487,Augusta West Kampground,05/28/2021,Fri,4769,Paid,12:00,18:00,Yes,1,
488,East Winthrop,05/28/2021,Fri,4485,Paid,12:00,18:00,Yes,2,


In [179]:
sites_df.to_csv('data/shift_details1.csv', index= False)

In [249]:
# Followed the same steps as above but selected date range: 8/1/2021 - 7/1/2022 (485 shifts)

time.sleep(7)

num_iterations=500

for _ in range(num_iterations):
    # Your scraping code
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')

    site_name = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='SITE_NAME').find_next('td').text.strip()
    date = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DATE').find_next('td').text.strip()
    day = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DAY').find_next('td').text.strip()
    inspector_id = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTOR_ID').find_next('td').text.strip()
    paid = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='PAY_VOL').find_next('td').text.strip()
    start_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='START_SHIFT').find_next('td').text.strip()
    end_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='END_SHIFT').find_next('td').text.strip()
    inspections = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTIONS').find_next('td').text.strip()
    total_insp = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='TOTALINSP').find_next('td').text.strip()
    numinvasive = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='NUMINVASIVE').find_next('td').text.strip()

    # Create a dictionary
    item = {
        'SITE_NAME': site_name,
        'DATE': date,
        'DAY': day,
        'INSPECTOR_ID': inspector_id,
        'PAID': paid,
        'START_SHIFT': start_shift,
        'END_SHIFT': end_shift,
        'INSPECTIONS': inspections,
        'TOTALINSP': total_insp,
        'NUMINVASIVE': numinvasive
    }

    sites.append(item)
    
    sites_df=pd.DataFrame(sites)
    #print(sites_df.shape)

    # Wait for 3 seconds before the next iteration
    time.sleep(1.5)

sites_df.drop_duplicates()
# Print or process the collected data as needed
print(sites_df.shape)
print('done')

(1591, 10)


NameError: name 'done' is not defined

In [260]:
sites_df.drop_duplicates().shape

(1323, 10)

In [259]:
sites_df.drop_duplicates(inplace=True)

In [231]:

sites_df.tail()

Unnamed: 0,SITE_NAME,DATE,DAY,INSPECTOR_ID,PAID,START_SHIFT,END_SHIFT,INSPECTIONS,TOTALINSP,NUMINVASIVE
1175,East Winthrop Cobbosseecontee,06/27/2022,Mon,5573,Paid,14:00,19:00,Yes,3,0
1176,Launch Drive Cobbosseecontee,06/27/2022,Mon,5581,Paid,13:00,19:00,Yes,1,0
1177,East Winthrop Cobbosseecontee,06/27/2022,Mon,5586,Paid,07:00,11:00,No,0,0
1178,Launch Drive Cobbosseecontee,06/27/2022,Mon,5637,Volunteer,05:00,09:00,Yes,28,0
1179,Launch Drive Cobbosseecontee,06/27/2022,Mon,5580,Paid,07:00,11:40,Yes,1,0


In [245]:
# Ran this just once instead of in for loop for pages of the table that were missed due to timining issues

source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')

site_name = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='SITE_NAME').find_next('td').text.strip()
date = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DATE').find_next('td').text.strip()
day = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DAY').find_next('td').text.strip()
inspector_id = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTOR_ID').find_next('td').text.strip()
paid = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='PAY_VOL').find_next('td').text.strip()
start_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='START_SHIFT').find_next('td').text.strip()
end_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='END_SHIFT').find_next('td').text.strip()
inspections = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTIONS').find_next('td').text.strip()
total_insp = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='TOTALINSP').find_next('td').text.strip()
numinvasive = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='NUMINVASIVE').find_next('td').text.strip()

    # Create a dictionary
item = {
    'SITE_NAME': site_name,
    'DATE': date,
    'DAY': day,
    'INSPECTOR_ID': inspector_id,
    'PAID': paid,
    'START_SHIFT': start_shift,
    'END_SHIFT': end_shift,
    'INSPECTIONS': inspections,
    'TOTALINSP': total_insp,
    'NUMINVASIVE': numinvasive
    }

sites.append(item)

In [254]:
sites_df=pd.DataFrame(sites)

In [272]:
sites_df.drop_duplicates(inplace=True)

In [273]:
sites_df.shape

(1642, 10)

In [221]:
sites_df.head()

Unnamed: 0,SITE_NAME,DATE,DAY,INSPECTOR_ID,PAID,START_SHIFT,END_SHIFT,INSPECTIONS,TOTALINSP,NUMINVASIVE
0,Augusta West Kampground,08/15/2021,Sun,4769,Paid,08:00,18:00,Yes,13,
1,Launch Drive,08/15/2021,Sun,3796,Paid,07:00,17:00,Yes,95,
2,Old Kents Hill Road,08/15/2021,Sun,5006,Paid,13:00,19:00,Yes,4,
3,Wilson Pond Road,08/15/2021,Sun,4174,Paid,13:00,19:00,Yes,15,
4,Rt 41 North Basin Maranacook,08/15/2021,Sun,5353,Paid,13:00,19:00,Yes,32,


In [223]:
sites_df.reset_index(drop=True, inplace=True)

In [224]:
sites_df.head()

Unnamed: 0,SITE_NAME,DATE,DAY,INSPECTOR_ID,PAID,START_SHIFT,END_SHIFT,INSPECTIONS,TOTALINSP,NUMINVASIVE
0,Augusta West Kampground,08/15/2021,Sun,4769,Paid,08:00,18:00,Yes,13,
1,Launch Drive,08/15/2021,Sun,3796,Paid,07:00,17:00,Yes,95,
2,Old Kents Hill Road,08/15/2021,Sun,5006,Paid,13:00,19:00,Yes,4,
3,Wilson Pond Road,08/15/2021,Sun,4174,Paid,13:00,19:00,Yes,15,
4,Rt 41 North Basin Maranacook,08/15/2021,Sun,5353,Paid,13:00,19:00,Yes,32,


In [264]:
# if code throws up an error due to the time it took to load a page, check the tail to see where it left off
sites_df.tail()

Unnamed: 0,SITE_NAME,DATE,DAY,INSPECTOR_ID,PAID,START_SHIFT,END_SHIFT,INSPECTIONS,TOTALINSP,NUMINVASIVE
2221,Lakeside Marina Cobbosseecontee,08/27/2022,Sat,5669,Paid,08:45,10:30,Yes,3,0
2222,Lakeside Marina Cobbosseecontee,08/27/2022,Sat,5657,Paid,12:50,12:51,Yes,1,0
2223,Augusta West Kampground,08/27/2022,Sat,4174,Paid,08:00,18:00,Yes,8,0
2224,Lakeside Marina Cobbosseecontee,08/27/2022,Sat,5652,Paid,20:08,20:09,Yes,1,0
2225,Launch Drive Cobbosseecontee,08/27/2022,Sat,5637,Volunteer,04:58,07:05,Yes,13,0


In [258]:
# After selecting date range: 7/2/2022 - 8/10/2022 (485 shifts)

time.sleep(7)

num_iterations= 500

print('start')
for _ in range(num_iterations):
    # Your scraping code
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')

    site_name = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='SITE_NAME').find_next('td').text.strip()
    date = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DATE').find_next('td').text.strip()
    day = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DAY').find_next('td').text.strip()
    inspector_id = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTOR_ID').find_next('td').text.strip()
    paid = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='PAY_VOL').find_next('td').text.strip()
    start_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='START_SHIFT').find_next('td').text.strip()
    end_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='END_SHIFT').find_next('td').text.strip()
    inspections = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTIONS').find_next('td').text.strip()
    total_insp = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='TOTALINSP').find_next('td').text.strip()
    numinvasive = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='NUMINVASIVE').find_next('td').text.strip()

    # Create a dictionary
    item = {
        'SITE_NAME': site_name,
        'DATE': date,
        'DAY': day,
        'INSPECTOR_ID': inspector_id,
        'PAID': paid,
        'START_SHIFT': start_shift,
        'END_SHIFT': end_shift,
        'INSPECTIONS': inspections,
        'TOTALINSP': total_insp,
        'NUMINVASIVE': numinvasive
    }

    sites.append(item)
    
    sites_df=pd.DataFrame(sites)
    #print(sites_df.shape)

    # Wait for 3 seconds before the next iteration
    time.sleep(3)

sites_df.drop_duplicates(inplace=True)
# Print or process the collected data as needed
print(sites_df.shape)
print('done')

start
(2063, 10)
done


In [270]:
driver = webdriver.Chrome()
driver.get(url)

In [271]:
# After selecting date range: 7/28/2022 - 12/31/2022 (485 shifts)

time.sleep(7)

num_iterations= 500

print('start')
for _ in range(num_iterations):
    # Your scraping code
    source = driver.page_source
    soup = BeautifulSoup(source, 'html.parser')

    site_name = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='SITE_NAME').find_next('td').text.strip()
    date = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DATE').find_next('td').text.strip()
    day = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='DAY').find_next('td').text.strip()
    inspector_id = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTOR_ID').find_next('td').text.strip()
    paid = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='PAY_VOL').find_next('td').text.strip()
    start_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='START_SHIFT').find_next('td').text.strip()
    end_shift = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='END_SHIFT').find_next('td').text.strip()
    inspections = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='INSPECTIONS').find_next('td').text.strip()
    total_insp = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='TOTALINSP').find_next('td').text.strip()
    numinvasive = soup.find('th', {'class': 'esri-feature-fields__field-header'}, string='NUMINVASIVE').find_next('td').text.strip()

    # Create a dictionary
    item = {
        'SITE_NAME': site_name,
        'DATE': date,
        'DAY': day,
        'INSPECTOR_ID': inspector_id,
        'PAID': paid,
        'START_SHIFT': start_shift,
        'END_SHIFT': end_shift,
        'INSPECTIONS': inspections,
        'TOTALINSP': total_insp,
        'NUMINVASIVE': numinvasive
    }

    sites.append(item)
    
    sites_df=pd.DataFrame(sites)
    #print(sites_df.shape)

    # Wait for 3 seconds before the next iteration
    time.sleep(3)

sites_df.drop_duplicates(inplace=True)
# Print or process the collected data as needed
print(sites_df.shape)
print('done')

start
(2586, 10)
done


In [None]:
driver = webdriver.Chrome()
driver.get(url)