In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from selenium.webdriver.chrome.options import Options
import random
import time
from webdriver_manager.chrome import ChromeDriverManager
from googleapiclient.discovery import build
import json

driver_path = 'chromedriver.exe'
SCOPES = ["https://www.googleapis.com/auth/spreadsheets"]
sheet_path = "sheets-5e3cf7f4981a.json"
sheet_url = "https://docs.google.com/spreadsheets/d/"
website_url = "https://urbantoronto.ca/database/projects/"

In [6]:
credentials = ServiceAccountCredentials.from_json_keyfile_name(sheet_path, SCOPES)

gc = gspread.authorize(credentials)

sheet = gc.open_by_url(sheet_url).sheet1

service = Service(driver_path)
driver = webdriver.Chrome(service=service)

available_ports = [random.randint(4444, 9999) for _ in range(10)]  # 10 random ports

chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Open Chrome in maximized mode

def get_chromedriver_service(port):
    """
    Creates a Service object with a random port from the available list
    """
    available_ports.remove(port)  # Remove used port from list
    return Service(driver_path, port=port)

# Function to scrape the specific data from the project page
def scrape_project_data(driver, sheet, row_number):
    # Wait for the page to load completely
    time.sleep(3)

    # Fetch the first div with class "content-text" (ignore other instances)
    content_text = driver.find_elements(By.CSS_SELECTOR, "div.content-text")[0]

    # Create an empty dictionary to store the project data
    project_data = {}

    # Extract data by searching for the heading and then retrieving the subsequent "project-details"
    try:
        project_data["Address"] = content_text.find_element(By.XPATH, ".//span[contains(text(),'Address')]/following-sibling::span").text
    except:
        project_data["Address"] = None

    try:
        project_data["Category"] = content_text.find_element(By.XPATH, ".//span[contains(text(),'Category')]/following-sibling::span").text
    except:
        project_data["Category"] = None

    try:
        project_data["Status"] = content_text.find_element(By.XPATH, ".//span[contains(text(),'Status')]/following-sibling::span").text
    except:
        project_data["Status"] = None

    try:
        project_data["Number of Buildings"] = content_text.find_element(By.XPATH, ".//span[contains(text(),'Number of Buildings')]/following-sibling::span").text
    except:
        project_data["Number of Buildings"] = None

    try:
        project_data["Storeys"] = content_text.find_element(By.XPATH, ".//span[contains(text(),'Storeys')]/following-sibling::span").text
    except:
        project_data["Storeys"] = None

    try:
        project_data["Number of Units"] = content_text.find_element(By.XPATH, ".//span[contains(text(),'Number of Units')]/following-sibling::span").text
    except:
        project_data["Number of Units"] = None

    try:
        project_data["Developer"] = content_text.find_element(By.XPATH, ".//span[contains(text(),'Developer')]/following-sibling::span/a").text
    except:
        project_data["Developer"] = None

    # Update the Google Sheet with the scraped data
    sheet.update_cell(row_number, 2, project_data["Address"])
    sheet.update_cell(row_number, 3, project_data["Category"])
    sheet.update_cell(row_number, 4, project_data["Status"])
    sheet.update_cell(row_number, 5, project_data["Number of Buildings"])
    sheet.update_cell(row_number, 6, project_data["Storeys"])
    sheet.update_cell(row_number, 7, project_data["Number of Units"])
    sheet.update_cell(row_number, 8, project_data["Developer"])

# Main scraping flow
def main_scraping_flow():
    global available_ports

    # Use a random port for the first driver.get
    service = get_chromedriver_service(random.choice(available_ports))
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Load cookies from file
        with open('UT_cookies.json', 'r') as file:
            cookies = json.load(file)

        for cookie in cookies:
            if 'expiry' in cookie:
                del cookie['expiry']
            if 'sameSite' in cookie and cookie['sameSite'] not in ["Strict", "Lax", "None"]:
                del cookie['sameSite']
            try:
                driver.add_cookie(cookie)
            except Exception as e:
                print(f"Error adding cookie: {cookie['name']}, Error: {e}")

        driver.get(website_url)
        time.sleep(10)  # Allow time for the page to load fully

        # Get all the project URLs using the href inside the <a> tag within the list
        project_links = driver.find_elements(By.XPATH, '//*[@id="database_list"]/table[1]/tbody/tr/td/table/tbody/tr/td[2]/div[1]/a')

        # Loop through the project links and get the href attribute (which is the project URL)
        for project_link in project_links:
            try:
                # Check if there are available ports left
                if not available_ports:
                    raise Exception("No more available ports")

                # Use a new random port from the list
                service = get_chromedriver_service(random.choice(available_ports))
                new_driver = webdriver.Chrome(service=service, options=chrome_options)

                project_url = project_link.get_attribute("href")

                # Find the next empty row in the Google Sheet
                next_row = len(sheet.get_all_values()) + 1

                # Store the URL in the 'URLs' column of the Google Sheet
                sheet.update_cell(next_row, 1, project_url)

                new_driver.get(project_url)
                time.sleep(10)

                # Call the scrape_project_data function using the new driver
                scrape_project_data(new_driver, sheet, next_row)

                # Quit the temporary driver after scraping
                new_driver.quit()

            except Exception as e:
                print(f"Error occurred for a project: {e}")

    finally:
        driver.quit()  # Quit the main driver

# Call the main scraping flow
main_scraping_flow()

Error adding cookie: __eoi, Error: Message: invalid cookie domain
  (Session info: chrome=128.0.6613.120)
Stacktrace:
	GetHandleVerifier [0x00007FF7AECEB632+29090]
	(No symbol) [0x00007FF7AEC5E6E9]
	(No symbol) [0x00007FF7AEB1AFF9]
	(No symbol) [0x00007FF7AEBCB414]
	(No symbol) [0x00007FF7AEB966EA]
	(No symbol) [0x00007FF7AEBB65D9]
	(No symbol) [0x00007FF7AEB96493]
	(No symbol) [0x00007FF7AEB609B1]
	(No symbol) [0x00007FF7AEB61B11]
	GetHandleVerifier [0x00007FF7AF00883D+3294125]
	GetHandleVerifier [0x00007FF7AF054423+3604371]
	GetHandleVerifier [0x00007FF7AF04A2E7+3563095]
	GetHandleVerifier [0x00007FF7AEDA6F16+797318]
	(No symbol) [0x00007FF7AEC6986F]
	(No symbol) [0x00007FF7AEC65454]
	(No symbol) [0x00007FF7AEC655E0]
	(No symbol) [0x00007FF7AEC54A7F]
	BaseThreadInitThunk [0x00007FFEE2B87374+20]
	RtlUserThreadStart [0x00007FFEE4ABCC91+33]

Error adding cookie: __gads, Error: Message: invalid cookie domain
  (Session info: chrome=128.0.6613.120)
Stacktrace:
	GetHandleVerifier [0x00007F