In [1]:
import json
import time
import random
from collections import deque
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options


# Setup Chrome options
# No arguments are added to `options`, so the browser is started with
# Selenium's default Chrome configuration.
options = Options()

# Setup Chrome and WebDriver
# The value returned by ChromeDriverManager().install() is handed to Service;
# presumably this is the path of a matching chromedriver binary (confirm
# against the webdriver_manager docs).
service = Service(ChromeDriverManager().install())
# `driver` is a module-level object: crawl() uses it directly as a global and
# it is closed with driver.quit() once the crawl has been started below.
driver = webdriver.Chrome(service=service, options=options)

In [2]:

def load_state():
    """Restore crawler progress from the files in the working directory.

    Reads ``visited.jsonl`` (one JSON object per line; only its ``"url"``
    field is used) and then ``queue.jsonl`` (one URL per line, surrounding
    whitespace stripped).

    Returns:
        tuple: ``(visited_urls, url_queue)`` where ``visited_urls`` is a
        ``set`` of URL strings and ``url_queue`` is a ``deque`` of URL
        strings in file order.

    Both files are read under a single ``FileNotFoundError`` guard, and
    ``visited.jsonl`` is opened first. Consequently a missing
    ``visited.jsonl`` yields an empty set *and* an empty queue (the queue
    file is never opened), while a missing ``queue.jsonl`` still returns
    the visited URLs that were already read.
    """
    seen = set()
    pending = deque()
    try:
        with open('visited.jsonl', 'r') as visited_file:
            seen.update(json.loads(record)["url"] for record in visited_file)
        with open('queue.jsonl', 'r') as queue_file:
            pending.extend(entry.strip() for entry in queue_file)
    except FileNotFoundError:
        # A missing state file is not an error: keep whatever was loaded
        # before the failed open and start from there.
        pass
    return seen, pending

def save_state(url_queue):
    """Overwrite ``queue.jsonl`` with the pending URLs, one per line.

    Args:
        url_queue: Iterable of URL strings; written in iteration order.

    The file is opened in write mode, so any previous content is replaced.
    """
    with open('queue.jsonl', 'w') as queue_file:
        queue_file.writelines(pending_url + '\n' for pending_url in url_queue)

def crawl(start_url, allowed_prefix="https://leg.mt.gov/bills/mca"):
    """Crawl pages breadth-first with the module-level ``driver``.

    Progress is restored with ``load_state()``. Every fetched page is
    appended to the visited log via ``save_html()`` and the pending queue is
    persisted with ``save_state()`` after each page, so an interrupted crawl
    can be resumed by calling this function again.

    Args:
        start_url: URL to seed the queue with. It is only enqueued when no
            visited URLs were restored (i.e. on a fresh crawl).
        allowed_prefix: Only links whose ``href`` starts with this string
            are followed. Defaults to the Montana Code Annotated tree.

    Returns:
        None. The function returns once the queue is exhausted.
    """
    visited_urls, url_queue = load_state()
    if not visited_urls:
        url_queue.append(start_url)

    # A queue restored from disk may contain repeated URLs; drop them while
    # keeping first-seen order, and track pending URLs in a set so a link is
    # never enqueued twice. Without this the queue (and the file rewritten by
    # save_state after every page) grows with one entry per *link occurrence*.
    url_queue = deque(dict.fromkeys(url_queue))
    queued_urls = set(url_queue)

    # The implicit wait is a session-level WebDriver setting that governs
    # element lookups, so it is configured once instead of after every get().
    driver.implicitly_wait(15)

    while url_queue:
        current_url = url_queue.popleft()
        if current_url in visited_urls:
            continue
        visited_urls.add(current_url)

        # Note: no delay is applied between consecutive requests.
        driver.get(current_url)
        save_html(current_url, driver.page_source)

        for link in driver.find_elements(By.TAG_NAME, 'a'):
            href = link.get_attribute('href')
            if not href or not href.startswith(allowed_prefix):
                continue
            if href in visited_urls or href in queued_urls:
                continue
            queued_urls.add(href)
            url_queue.append(href)

        save_state(url_queue)  # Persist the queue after processing each URL

def save_html(url, html):
    """Append one crawled page to ``visited.jsonl`` as a single JSON line.

    Args:
        url: Address of the page; stored under the ``"url"`` key.
        html: Page source; stored under the ``"html"`` key.
    """
    with open('visited.jsonl', 'a') as visited_file:
        record = json.dumps({"url": url, "html": html})
        visited_file.write(f"{record}\n")

# Start crawling from the initial page. The browser is closed in a `finally`
# clause so that it is shut down even when the crawl raises (network error,
# KeyboardInterrupt, ...); otherwise driver.quit() would be skipped and the
# browser session left running.
try:
    crawl("https://leg.mt.gov/bills/mca/index.html")
finally:
    # Close the browser
    driver.quit()