In [30]:
import csv
import logging
import re
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [31]:
# Route ERROR-and-above log records to scraper_errors.log.
logging.basicConfig(
    level=logging.ERROR,
    filename='scraper_errors.log',
)

In [32]:
# Function to search for keywords within text
def search_for_keywords(text):
    """Report whether ``text`` mentions any of the target keywords.

    Matching is case-insensitive and anchored on word boundaries, so
    "Survey" matches but "surveyor" does not. For multi-word keywords any
    run of whitespace between the words is accepted, because text extracted
    from HTML often splits a phrase across line breaks or repeated spaces.

    Args:
        text: Page text to search. ``None`` is treated as empty text.

    Returns:
        bool: True if at least one keyword occurs in ``text``.
    """
    keywords = ["survey", "program assessment", "program evaluation"]
    text = text or ""

    # Debug: Print a snippet of the text being searched
    print("Searching the following text snippet for keywords:")
    print(text[:500])  # Print the first 500 characters of the text

    # Escape each word so keywords are always matched literally, and join the
    # words with \s+ so "program\nassessment" still counts as a hit.
    patterns = [
        r"\b" + r"\s+".join(re.escape(word) for word in kw.split()) + r"\b"
        for kw in keywords
    ]
    found = any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    if found:
        print("Keywords found!")
    else:
        print("No keywords found in this snippet.")

    return found

In [33]:
def scrape_and_get_links(url, timeout=30):
    """Fetch one page, check it for keywords and collect its outgoing links.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single
            unresponsive host would stall the whole crawl.

    Returns:
        tuple: ``(matched_url, links)``. ``matched_url`` is ``url`` when
        ``search_for_keywords`` accepts the page text, otherwise ``None``.
        ``links`` is the list of absolute http(s) URLs taken from the page's
        ``<a href>`` tags, with ``#fragment`` parts removed and duplicates
        dropped (first-seen order is kept). If the download or parsing
        fails, the error is logged and ``(None, [])`` is returned.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checking. It is
        # kept because the original marks it as a temporary workaround;
        # restore verification (or pass a CA bundle) before relying on this.
        response = requests.get(url, verify=False, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of scanning error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        seen = set()
        for anchor in soup.find_all('a', href=True):
            try:
                # Resolve relative hrefs and drop the fragment so that
                # "page#a" and "page#b" are not crawled as separate pages.
                absolute, _fragment = urldefrag(urljoin(url, anchor['href'].strip()))
                scheme = urlparse(absolute).scheme
            except ValueError:
                # A single malformed href should not discard the whole page.
                continue
            # Skip mailto:, javascript:, tel:, ... which cannot be fetched.
            if scheme not in ('http', 'https') or absolute in seen:
                continue
            seen.add(absolute)
            links.append(absolute)

        # Search the page content for keywords
        if search_for_keywords(soup.get_text()):
            print(f"Keywords found on {url}")
            return url, links

        # Return all found links (whether or not the keywords were found)
        return None, links

    except Exception as e:
        # Best-effort crawl: record the failure with its traceback and move on.
        logging.error(f"Failed to scrape {url}: {e}", exc_info=True)
        return None, []

In [34]:
# Main function to start crawling
def crawl_website(landing_page_url, max_depth=100):
    """Breadth-first crawl from ``landing_page_url`` and record keyword hits.

    Each page is scraped at most once, at the shallowest depth at which it
    is reachable. URLs of pages on which ``scrape_and_get_links`` reports a
    keyword match are written to ``scrape_results.csv`` (one ``URL`` column).
    The CSV is written even if the crawl is interrupted, so partial results
    are not lost.

    NOTE(review): links are followed regardless of host, so the crawl is not
    confined to the landing page's site; ``max_depth`` is the only bound.

    Args:
        landing_page_url: URL at which the crawl starts (depth 0).
        max_depth: Largest link distance from the landing page that is
            still scraped.

    Returns:
        None. Progress is printed and results are saved to the CSV file.
    """
    # deque gives O(1) pops from the left; list.pop(0) is O(n) per dequeue.
    queue = deque([(landing_page_url, 0)])  # Store URLs with depth level
    # Every URL ever queued. Deduplicating at enqueue time keeps the queue
    # from filling with repeats of links that appear on many pages; because
    # the traversal is breadth-first, the first sighting of a URL is also
    # its shallowest, so the set of pages scraped is unchanged.
    seen = {landing_page_url}
    results = []

    try:
        while queue:
            current_url, depth = queue.popleft()  # Get the next URL and its depth

            # Only the landing page can fail this check (negative max_depth);
            # deeper pages are never queued past the limit.
            if depth > max_depth:
                continue

            print(f"Scraping: {current_url} at depth {depth}")

            # Scrape the current page and get all links from it
            found_url, links = scrape_and_get_links(current_url)
            if found_url:
                results.append(found_url)  # Add to results if keywords were found

            # Children would exceed the depth limit: do not queue them at all.
            if depth >= max_depth:
                continue

            # Add new links to the queue, incrementing the depth
            for link in links:
                if link not in seen:
                    seen.add(link)
                    queue.append((link, depth + 1))
    finally:
        # Save results to CSV. utf-8 is explicit so non-ASCII URLs do not
        # depend on the platform's default encoding.
        with open('scrape_results.csv', 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["URL"])
            writer.writerows([[url] for url in results])

    print(f"Scraping complete. {len(results)} pages found with keywords.")

In [37]:
# Start the crawl at the health programs landing page, following links at
# most five levels deep.
landing_page_url = "https://vaww.va.gov/health/programs.asp"
crawl_website(landing_page_url=landing_page_url, max_depth=5)

Scraping: https://vaww.va.gov/health/programs.asp at depth 0
Scraping complete. 0 pages found with keywords.
