## Imports

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

import csv

## Scraping URLs with Selenium

In [5]:
# Collect the job-posting links from every page of a Dice search
# (USA, full-time, posted within the last seven days).
#
# The cell walks the paginated result list until one of the stop conditions
# is met (results container missing, no job cards, "Next" button disabled or
# absent) and leaves the unique links in the `job_hrefs` set for later cells.

# Search-results URL template; "{}" is filled in with the 1-based page number.
base_url = "https://www.dice.com/jobs?location=USA&latitude=37.09024&longitude=-95.712891&countryCode=US&locationPrecision=Country&radius=30&radiusUnit=mi&page={}&pageSize=20&filters.postedDate=SEVEN&filters.employmentType=FULLTIME&language=en"

# Set up Chrome WebDriver options.
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run without a visible browser window
chrome_options.add_argument("--incognito")  # Enable incognito mode

# NOTE(review): machine-specific absolute path; anyone else running this
# notebook has to point it at their own chromedriver executable.
chrome_driver_path = r"C:\Users\sarvin.farhad\Data-Science\Projects\EquiJob\task-1-data-collection\chromedriver.exe"

# Create a WebDriver instance
service = ChromeService(executable_path=chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

job_hrefs = set()  # Unique job hrefs gathered across all pages

# try/finally guarantees the browser process is shut down even if a page
# load or element lookup raises part-way through the crawl.
try:
    # One wait object (15 s timeout) is enough; it is reused for every page.
    wait = WebDriverWait(driver, 15)

    page_number = 1
    while True:
        url = base_url.format(page_number)
        print(f"Visiting page {page_number}: {url}")

        # Navigate to the URL
        driver.get(url)

        # Wait for the results container; if it never shows up, treat that as
        # having run past the last page.
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="searchDisplay-div"]/div[3]/dhi-search-cards-widget/div')))
        except TimeoutException:
            print(f"No more pages, stopping on page {page_number}. Total job hrefs: {len(job_hrefs)}")
            break

        # Find all job-title links (elements with the class 'card-title-link').
        # A container without any cards would otherwise raise an unhandled
        # TimeoutException here, so it is handled as another stop condition.
        try:
            child_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'card-title-link')))
        except TimeoutException:
            print(f"No job cards found on page {page_number}, stopping. Total job hrefs: {len(job_hrefs)}")
            break

        # Extract the href attribute from each link; the set drops duplicates.
        for element in child_elements:
            href = element.get_attribute('href')
            if href:
                job_hrefs.add(href)

        # Check whether the "Next" button is disabled
        try:
            next_button = driver.find_element(By.CLASS_NAME, 'pagination-next')
        except NoSuchElementException:
            print("Next button not found. Exiting.")
            break

        # get_attribute() may return None; fall back to "" so the membership
        # test cannot raise TypeError.
        if "disabled" in (next_button.get_attribute("class") or ""):
            # If the "Next" button is disabled, it means there are no more pages
            print(f"Reached the last page, stopping on page {page_number}. Total job hrefs: {len(job_hrefs)}")
            break

        # Move on to the next page
        page_number += 1
finally:
    # Close the webdriver
    driver.quit()

# Print the scraped job hrefs
for href in job_hrefs:
    print(href)


Visiting page 1: https://www.dice.com/jobs?location=USA&latitude=37.09024&longitude=-95.712891&countryCode=US&locationPrecision=Country&radius=30&radiusUnit=mi&page=1&pageSize=20&filters.postedDate=SEVEN&filters.employmentType=FULLTIME&language=en
Visiting page 2: https://www.dice.com/jobs?location=USA&latitude=37.09024&longitude=-95.712891&countryCode=US&locationPrecision=Country&radius=30&radiusUnit=mi&page=2&pageSize=20&filters.postedDate=SEVEN&filters.employmentType=FULLTIME&language=en
Visiting page 3: https://www.dice.com/jobs?location=USA&latitude=37.09024&longitude=-95.712891&countryCode=US&locationPrecision=Country&radius=30&radiusUnit=mi&page=3&pageSize=20&filters.postedDate=SEVEN&filters.employmentType=FULLTIME&language=en
Visiting page 4: https://www.dice.com/jobs?location=USA&latitude=37.09024&longitude=-95.712891&countryCode=US&locationPrecision=Country&radius=30&radiusUnit=mi&page=4&pageSize=20&filters.postedDate=SEVEN&filters.employmentType=FULLTIME&language=en
Visiting

## Save Scraped URLs

In [6]:
# Persist the scraped links to a one-column CSV file.
#
# Reads the `job_hrefs` set built by the scraping cell. The links are sorted
# because set iteration order for strings can differ between kernel sessions;
# sorting makes the file identical across re-runs on the same data.
job_hrefs_list = sorted(job_hrefs)

# Define the CSV file name
csv_file_name = "dice_scraped_7_days_full_time.csv"

# newline='' is what the csv module expects for files it writes to; the
# explicit encoding keeps the output independent of the platform default.
with open(csv_file_name, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Link'])  # Header row
    writer.writerows([href] for href in job_hrefs_list)  # One link per row

print(f"CSV file '{csv_file_name}' has been saved with {len(job_hrefs_list)} links.")

CSV file 'dice_scraped_7_days_full_time.csv' has been saved with 10000 links.
