In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pathlib
import time
import random
import json
import os

PROJECT_ROOT = pathlib.Path.cwd()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_RAW.mkdir(parents=True, exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Data folder:", DATA_RAW)


Project root: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks
Data folder: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw


In [3]:
def start_driver(headless=False):
    from selenium.webdriver.chrome.options import Options
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
        opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--start-maximized")
    opts.add_argument("--window-size=1200,1000")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    return driver

driver = start_driver(headless=False)   
wait = WebDriverWait(driver, 15)


In [5]:
query = "data+analyst"            
location = "California"          
pages_to_scroll = 3             
url = f"https://www.indeed.com/jobs?q={query}&l={location}"
print("Opening:", url)
driver.get(url)

# Wait a little for the page to settle
time.sleep(2 + random.random()*1.5)


Opening: https://www.indeed.com/jobs?q=data+analyst&l=California


In [6]:
def smart_scroll(driver, scrolls=3, pause_min=1.0, pause_max=2.0):
    body = driver.find_element(By.TAG_NAME, "body")
    for i in range(scrolls):
        # Scroll down a bunch
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(random.uniform(pause_min, pause_max))
        # Try smaller scrolls to load lazy content
        driver.execute_script("window.scrollBy(0, 300);")
        time.sleep(random.uniform(0.5, 1.0))
    # final wait
    time.sleep(1.0 + random.random()*1.5)

smart_scroll(driver, scrolls=pages_to_scroll, pause_min=1.2, pause_max=2.5)


In [7]:
def click_next_pages(driver, n_pages=3):
    for i in range(n_pages-1):  # already loaded page 1
        try:
            next_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[aria-label='Next']")))
            next_btn.click()
            time.sleep(2 + random.random()*1.5)
            smart_scroll(driver, scrolls=2)
        except Exception as e:
            print("Couldn't click next:", e)
            break

click_next_pages(driver, n_pages=3)


Couldn't click next: Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0xc94093
	0xc940d4
	0xa9e71d
	0xaea03d
	0xaea41b
	0xb317f2
	0xb0c954
	0xb2ee17
	0xb0c706
	0xadda30
	0xaded54
	0xf05744
	0xf0091a
	0xcbc322
	0xcac458
	0xcb31dd
	0xc9c408
	0xc9c5cc
	0xc8675a
	0x759f5d49
	0x7742d6db
	0x7742d661



In [8]:
# Wait for job cards to exist (tune the selector if needed)
try:
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.cardOutline, div.job_seen_beacon, div.job")))
except Exception as e:
    print("Timed out waiting for cards:", e)

# get all job card elements (try multiple fallback selectors)
selectors = [
    "div.cardOutline",
    "div.job_seen_beacon",
    "div.job",                # mobile-style blocks
]

cards = []
for sel in selectors:
    found = driver.find_elements(By.CSS_SELECTOR, sel)
    if found:
        cards = found
        print(f"Using selector '{sel}' → found {len(found)} cards")
        break

print("Total job card elements found:", len(cards))

# Save outerHTML of each card into a list
cards_html = []
for i, c in enumerate(cards):
    try:
        html = c.get_attribute("outerHTML")
        cards_html.append(html)
    except Exception as e:
        print("error getting outerHTML for card", i, e)

# Save first job-card outerHTML to a file for inspection
if cards_html:
    sample_file = DATA_RAW / "indeed_first_card.html"
    sample_file.write_text(cards_html[0], encoding="utf-8")
    print("Saved first card outerHTML to:", sample_file)

# Save complete page source too
page_file = DATA_RAW / "indeed_full_page.html"
page_file.write_text(driver.page_source, encoding="utf-8")
print("Saved full page HTML to:", page_file)

# Save all cards as JSON with index
cards_json_file = DATA_RAW / "indeed_cards_outerhtml.json"
with open(cards_json_file, "w", encoding="utf-8") as f:
    json.dump([{"index": i, "outerHTML": h} for i, h in enumerate(cards_html)], f, indent=2)
print("Saved all card outerHTML to:", cards_json_file)

# Print first 1-3 card outerHTML snippets (so you can copy-paste quickly)
for i in range(min(3, len(cards_html))):
    print("\n\n--- CARD", i, "START ---\n")
    print(cards_html[i][:4000])   # print up to first 4000 chars to avoid overwhelming output
    print("\n--- CARD", i, "END ---\n")


Using selector 'div.cardOutline' → found 16 cards
Total job card elements found: 16
Saved first card outerHTML to: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw\indeed_first_card.html
Saved full page HTML to: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw\indeed_full_page.html
Saved all card outerHTML to: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw\indeed_cards_outerhtml.json


--- CARD 0 START ---

<div class="cardOutline tapItem dd-privacy-allow result job_bd03edd8cc350b79 maybeSponsoredJob resultWithShelf sponTapItem desktop hasSection hasSection-default nonRecommendation-section vjs-highlight css-8ftfna eu4oa1w0"><div class="slider_container css-weo834 eu4oa1w0" data-testid="slider_container"><div class="slider_list css-1bej0z4 eu4oa1w0"><div data-testid="slider_item" class="slider_item css-17bghu4 eu4oa1w0"><div data-testid="fade-in-wrapper" class="css-u74ql7 eu4oa1w0"><div class="j

In [9]:
driver.quit()   


In [10]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin

INDEED_BASE = "https://www.indeed.com"


def parse_indeed_job_card(html: str):
    soup = BeautifulSoup(html, "html.parser")

    # ---------- TITLE ----------
    title_tag = soup.select_one("[data-testid='jobTitle']") \
        or soup.select_one("h2.jobTitle span")
    title = title_tag.get_text(strip=True) if title_tag else None

    # ---------- COMPANY ----------
    company_tag = soup.select_one("[data-testid='company-name']")
    company = company_tag.get_text(strip=True) if company_tag else None

    # ---------- LOCATION ----------
    loc_tag = soup.select_one("[data-testid='text-location']")
    location = loc_tag.get_text(strip=True) if loc_tag else None

    # ---------- SALARY ----------
    salary = None

    # 1. Sponsored / paid listing salary (like your new example)
    salary_block = soup.select_one("div.css-1b7u11v")
    if salary_block:
        salary = " ".join([s.get_text(strip=True) for s in salary_block.select("span")])

    # 2. General salary listing style (fall-back)
    if salary is None or salary == "":
        alt_salary = soup.find("span", string=lambda t: t and ("$" in t))
        if alt_salary:
            # Get siblings too (like "$100k–$150k a year")
            pieces = [alt_salary.get_text(strip=True)]
            for sib in alt_salary.find_next_siblings("span"):
                pieces.append(sib.get_text(strip=True))
            salary = " ".join(pieces)

    # 3. "Pay information not provided" → treat as None
    if salary and "not provided" in salary.lower():
        salary = None

    # ---------- LINK ----------
    link_tag = soup.select_one("a.jcs-JobTitle")
    job_url = urljoin(INDEED_BASE, link_tag["href"]) if link_tag else None

    # ---------- ATTRIBUTE TAGS ----------
    attributes = [li.get_text(" ", strip=True) 
                  for li in soup.select("ul.metadataContainer li")]

    return {
        "title": title,
        "company": company,
        "location": location,
        "salary": salary,
        "url": job_url,
        "attributes": attributes,
    }


In [12]:
parsed = parse_indeed_job_card(cards_html[1])
parsed


{'title': 'Data Logistics Analyst',
 'company': 'Crossroads Talent Solutions',
 'location': 'Alameda, CA 94501',
 'salary': None,
 'url': 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Db88DRZEY47VFYjuvBy8lSor1Dh4t-PWj2tYvf7Ywj9KPPM6PZgaq1nzWSyuKvy4N74MlDEFHFdd_tfsPK4Oj2av9hBA4W8Nqw9s9F4V8hY2NH0l8v5DiELaC_lku23Kq1erjpvQjjHN9vWV9PSCnsRRI-mrQZ0Wq2lmO1ZdcuBVPg1zuS-QkMidqlaVWz7V426F6D1xU6e_NsRFst-Tjh9hZDQEIRk3zKKuLTXevfldjNQfSRCXH24SLiYzRPMb9aNxzQxNmhBJYshfCAuuCEGhHKjQU-fhQEC2nqW-NoY2hrinm_WLj78KYFTR9KKYvOjft8ZGW4b6v5G6ZRqxUbppiGEiPjCigze8JpUftLylnRnmNcXGJrfXewpatb8_0IGmn35fvxdsaEGP9SvJmEtLGQPPr3EvP6msyBnMuxwPSbv2c6t8jA6JosjT0Zq0sYW58Zu7eo70eFuzXwuyWYaBNReBg-Yy8Rp-0t4VTozrsqlbaE2zLKgUjThL4l1LGYF_0IjRRzwIHKeHGXMLNU6NcsdIUYRkY1Ceo_x-VxJqLqaMUPjjITSdB3nI8eYr-EHMoLAQSSTZ1D86IEoX1nSp91u9X2K0yyWPkPaeimWfv6W9tq2ouQ2GEPMev22bJsOaNCo948776limaMrbt7UMo-JpIUOJkmB8ZgQ07w00YoEv1ApN7qCde_bnCOeGWXQ37rYC-jLAsjojX3ckdb_ExCGwIkPhc=&xkcb=SoAT6_M3qE9EcIxSjp0KbzkdCdPP&camk=UoKtGZLa3XIi6F045pqV4g==&p=1&fv