In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pathlib
import time
import random
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

In [2]:
PROJECT_ROOT = pathlib.Path.cwd()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_RAW.mkdir(parents=True, exist_ok=True)

GLASSDOOR_BASE = "https://www.glassdoor.com"

print("Project root:", PROJECT_ROOT)
print("Data folder:", DATA_RAW)

Project root: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks
Data folder: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw


In [3]:
def start_driver(headless=False):
    """Initialize Chrome driver"""
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
        opts.add_argument("--disable-gpu")
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--start-maximized")
    opts.add_argument("--window-size=1280,1000")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    return driver

def smart_scroll(driver, scrolls=3, pause_min=1.0, pause_max=2.0):
    """Scroll down the page to load lazy content"""
    body = driver.find_element(By.TAG_NAME, "body")
    for i in range(scrolls):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(random.uniform(pause_min, pause_max))
        driver.execute_script("window.scrollBy(0, 300);")
        time.sleep(random.uniform(0.5, 1.0))
    time.sleep(random.uniform(pause_min, pause_max))

def parse_glassdoor_job_card(html: str):
    """Parse a single Glassdoor job card HTML"""
    soup = BeautifulSoup(html, "html.parser")
    
    # Title
    title = None
    title_tag = soup.find('a', {'data-test': 'job-title'})
    if title_tag:
        title = title_tag.get_text(strip=True)
    
    # Company
    company = None
    company_tag = soup.find('div', {'data-test': 'employer-name'})
    if not company_tag:
        company_tag = soup.find('div', class_=lambda x: x and 'EmployerProfile' in x)
    if company_tag:
        company = company_tag.get_text(strip=True)
    
    # Location
    location = None
    loc_tag = soup.find('div', {'data-test': 'emp-location'})
    if loc_tag:
        location = loc_tag.get_text(strip=True)
    
    # Salary
    salary = None
    salary_tag = soup.find('div', {'data-test': 'detailSalary'})
    if salary_tag:
        salary = salary_tag.get_text(strip=True)
    
    # URL
    job_url = None
    link_tag = soup.find('a', {'data-test': 'job-title'})
    if link_tag and link_tag.get('href'):
        job_url = urljoin(GLASSDOOR_BASE, link_tag['href'])
    
    # Attributes
    attributes = []
    rating_tag = soup.find('span', {'data-test': 'rating'})
    if rating_tag:
        attributes.append(f"Rating: {rating_tag.get_text(strip=True)}")
    date_tag = soup.find('div', {'data-test': 'job-age'})
    if date_tag:
        attributes.append(f"Posted: {date_tag.get_text(strip=True)}")
    
    return {
        "title": title,
        "company": company,
        "location": location,
        "salary": salary,
        "url": job_url,
        "attributes": attributes,
    }

In [7]:
def scrape_location(location_name, query="data analyst", pages_to_scroll=3):
    """Scrape Glassdoor for a specific location"""
    print(f"\n{'='*60}")
    print(f"Scraping: {location_name}")
    print(f"{'='*60}\n")
    
    # Build URL
    url = f"https://www.glassdoor.com/Job/jobs.htm?sc.keyword={query}&locT=S&locKeyword={location_name}"
    print(f"Opening: {url}")
    
    # Start driver
    driver = start_driver(headless=False)
    driver.get(url)
    time.sleep(2 + random.random()*1.5)
    
    # Scroll to load more jobs
    print("Scrolling to load jobs...")
    smart_scroll(driver, scrolls=pages_to_scroll)
    
    # Find job cards
    print("Looking for job cards...")
    wait = WebDriverWait(driver, 15)
    selectors = ['li[data-test="jobListing"]', 'li.JobsList_jobListItem', 'div.JobCard']
    
    cards = []
    for sel in selectors:
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, sel)))
            cards = driver.find_elements(By.CSS_SELECTOR, sel)
            if cards:
                print(f" Found {len(cards)} cards using '{sel}'")
                break
        except:
            continue
    
    if not cards:
        print(" No cards found!")
        driver.quit()
        return []
    
    # Extract HTML
    print("Extracting job data...")
    cards_html = []
    for i, c in enumerate(cards):
        try:
            html = c.get_attribute("outerHTML")
            cards_html.append(html)
        except Exception as e:
            print(f"  Error on card {i}: {e}")
    
    # Parse all cards
    jobs_data = []
    for html in cards_html:
        parsed = parse_glassdoor_job_card(html)
        if parsed['title']:  # Only add if we got a title
            parsed['location_category'] = location_name
            parsed['scrape_date'] = datetime.now().isoformat()
            parsed['source'] = 'glassdoor'
            jobs_data.append(parsed)
    
    print(f"âœ“ Parsed {len(jobs_data)} jobs")
    
    # Show first 3
    print(f"\nFirst 3 jobs:")
    for i, job in enumerate(jobs_data[:3]):
        print(f"  {i+1}. {job['title']} at {job['company']}")
        if job['salary']:
            print(f"      {job['salary']}")
    
    # Close browser
    driver.quit()
    print(f"\nâœ“ Browser closed for {location_name}")
    
    return jobs_data

In [5]:
california_jobs = scrape_location("California", query="data analyst", pages_to_scroll=3)


Scraping: California

Opening: https://www.glassdoor.com/Job/jobs.htm?sc.keyword=data analyst&locT=S&locKeyword=California
Scrolling to load jobs...
Looking for job cards...
âœ“ Found 30 cards using 'li[data-test="jobListing"]'
Extracting job data...
âœ“ Parsed 30 jobs

First 3 jobs:
  1. Data Analyst, Specialist at Vanguard3.7
     ðŸ’° $62K - $90K(Glassdoor est.)
  2. Sr Business Analyst at Ajinomoto Foods North America3.9
     ðŸ’° $120K - $130K(Employer provided)
  3. People Data Analyst at Central Health3.4
     ðŸ’° $53K - $77K(Glassdoor est.)

âœ“ Browser closed for California


In [6]:
ca_file = DATA_RAW / "glassdoor_jobs_california.json"
with open(ca_file, "w", encoding="utf-8") as f:
    json.dump(california_jobs, f, indent=2)

print(f"Saved {len(california_jobs)} California jobs to: {ca_file}")

Saved 30 California jobs to: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw\glassdoor_jobs_california.json
