In [38]:
import json
import pathlib
import random
import re
import time
from datetime import datetime
from urllib.parse import quote, urlencode, urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [39]:
# Paths are resolved from the directory the notebook kernel was started in.
PROJECT_ROOT = pathlib.Path.cwd()
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
# Create the raw-data folder on first run; a no-op when it already exists.
DATA_RAW.mkdir(parents=True, exist_ok=True)

# Root of the site being scraped.
GLASSDOOR_BASE = "https://www.glassdoor.com"

print(f"Project root: {PROJECT_ROOT}")
print(f"Data folder: {DATA_RAW}")

Project root: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks
Data folder: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw


In [63]:
# Two-letter state codes for the location names that have a post-scrape filter.
_STATE_CODES = {"california": "ca", "new york": "ny"}

# CSS selectors tried in order when looking for job cards in the results list.
_CARD_SELECTORS = ('li[data-test="jobListing"]', 'li.JobsList_jobListItem')

# JavaScript snippets used instead of native clicks.
_SCROLL_INTO_VIEW_JS = "arguments[0].scrollIntoView({block: 'center'});"
_CLICK_JS = "arguments[0].click();"

# Description expander; excludes the results-list "Show more jobs" button,
# whose text also contains "Show more".
_SHOW_MORE_XPATH = (
    "//button[contains(text(), 'Show more') "
    "and not(contains(text(), 'Show more jobs'))]"
)

# Panel text shorter than this triggers a second lookup attempt.
_RETRY_BELOW_CHARS = 100
# Panel text shorter than this is treated as "no description".
_MIN_DESCRIPTION_CHARS = 50

# (result key, progress tag, max stored lines, header phrases), in match priority order.
# Requirements must come first: "required qualifications:" also contains "qualifications:".
_SECTIONS = (
    ("requirements", "req", 20,
     ("requirements:", "required qualifications:", "minimum qualifications:",
      "what you need:", "what you will need:")),
    ("qualifications", "qual", 20,
     ("qualifications:", "preferred qualifications:", "nice to have:",
      "preferred skills:", "preferred:")),
    ("responsibilities", "resp", 20,
     ("responsibilities:", "what you will do:", "duties:",
      "the position:", "your role:")),
    ("benefits", "ben", 15,
     ("benefits:", "what we offer:", "perks:", "compensation:", "why join")),
)

# Phrases that close the current section wherever they appear in a line.
_SECTION_END_PHRASES = ("about us", "about the company", "the department")

# Skills searched for (as whole words) in the description text.
_SKILL_KEYWORDS = ("python", "sql", "excel", "tableau", "power bi",
                   "statistics", "machine learning", "aws", "azure", "pandas")


def _matches_location(job_location, location_name):
    """Return True when a card's location text belongs to ``location_name``.

    Only the names in ``_STATE_CODES`` are filtered; any other name accepts
    every card. The state code must appear as a whole word, so "Chicago, IL"
    is not mistaken for a "CA" location.
    """
    state_name = location_name.lower()
    state_code = _STATE_CODES.get(state_name)
    if state_code is None:
        return True
    text = (job_location or "").lower()  # the parsed location may be None
    return state_name in text or re.search(rf"\b{state_code}\b", text) is not None


def _split_sections(full_text):
    """Group description lines under the section header that precedes them.

    Returns a dict mapping each section key in ``_SECTIONS`` to its list of
    lines. Header lines themselves, blank lines, and lines of 10 characters
    or fewer are not stored.
    """
    sections = {name: [] for name, _tag, _limit, _phrases in _SECTIONS}
    current = None
    for raw_line in full_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        lowered = line.lower()

        header = next(
            (name for name, _tag, _limit, phrases in _SECTIONS
             if any(phrase in lowered for phrase in phrases)),
            None,
        )
        if header is not None:
            current = header
            continue

        # "about" only ends a section when it starts the line; a bullet such as
        # "Passionate about data" must stay in its section.
        if lowered.startswith("about") or any(p in lowered for p in _SECTION_END_PHRASES):
            current = None
            continue

        if current is not None and len(line) > 10:
            sections[current].append(line)
    return sections


def _find_skills(full_text):
    """Return the title-cased skill keywords present in ``full_text`` as whole words.

    Whole-word matching keeps "excellent" from counting as "excel" and "laws"
    from counting as "aws". The result follows ``_SKILL_KEYWORDS`` order.
    """
    lowered = full_text.lower()
    return [
        keyword.title()
        for keyword in _SKILL_KEYWORDS
        if re.search(rf"\b{re.escape(keyword)}\b", lowered)
    ]


def _empty_details():
    """Return the detail fields used when no description could be read."""
    return {
        'job_description': None,
        'requirements': None,
        'qualifications': None,
        'responsibilities': None,
        'benefits': None,
        'skills_mentioned': [],
    }


def _js_click(driver, element, settle_seconds):
    """Scroll ``element`` to the middle of the viewport, pause, then click it via JavaScript."""
    driver.execute_script(_SCROLL_INTO_VIEW_JS, element)
    time.sleep(settle_seconds)
    driver.execute_script(_CLICK_JS, element)


def _load_more_jobs(driver, max_clicks=5):
    """Click the "Show more jobs" button up to ``max_clicks`` times, stopping when it is gone."""
    for attempt in range(1, max_clicks + 1):
        try:
            button = driver.find_element(By.XPATH, "//button[contains(text(), 'Show more jobs')]")
            if button.is_displayed():
                _js_click(driver, button, 1)
                print(f"  Clicked 'Show more jobs' button ({attempt}/{max_clicks})")
                time.sleep(3)
                # Scroll a bit after clicking
                smart_scroll(driver, scrolls=2)
        except Exception:
            print(f"  No more 'Show more jobs' button")
            break


def _find_job_cards(driver):
    """Return the job card elements from the first selector that yields any, else ``[]``."""
    wait = WebDriverWait(driver, 15)
    for selector in _CARD_SELECTORS:
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
            cards = driver.find_elements(By.CSS_SELECTOR, selector)
        except Exception:
            continue
        if cards:
            print(f"✓ Found {len(cards)} total job cards")
            return cards
    return []


def _expand_description(driver):
    """Best-effort click on the first visible description "Show more" button."""
    try:
        time.sleep(1)
        buttons = driver.find_elements(By.XPATH, _SHOW_MORE_XPATH)
    except Exception:
        return
    for button in buttons:
        try:
            if button.is_displayed():
                _js_click(driver, button, 0.5)
                time.sleep(2)
                print("expanded - ", end='', flush=True)
                break
        except Exception:
            continue


def _read_description(driver):
    """Return the text of the job detail panel, or ``""`` when it cannot be read.

    NOTE(review): a recorded run of this notebook reported the same description
    length for every job, which suggests this locator may not follow the card
    that was just clicked - confirm against the live page.
    """
    # Both locators select a div whose class contains "JobDetails"; the second
    # one effectively acts as a retry when the first read comes back short.
    locators = (
        (By.CSS_SELECTOR, 'div[class*="JobDetails"]'),
        (By.XPATH, "//div[contains(@class, 'JobDetails')]"),
    )
    full_text = ""
    for by, locator in locators:
        try:
            full_text = driver.find_element(by, locator).text or ""
        except Exception:
            pass  # keep whatever an earlier attempt returned
        if len(full_text) >= _RETRY_BELOW_CHARS:
            break
    return full_text


def _scrape_card(driver, card, location_name, filter_location):
    """Parse one job card plus its detail panel.

    Returns the job dict, or ``None`` when the card is skipped (no title,
    wrong location, or the card could not be clicked).
    """
    # Get basic info from card
    parsed = parse_glassdoor_job_card(card.get_attribute("outerHTML"))

    if not parsed['title']:
        print("Skipped (no title)")
        return None

    # Filter by location
    if filter_location and not _matches_location(parsed.get('location'), location_name):
        state_code = _STATE_CODES[location_name.lower()].upper()
        print(f"❌ Not {state_code}: {parsed.get('location')}")
        return None

    print(f"{parsed['title'][:45]} - ", end='', flush=True)

    # Click the card to load its details
    try:
        _js_click(driver, card, 0.8)
        time.sleep(3)
    except Exception:
        print("❌ Can't click")
        return None

    _expand_description(driver)

    # Get full job description; any failure here still keeps the basic card info.
    try:
        full_text = _read_description(driver)
        if len(full_text) < _MIN_DESCRIPTION_CHARS:
            print("⚠️ No description")
            parsed.update(_empty_details())
        else:
            sections = _split_sections(full_text)
            parsed['job_description'] = full_text
            for name, _tag, limit, _phrases in _SECTIONS:
                parsed[name] = '\n'.join(sections[name][:limit]) if sections[name] else None
            parsed['skills_mentioned'] = _find_skills(full_text)

            sections_found = [tag for name, tag, _limit, _phrases in _SECTIONS if sections[name]]
            if sections_found:
                print(f"✓ [{','.join(sections_found)}]")
            else:
                print(f"✓ {len(full_text)} chars")
    except Exception as e:
        print(f"⚠️ Error: {str(e)[:40]}")
        parsed.update(_empty_details())

    # Add metadata
    parsed['location_category'] = location_name
    parsed['scrape_date'] = datetime.now().isoformat()
    parsed['source'] = 'glassdoor'
    return parsed


def scrape_location(location_name, query="data analyst", pages_to_scroll=3, max_jobs=50,
                    filter_location=True, headless=False):
    """Scrape Glassdoor for a specific location with full job details.

    Args:
        location_name: Location searched for; also stored as ``location_category``.
        query: Search keywords; the location name is appended to them.
        pages_to_scroll: Number of scrolls passed to ``smart_scroll`` after the page opens.
        max_jobs: Maximum number of job cards to process.
        filter_location: When True, drop cards whose location is outside the
            requested state. Only "California" and "New York" are filtered.
        headless: Forwarded to ``start_driver``.

    Returns:
        A list of job dicts: the fields produced by ``parse_glassdoor_job_card``
        plus ``job_description``, ``requirements``, ``qualifications``,
        ``responsibilities``, ``benefits``, ``skills_mentioned``,
        ``location_category``, ``scrape_date`` and ``source``. Empty when no
        job cards are found.
    """
    print(f"\n{'='*60}")
    print(f"Scraping: {location_name}")
    print(f"{'='*60}\n")

    # Add location to query
    search_query = f"{query} {location_name}"

    # Build URL; percent-encode the values so spaces, "&" or "#" in the query
    # cannot corrupt the query string.
    params = urlencode(
        {"sc.keyword": search_query, "locT": "S", "locKeyword": location_name},
        quote_via=quote,
    )
    url = f"https://www.glassdoor.com/Job/jobs.htm?{params}"
    print(f"Opening: {url}")

    # Start driver; the finally below guarantees the browser is closed even
    # when scraping fails or is interrupted.
    driver = start_driver(headless=headless)
    try:
        driver.get(url)
        time.sleep(4)

        # Scroll to load initial jobs
        print("Scrolling to load jobs...")
        smart_scroll(driver, scrolls=pages_to_scroll)

        # Click "Show more jobs" button multiple times to load more jobs
        print("Loading more jobs...")
        _load_more_jobs(driver)

        # Find job cards
        print("\nLooking for job cards...")
        cards = _find_job_cards(driver)
        if not cards:
            print("❌ No cards found!")
            return []

        # Limit number of jobs
        cards_to_scrape = cards[:max_jobs]
        total = len(cards_to_scrape)
        print(f"Will scrape {total} jobs\n")

        # Parse all cards; one failing card must not abort the whole run.
        jobs_data = []
        for position, card in enumerate(cards_to_scrape, start=1):
            print(f"[{position}/{total}] ", end='', flush=True)
            try:
                job = _scrape_card(driver, card, location_name, filter_location)
            except Exception as e:
                print(f"❌ Error: {str(e)[:50]}")
                continue
            if job is not None:
                jobs_data.append(job)

        print(f"\n{'='*60}")
        print(f"✓ Scraped {len(jobs_data)} jobs from {location_name}")
        print(f"{'='*60}")
        return jobs_data
    finally:
        driver.quit()

In [64]:
# Run the California scrape: 3 initial scrolls, at most 50 job cards.
california_jobs = scrape_location(
    "California",
    query="data analyst",
    pages_to_scroll=3,
    max_jobs=50,
)


Scraping: California

Opening: https://www.glassdoor.com/Job/jobs.htm?sc.keyword=data analyst California&locT=S&locKeyword=California
Scrolling to load jobs...
Loading more jobs...
  No more 'Show more jobs' button

Looking for job cards...
✓ Found 30 total job cards
Will scrape 30 jobs

[1/30] ❌ Not CA: Baltimore, MD
[2/30] ❌ Not CA: Farmington Hills, MI
[3/30] ❌ Not CA: Arlington, VA
[4/30] Board Certified Behavior Analyst (BCBA) – Cen - ✓ 2836 chars
[5/30] Associate Data Analyst - ✓ 2836 chars
[6/30] Clinical Supervisor, BCBA - ✓ 2836 chars
[7/30] ❌ Not CA: Baltimore, MD
[8/30] Research Data Analyst - ✓ 2836 chars
[9/30] Data Analyst - ✓ 2836 chars
[10/30] ❌ Not CA: United States
[11/30] ❌ Not CA: New York, NY
[12/30] Clinical Supervisor, BCBA - ✓ 2836 chars
[13/30] Board Certified Behavior Analyst (BCBA) - Cen - ✓ 2836 chars
[14/30] Clinical Supervisor, BCBA - ✓ 2836 chars
[15/30] Board Certified Behavior Analyst (BCBA) - Cen - ✓ 2836 chars
[16/30] Senior Behavior Analyst (BCBA) -

In [65]:
# Write the scraped California postings to the raw-data folder as indented JSON.
ca_file = DATA_RAW.joinpath("glassdoor_jobs_california.json")
with ca_file.open("w", encoding="utf-8") as json_out:
    json.dump(california_jobs, json_out, indent=2)

print("Saved {} California jobs to: {}".format(len(california_jobs), ca_file))

Saved 19 California jobs to: c:\Users\rachi\OneDrive\Desktop\school\Sta160-Group-Project\notebooks\data\raw\glassdoor_jobs_california.json
