In [343]:
from selenium import webdriver
from time import sleep
import numpy as np
import re
import json
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 15.4 ms


In [None]:
"""
Workflow:
- Since a Glassdoor search only allows you to view the first 900 results, the overall US job search must be segmented by state
- For states with more than 900 results, the search is segmented by industry
- The scraping is carried out one state at a time, looking through each page of results, and going through each job posting
- The job title, job description, salary, and company rating are all extracted for each job
- Random sleep is used between requests, otherwise reCAPTCHA is activated and the remaining search fails
- The scraping combines a selenium chrome driver, url manipulation, and html element searching
"""

In [540]:
def ran_sleep(multiple, base=0):
    """Pause for a randomised interval so requests look less like a bot.

    The pause lasts ``base`` seconds plus a random extra drawn uniformly
    from ``[0, 2 * multiple)`` seconds.

    Args:
        multiple: Scale of the random component of the delay.
        base: Fixed minimum delay in seconds.

    Returns:
        Whatever ``time.sleep`` returns (``None``).
    """
    jitter = (np.random.rand() * 2) * multiple
    return sleep(base + jitter)

time: 2.9 ms


In [583]:
def initialize_browser(driver='/Users/JasonKatz/Applications/chromedriver'):
    """Start a Selenium-controlled Chrome session.

    Args:
        driver: Path to the chromedriver executable. The default is an
            absolute path on the original author's machine, so other users
            need to pass their own location.

    Returns:
        The ``webdriver.Chrome`` instance created with the flags below.
    """
    opts = webdriver.ChromeOptions()
    launch_flags = (
        '--disable-extensions',
        '--profile-directory=Default',
        "--disable-plugins-discovery",
        "--start-maximized",
    )
    for flag in launch_flags:
        opts.add_argument(flag)
    # NOTE(review): newer Selenium releases deprecate `chrome_options=` in
    # favour of `options=` -- confirm the installed version before switching.
    return webdriver.Chrome(driver, chrome_options=opts)

time: 52.1 ms


In [584]:
def initialize_glassdoor_search(browser, location, keyword="Data Scientist"):
    """Open Glassdoor's home page and submit a keyword + location search.

    Random pauses are inserted between every interaction. When the function
    returns, ``browser`` has re-loaded the URL it landed on after clicking
    the search button.

    Args:
        browser: Selenium driver exposing ``get``, ``find_element_by_id`` and
            ``current_url``.
        location: Text typed into the location search box.
        keyword: Text typed into the keyword search box.
    """
    def fill_in(box, text):
        # Empty the input, then type the new value, pausing after each step.
        box.clear()
        ran_sleep(.5)
        box.send_keys(text)
        ran_sleep(.5)

    browser.get("https://www.glassdoor.com/index.htm")
    ran_sleep(2, base=1)

    # Locate both inputs before typing into either of them.
    keyword_box = browser.find_element_by_id("KeywordSearch")
    ran_sleep(.1)
    location_box = browser.find_element_by_id("LocationSearch")
    ran_sleep(.1)

    fill_in(keyword_box, keyword)
    fill_in(location_box, location)

    search_button = browser.find_element_by_id("HeroSearchButton")
    ran_sleep(.1)
    search_button.click()
    ran_sleep(1, base=1)

    # Re-request the page the click navigated to before handing control back.
    browser.get(browser.current_url)
    ran_sleep(1, base=1)

time: 44.9 ms


In [669]:
def get_max_pages(browser):
    """Return how many result pages should be visited for the current search.

    The page count is read from the first ``cell`` element inside the
    ``MainCol`` container, whose text is expected to contain "of <N>". If
    that lookup fails, the count is estimated from the total number of jobs
    reported by ``get_num_jobs``.

    Args:
        browser: Selenium driver currently on a Glassdoor results page.

    Returns:
        A plain ``int`` between 1 and 30, safe to pass to ``range``. When
        neither the pager nor the job count can be parsed, 1 is returned so
        that at least the first results page is attempted.
    """
    max_pages = 30       # 30 pages x 30 jobs = the 900 results the workflow notes cite
    jobs_per_page = 30

    try:
        pages_column = browser.find_element_by_id('MainCol')
        ran_sleep(.1)
        num_pages_text = pages_column.find_elements_by_class_name("cell")[0].text
        ran_sleep(.1)
        # A missing "of <N>" leaves `match` as None; the AttributeError below
        # then routes us into the fallback.
        match = re.search(r"of\s+([\d,]+)", num_pages_text)
        return min(int(match.group(1).replace(",", "")), max_pages)
    except Exception:
        # get_num_jobs returns text such as "1,067", or a placeholder phrase
        # when the count is unavailable, so normalise before converting.
        tot_jobs_text = str(get_num_jobs(browser)).replace(",", "").strip()
        if not tot_jobs_text.isdigit():
            print("failed to find number of pages")
            print(browser.current_url)
            return 1

        tot_jobs = int(tot_jobs_text)
        if tot_jobs <= jobs_per_page:
            return 1

        print("failed to find number of pages")
        print(browser.current_url)
        # Integer ceiling division keeps the result an int.
        estimated_pages = (tot_jobs + jobs_per_page - 1) // jobs_per_page
        return min(estimated_pages, max_pages)

time: 34.8 ms


In [586]:
def get_job_links(browser):
    """Collect the unique job-posting URLs on the current results page.

    Every element with class ``jobLink`` is inspected. Two links are treated
    as the same posting when their last 10 characters match (presumably the
    job id -- verify against real URLs); only the first is kept.

    Args:
        browser: Selenium driver currently on a Glassdoor results page.

    Returns:
        List of href strings in page order. Anchors without an href are
        skipped. An empty list is returned, after printing the current URL,
        if the lookup itself fails.
    """
    try:
        job_links = browser.find_elements_by_class_name('jobLink')
        ran_sleep(.1)
        links = []
        seen_ids = set()
        for job in job_links:
            link = job.get_attribute('href')
            ran_sleep(.1)
            if not link:
                # One anchor without an href must not discard the whole page.
                continue
            job_id = link[-10:]
            if job_id not in seen_ids:
                seen_ids.add(job_id)
                links.append(link)
        return links
    except Exception:
        print("failed to find job links")
        print(browser.current_url)
        return []

time: 7.82 ms


In [587]:
def get_job_title(browser):
    """Return the job title shown on the current job-posting page.

    The title is taken to be the ``innerHTML`` of the first ``<h2>`` element
    on the page.

    Args:
        browser: Selenium driver currently on a Glassdoor job posting.

    Returns:
        The title markup as a string, or ``""`` if it cannot be read.
    """
    try:
        job_title_element = browser.find_elements_by_tag_name("h2")[0]
        ran_sleep(.1)
        job_title_text = job_title_element.get_attribute("innerHTML")
        ran_sleep(.1)
        return job_title_text
    except Exception:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) still stops a long scrape.
        return ""

time: 11.1 ms


In [588]:
def get_job_description(browser):
    """Return the job description shown on the current job-posting page.

    Args:
        browser: Selenium driver currently on a Glassdoor job posting.

    Returns:
        The ``innerHTML`` of the element with class
        ``jobDescriptionContent``, or ``""`` if it cannot be read.
    """
    try:
        description_element = browser.find_element_by_class_name("jobDescriptionContent")
        ran_sleep(.1)
        description = description_element.get_attribute("innerHTML")
        ran_sleep(.1)
        return description
    except Exception:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) still stops a long scrape.
        return ""

time: 8.74 ms


In [600]:
def get_median_salary(browser):
    """Return the salary estimate shown on the current job-posting page.

    The text of the second ``salEst`` element is parsed: its first character
    is dropped (presumably a currency symbol), and everything up to the
    first "/" is read as an integer after removing thousands separators.

    Args:
        browser: Selenium driver currently on a Glassdoor job posting.

    Returns:
        The salary as an ``int``. Values below 1000 are multiplied by 2000
        (presumably converting an hourly rate to a yearly figure -- confirm).
        ``np.nan`` is returned when the salary is missing or unparseable.
    """
    try:
        salary_text = browser.find_elements_by_class_name("salEst")[1].text
        ran_sleep(.1)
        # str.index raises ValueError when there is no "/", which lands in
        # the NaN fallback below.
        slash_at = salary_text.index("/")
        # Built-in int: the np.int alias was removed from NumPy, and calling
        # it here would make every lookup silently return NaN.
        salary = int(salary_text[1:slash_at].replace(",", ""))
        if salary < 1000:
            return salary * 2000
        return salary
    except Exception:
        return np.nan

time: 16.3 ms


In [601]:
def get_company_rating(browser):
    """Return the company rating shown on the current job-posting page.

    Args:
        browser: Selenium driver currently on a Glassdoor job posting.

    Returns:
        The ``innerHTML`` of the ``ratingNum`` element converted to a
        ``float``, or ``np.nan`` when it is missing or not numeric.
    """
    try:
        rating_element = browser.find_element_by_class_name("ratingNum")
        ran_sleep(.1)
        rating_text = rating_element.get_attribute("innerHTML")
        ran_sleep(.1)
        # Built-in float: the np.float alias was removed from NumPy, and
        # calling it here would make every lookup silently return NaN.
        return float(rating_text)
    except Exception:
        return np.nan

time: 24.8 ms


In [602]:
def get_num_jobs(browser):
    """Return the total job count shown on the current results page, as text.

    The ``innerHTML`` of the ``jobsCount`` element is cut at its first "&"
    (presumably the start of an HTML entity such as ``&nbsp;`` -- confirm).

    Args:
        browser: Selenium driver currently on a Glassdoor results page.

    Returns:
        The count as a string, exactly as displayed (it may contain
        thousands separators, e.g. "1,067"). The placeholder
        ``"unknown number of"`` is returned when the count cannot be read.
    """
    try:
        num_jobs_element = browser.find_element_by_class_name("jobsCount")
        ran_sleep(.1)
        num_jobs_text = num_jobs_element.get_attribute("innerHTML")
        ran_sleep(.1)
        regex_match = re.search("&", num_jobs_text)
        # No "&" leaves regex_match as None; the resulting AttributeError
        # falls through to the placeholder.
        return num_jobs_text[:regex_match.span()[0]]
    except Exception:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) still stops a long scrape.
        return "unknown number of"

time: 11.1 ms


In [603]:
def get_industry_ids(browser):
    """Read the industry filter options available for the current search.

    Opens the last ``filter`` control, expands the third ``header`` inside
    the ``moreFlyout`` panel, and reads the ``<li>`` options listed there.
    The final option's text is expected to end in "(<count>)" and supplies
    the total job count; every other option supplies an industry id through
    its ``value`` attribute.

    Args:
        browser: Selenium driver currently on a Glassdoor results page.

    Returns:
        ``(industries, num_jobs)``. ``industries`` is a list of URL
        fragments of the form ``"&industryId=<id>"``. ``num_jobs`` is the
        count as text (it may contain thousands separators). On failure the
        result is ``([], "0")``; the count stays a string so callers can
        treat both outcomes the same way.
    """
    try:
        # Looked up only to confirm the filter bar exists; raises otherwise.
        browser.find_element_by_class_name("selectContainer")
        ran_sleep(.1)
        extra_filter = browser.find_elements_by_class_name("filter")[-1]
        ran_sleep(.1)
        extra_filter.click()
        ran_sleep(.5)
        filter_options = browser.find_element_by_class_name("moreFlyout")
        ran_sleep(.1)
        industry_filter = filter_options.find_elements_by_class_name("header")[2]
        ran_sleep(.1)
        industry_filter.click()
        ran_sleep(.5)
        industry_options = filter_options.find_elements_by_tag_name('li')

        # The last entry carries the overall count in trailing parentheses.
        all_industries_text = industry_options[-1].text
        ran_sleep(.1)
        open_paren = re.search(r"\(", all_industries_text)
        num_jobs = all_industries_text[open_paren.span()[1]:-1]
        ran_sleep(.1)

        industries = []
        for industry in industry_options[:-1]:
            industry_id = industry.get_attribute("value")
            ran_sleep(.1)
            if industry_id:
                # Options without a value cannot be used as a URL filter.
                industries.append("&industryId=" + industry_id)
        return industries, num_jobs
    except Exception:
        print("Could not extract industry ids")
        print(browser.current_url)
        return [], "0"

time: 53.7 ms


In [692]:
def scrape_jobs(browser, outfile):
    """Scrape every job posting for the search currently open in ``browser``.

    The search is re-issued with ``radius=0&fromAge=30``. If the industry
    filter reports more than 900 jobs, the search is repeated once per
    industry; otherwise it is scraped in one pass. For each results page,
    every posting is opened and its title, description, median salary and
    company rating are recorded.

    Args:
        browser: Selenium driver whose ``current_url`` is a Glassdoor
            results page (the last four characters are stripped, presumably
            the ".htm" suffix).
        outfile: Path of the JSON file to write. It is rewritten with all
            records gathered so far after each results page, so a crash
            loses at most one page of work.

    Returns:
        None. Results are written to ``outfile`` and progress is printed.
    """
    search_params = ".htm?radius=0&fromAge=30"

    # Initialize scraping
    base_url = browser.current_url[:-4]
    ran_sleep(1, base=1)
    browser.get(base_url + search_params)
    ran_sleep(2, base=1)
    data = []
    jobs_scraped = 0

    # Get all the industry ids
    industries, tot_jobs = get_industry_ids(browser)

    # The count may arrive as "1,067", or as a non-string / unparseable value
    # when the industry lookup failed; treat anything unreadable as 0.
    try:
        industry_total = int(str(tot_jobs).replace(",", ""))
    except ValueError:
        industry_total = 0

    # If 900 jobs or fewer, just scrape all, otherwise segment by industry.
    # With no industries to segment by, an unsegmented scrape is still
    # better than scraping nothing.
    if industry_total <= 900 or not industries:
        industries = [""]
        tot_jobs = get_num_jobs(browser)

    print("Beginning scrape of {} jobs".format(tot_jobs))
    for industry in industries:
        print("Beginning industry {}".format(industry[-4:]))
        ran_sleep(5, base=10)
        browser.get(base_url + search_params + industry)
        ran_sleep(2, base=1)
        # Coerce to int: range() rejects a float page count.
        num_pages = int(get_max_pages(browser))

        # Navigate through each search results page
        for page_num in range(1, num_pages + 1):
            ran_sleep(2.5, base=5)

            try:
                browser.get(base_url + "_IP" + str(page_num) + search_params + industry)
                ran_sleep(.5, base=1)
                job_links = get_job_links(browser)
            except Exception:
                print("error with finding job links")
                print(browser.current_url)
                job_links = []

            # Navigate to each job posting in the results page
            for link in job_links:
                try:
                    browser.get(link)
                    ran_sleep(1, base=1.5)
                    data.append({
                        "job_title": get_job_title(browser),
                        "job_description": get_job_description(browser),
                        "median_salary": get_median_salary(browser),
                        "company_rating": get_company_rating(browser),
                    })
                    jobs_scraped += 1
                except Exception:
                    # Skip this posting; the remaining links are still tried.
                    print("error with job link")
                    print(browser.current_url)

            # Checkpoint everything collected so far after each page.
            with open(outfile, 'w') as file:
                json.dump(data, file)

            print("Finished scraping job {} of {}".format(jobs_scraped, tot_jobs))

time: 185 ms


In [None]:
### All code below is for the scraping of each US state, one by one

In [751]:
# Launch the Chrome session that every scraping cell below reuses as `browser`.
browser = initialize_browser()

time: 3.76 s


In [596]:
# California: run the search, then scrape it into Job_Data/california.json
initialize_glassdoor_search(browser, location='california' + ', us')
scrape_jobs(browser, outfile="Job_Data/california.json")

Beginning scrape of 5,158 jobs
Beginning industry 1000
Finished scraping job 30 of 3759
Finished scraping job 60 of 3759
Finished scraping job 63 of 3759
Beginning industry 1002
Finished scraping job 93 of 3759
Finished scraping job 104 of 3759
Beginning industry 1020
Finished scraping job 134 of 3759
Finished scraping job 164 of 3759
Finished scraping job 194 of 3759
Finished scraping job 224 of 3759
Finished scraping job 254 of 3759
Finished scraping job 284 of 3759
Finished scraping job 314 of 3759
Finished scraping job 344 of 3759
Finished scraping job 374 of 3759
Finished scraping job 404 of 3759
Finished scraping job 434 of 3759
Finished scraping job 464 of 3759
Finished scraping job 494 of 3759
Finished scraping job 524 of 3759
Finished scraping job 554 of 3759
Finished scraping job 584 of 3759
Finished scraping job 614 of 3759
Finished scraping job 644 of 3759
Finished scraping job 674 of 3759
Finished scraping job 704 of 3759
Finished scraping job 726 of 3759
Beginning industr

In [605]:
# New York: run the search, then scrape it into Job_Data/new_york.json
initialize_glassdoor_search(browser, location='new york state' + ', us')
scrape_jobs(browser, outfile="Job_Data/new_york.json")

Beginning scrape of 1688 jobs
Beginning industry 1000
Finished scraping job 30 of 1688
Finished scraping job 35 of 1688
Beginning industry 1020
Finished scraping job 65 of 1688
Finished scraping job 95 of 1688
Finished scraping job 125 of 1688
Finished scraping job 155 of 1688
Finished scraping job 172 of 1688
Beginning industry 1021
Finished scraping job 202 of 1688
Finished scraping job 232 of 1688
Finished scraping job 238 of 1688
Beginning industry 1027
Finished scraping job 268 of 1688
Finished scraping job 298 of 1688
Finished scraping job 328 of 1688
Finished scraping job 332 of 1688
Beginning industry 1031
Finished scraping job 362 of 1688
Finished scraping job 392 of 1688
Finished scraping job 422 of 1688
Finished scraping job 452 of 1688
Finished scraping job 482 of 1688
Finished scraping job 512 of 1688
Finished scraping job 542 of 1688
Finished scraping job 572 of 1688
Finished scraping job 596 of 1688
Beginning industry 1043
Finished scraping job 626 of 1688
Finished scrap

In [607]:
# Massachusetts: run the search, then scrape it into Job_Data/massachusetts.json
initialize_glassdoor_search(browser, location='massachusetts' + ', us')
scrape_jobs(browser, outfile="Job_Data/massachusetts.json")

Beginning scrape of 1363 jobs
Beginning industry 1002
Finished scraping job 30 of 1363
Finished scraping job 60 of 1363
Finished scraping job 60 of 1363
Beginning industry 1020
Finished scraping job 90 of 1363
Finished scraping job 120 of 1363
Finished scraping job 150 of 1363
Finished scraping job 180 of 1363
Finished scraping job 210 of 1363
Finished scraping job 240 of 1363
Finished scraping job 270 of 1363
Finished scraping job 300 of 1363
Finished scraping job 330 of 1363
Finished scraping job 360 of 1363
Finished scraping job 390 of 1363
Finished scraping job 420 of 1363
Finished scraping job 450 of 1363
Finished scraping job 480 of 1363
Finished scraping job 510 of 1363
Finished scraping job 510 of 1363
Finished scraping job 517 of 1363
Beginning industry 1021
Finished scraping job 539 of 1363
Beginning industry 1022
Finished scraping job 551 of 1363
Beginning industry 1027
Finished scraping job 581 of 1363
Finished scraping job 601 of 1363
Beginning industry 1031
Finished scrap

In [609]:
# Texas: run the search, then scrape it into Job_Data/texas.json
initialize_glassdoor_search(browser, location='texas' + ', us')
scrape_jobs(browser, outfile="Job_Data/texas.json")

Beginning scrape of 914 jobs
Beginning industry 1000
Finished scraping job 29 of 914
Beginning industry 1002
Finished scraping job 52 of 914
Beginning industry 1020
Finished scraping job 82 of 914
Finished scraping job 85 of 914
Beginning industry 1021
Finished scraping job 100 of 914
Beginning industry 1022
Finished scraping job 115 of 914
Beginning industry 1027
Finished scraping job 145 of 914
Finished scraping job 174 of 914
Beginning industry 1031
Finished scraping job 204 of 914
Finished scraping job 234 of 914
Finished scraping job 264 of 914
Finished scraping job 275 of 914
Beginning industry 1043
Finished scraping job 305 of 914
Finished scraping job 335 of 914
Finished scraping job 339 of 914
Beginning industry 1047
Finished scraping job 369 of 914
Finished scraping job 399 of 914
Finished scraping job 429 of 914
Finished scraping job 444 of 914
Beginning industry 1058
Finished scraping job 474 of 914
Finished scraping job 504 of 914
Finished scraping job 534 of 914
Finished 

In [611]:
# Illinois: run the search, then scrape it into Job_Data/illinois.json
initialize_glassdoor_search(browser, location='illinois' + ', us')
scrape_jobs(browser, outfile="Job_Data/illinois.json")

Beginning scrape of 1,067 jobs
Beginning industry 
Finished scraping job 30 of 1,067
Finished scraping job 60 of 1,067
Finished scraping job 90 of 1,067
Finished scraping job 120 of 1,067
Finished scraping job 150 of 1,067
Finished scraping job 180 of 1,067
Finished scraping job 210 of 1,067
Finished scraping job 240 of 1,067
Finished scraping job 270 of 1,067
Finished scraping job 300 of 1,067
Finished scraping job 330 of 1,067
Finished scraping job 360 of 1,067
Finished scraping job 390 of 1,067
Finished scraping job 420 of 1,067
Finished scraping job 450 of 1,067
Finished scraping job 480 of 1,067
Finished scraping job 510 of 1,067
Finished scraping job 540 of 1,067
Finished scraping job 570 of 1,067
Finished scraping job 600 of 1,067
Finished scraping job 630 of 1,067
Finished scraping job 660 of 1,067
Finished scraping job 690 of 1,067
Finished scraping job 720 of 1,067
Finished scraping job 750 of 1,067
Finished scraping job 780 of 1,067
Finished scraping job 810 of 1,067
Finishe

In [613]:
# Washington State: run the search, then scrape it into Job_Data/washington.json
initialize_glassdoor_search(browser, location='washington state' + ', us')
scrape_jobs(browser, outfile="Job_Data/washington.json")

Beginning scrape of 925 jobs
Beginning industry 
Finished scraping job 30 of 925
Finished scraping job 60 of 925
Finished scraping job 90 of 925
Finished scraping job 120 of 925
Finished scraping job 150 of 925
Finished scraping job 180 of 925
Finished scraping job 210 of 925
Finished scraping job 240 of 925
Finished scraping job 270 of 925
Finished scraping job 300 of 925
Finished scraping job 330 of 925
Finished scraping job 360 of 925
Finished scraping job 390 of 925
Finished scraping job 420 of 925
Finished scraping job 450 of 925
Finished scraping job 480 of 925
Finished scraping job 510 of 925
Finished scraping job 540 of 925
Finished scraping job 570 of 925
Finished scraping job 600 of 925
Finished scraping job 630 of 925
Finished scraping job 660 of 925
Finished scraping job 690 of 925
Finished scraping job 720 of 925
Finished scraping job 750 of 925
Finished scraping job 780 of 925
Finished scraping job 810 of 925
Finished scraping job 840 of 925
Finished scraping job 870 of 9

In [614]:
# Pennsylvania: run the search, then scrape it into Job_Data/pennsylvania.json
initialize_glassdoor_search(browser, location='pennsylvania' + ', us')
scrape_jobs(browser, outfile="Job_Data/pennsylvania.json")

Beginning scrape of 788 jobs
Beginning industry 
Finished scraping job 30 of 788
Finished scraping job 60 of 788
Finished scraping job 90 of 788
Finished scraping job 120 of 788
Finished scraping job 150 of 788
Finished scraping job 180 of 788
Finished scraping job 210 of 788
Finished scraping job 240 of 788
Finished scraping job 270 of 788
Finished scraping job 300 of 788
Finished scraping job 330 of 788
Finished scraping job 360 of 788
Finished scraping job 390 of 788
Finished scraping job 420 of 788
Finished scraping job 450 of 788
Finished scraping job 480 of 788
Finished scraping job 510 of 788
Finished scraping job 540 of 788
Finished scraping job 570 of 788
Finished scraping job 600 of 788
Finished scraping job 630 of 788
Finished scraping job 660 of 788
Finished scraping job 690 of 788
Finished scraping job 720 of 788
Finished scraping job 750 of 788
Finished scraping job 780 of 788
Finished scraping job 786 of 788
time: 1h 18min 8s


In [617]:
# Wisconsin: run the search, then scrape it into Job_Data/wisconsin.json
initialize_glassdoor_search(browser, location='wisconsin' + ', us')
scrape_jobs(browser, outfile="Job_Data/wisconsin.json")

Beginning scrape of 232 jobs
Beginning industry 
Finished scraping job 30 of 232
Finished scraping job 60 of 232
Finished scraping job 90 of 232
Finished scraping job 120 of 232
Finished scraping job 150 of 232
Finished scraping job 180 of 232
Finished scraping job 210 of 232
Finished scraping job 232 of 232
time: 23min 32s


In [618]:
# Minnesota: run the search, then scrape it into Job_Data/minnesota.json
initialize_glassdoor_search(browser, location='minnesota' + ', us')
scrape_jobs(browser, outfile="Job_Data/minnesota.json")

Beginning scrape of 290 jobs
Beginning industry 
Finished scraping job 30 of 290
Finished scraping job 60 of 290
Finished scraping job 90 of 290
Finished scraping job 120 of 290
Finished scraping job 150 of 290
Finished scraping job 180 of 290
Finished scraping job 210 of 290
Finished scraping job 240 of 290
Finished scraping job 270 of 290
Finished scraping job 290 of 290
time: 30min 28s


In [619]:
# Alabama: run the search, then scrape it into Job_Data/alabama.json
initialize_glassdoor_search(browser, location='alabama' + ', us')
scrape_jobs(browser, outfile="Job_Data/alabama.json")

Beginning scrape of 114 jobs
Beginning industry 
Finished scraping job 30 of 114
Finished scraping job 60 of 114
Finished scraping job 90 of 114
Finished scraping job 114 of 114
time: 14min 31s


In [620]:
# Tennessee: run the search, then scrape it into Job_Data/tennessee.json
initialize_glassdoor_search(browser, location='tennessee' + ', us')
scrape_jobs(browser, outfile="Job_Data/tennessee.json")

Beginning scrape of 197 jobs
Beginning industry 
Finished scraping job 30 of 197
Finished scraping job 60 of 197
Finished scraping job 90 of 197
Finished scraping job 120 of 197
Finished scraping job 150 of 197
Finished scraping job 180 of 197
Finished scraping job 198 of 197
time: 27min 20s


In [623]:
# Florida: run the search, then scrape it into Job_Data/florida.json
initialize_glassdoor_search(browser, location='florida' + ', us')
scrape_jobs(browser, outfile="Job_Data/florida.json")

Beginning scrape of 616 jobs
Beginning industry 
Finished scraping job 30 of 616
Finished scraping job 60 of 616
Finished scraping job 90 of 616
Finished scraping job 120 of 616
Finished scraping job 150 of 616
Finished scraping job 180 of 616
Finished scraping job 210 of 616
Finished scraping job 240 of 616
Finished scraping job 270 of 616
Finished scraping job 300 of 616
Finished scraping job 330 of 616
Finished scraping job 360 of 616
Finished scraping job 390 of 616
Finished scraping job 420 of 616
Finished scraping job 450 of 616
Finished scraping job 480 of 616
Finished scraping job 510 of 616
Finished scraping job 540 of 616
Finished scraping job 570 of 616
Finished scraping job 600 of 616
Finished scraping job 615 of 616
time: 1h 22min 48s


In [625]:
# Ohio: run the search, then scrape it into Job_Data/ohio.json
initialize_glassdoor_search(browser, location='ohio' + ', us')
scrape_jobs(browser, outfile="Job_Data/ohio.json")

Beginning scrape of 544 jobs
Beginning industry 
Finished scraping job 30 of 544
Finished scraping job 60 of 544
Finished scraping job 90 of 544
Finished scraping job 120 of 544
Finished scraping job 150 of 544
Finished scraping job 180 of 544
Finished scraping job 210 of 544
Finished scraping job 240 of 544
Finished scraping job 270 of 544
Finished scraping job 300 of 544
Finished scraping job 330 of 544
Finished scraping job 360 of 544
Finished scraping job 390 of 544
Finished scraping job 420 of 544
Finished scraping job 450 of 544
Finished scraping job 480 of 544
Finished scraping job 510 of 544
Finished scraping job 540 of 544
Finished scraping job 545 of 544
time: 1h 15min 54s


In [627]:
# Michigan: run the search, then scrape it into Job_Data/michigan.json
initialize_glassdoor_search(browser, location='michigan' + ', us')
scrape_jobs(browser, outfile="Job_Data/michigan.json")

Beginning scrape of 350 jobs
Beginning industry 
Finished scraping job 30 of 350
Finished scraping job 60 of 350
Finished scraping job 90 of 350
Finished scraping job 120 of 350
Finished scraping job 150 of 350
Finished scraping job 180 of 350
Finished scraping job 210 of 350
Finished scraping job 240 of 350
Finished scraping job 270 of 350
Finished scraping job 300 of 350
Finished scraping job 330 of 350
Finished scraping job 350 of 350
time: 44min 3s


In [637]:
# Connecticut: run the search, then scrape it into Job_Data/connecticut.json
initialize_glassdoor_search(browser, location='connecticut' + ', us')
scrape_jobs(browser, outfile="Job_Data/connecticut.json")

Beginning scrape of 355 jobs
Beginning industry 
Finished scraping job 30 of 355
Finished scraping job 60 of 355
Finished scraping job 90 of 355
Finished scraping job 120 of 355
Finished scraping job 150 of 355
Finished scraping job 180 of 355
Finished scraping job 210 of 355
Finished scraping job 240 of 355
Finished scraping job 270 of 355
Finished scraping job 300 of 355
Finished scraping job 330 of 355
Finished scraping job 355 of 355
time: 33min 13s


In [640]:
# Nebraska: run the search, then scrape it into Job_Data/nebraska.json
initialize_glassdoor_search(browser, location='nebraska' + ', us')
scrape_jobs(browser, outfile="Job_Data/nebraska.json")

Beginning scrape of 61 jobs
Beginning industry 
Finished scraping job 30 of 61
Finished scraping job 60 of 61
Finished scraping job 60 of 61
time: 8min 23s


In [642]:
# Mississippi: run the search, then scrape it into Job_Data/mississippi.json
initialize_glassdoor_search(browser, location='mississippi' + ', us')
scrape_jobs(browser, outfile="Job_Data/mississippi.json")

Beginning scrape of 24 jobs
Beginning industry 
Finished scraping job 24 of 24
time: 3min 39s


In [648]:
# Georgia: run the search, then scrape it into Job_Data/georgia.json
initialize_glassdoor_search(browser, location='georgia' + ', us')
scrape_jobs(browser, outfile="Job_Data/georgia.json")

Beginning scrape of 561 jobs
Beginning industry 
Finished scraping job 30 of 561
Finished scraping job 60 of 561
Finished scraping job 90 of 561
Finished scraping job 120 of 561
Finished scraping job 150 of 561
Finished scraping job 180 of 561
Finished scraping job 210 of 561
Finished scraping job 240 of 561
Finished scraping job 270 of 561
Finished scraping job 300 of 561
Finished scraping job 330 of 561
Finished scraping job 360 of 561
Finished scraping job 390 of 561
Finished scraping job 420 of 561
Finished scraping job 450 of 561
Finished scraping job 480 of 561
Finished scraping job 510 of 561
Finished scraping job 540 of 561
Finished scraping job 560 of 561
time: 55min 5s


In [650]:
# Wyoming: run the search, then scrape it into Job_Data/wyoming.json
initialize_glassdoor_search(browser, location='wyoming' + ', us')
scrape_jobs(browser, outfile="Job_Data/wyoming.json")

Beginning scrape of 5 jobs
Beginning industry 
Finished scraping job 5 of 5
time: 1min 41s


In [658]:
# Vermont: run the search, then scrape it into Job_Data/vermont.json
initialize_glassdoor_search(browser, location='vermont' + ', us')
scrape_jobs(browser, outfile="Job_Data/vermont.json")

Beginning scrape of 18 jobs
Beginning industry 
Finished scraping job 18 of 18
time: 3min 5s


In [660]:
# North Dakota: run the search, then scrape it into Job_Data/north_dakota.json
initialize_glassdoor_search(browser, location='north dakota' + ', us')
scrape_jobs(browser, outfile="Job_Data/north_dakota.json")

Beginning scrape of 14 jobs
Beginning industry 
Finished scraping job 14 of 14
time: 2min 45s


In [662]:
# Alaska: run the search, then scrape it into Job_Data/alaska.json
initialize_glassdoor_search(browser, location='alaska' + ', us')
scrape_jobs(browser, outfile="Job_Data/alaska.json")

Beginning scrape of 17 jobs
Beginning industry 
Finished scraping job 17 of 17
time: 2min 52s


In [664]:
# South Dakota: run the search, then scrape it into Job_Data/south_dakota.json
initialize_glassdoor_search(browser, location='south dakota' + ', us')
scrape_jobs(browser, outfile="Job_Data/south_dakota.json")

Beginning scrape of 16 jobs
Beginning industry 
Finished scraping job 16 of 16
time: 2min 58s


In [671]:
# Delaware: run the search, then scrape it into Job_Data/delaware.json
initialize_glassdoor_search(browser, location='delaware' + ', us')
scrape_jobs(browser, outfile="Job_Data/delaware.json")

Beginning scrape of 141 jobs
Beginning industry 
Finished scraping job 30 of 141
Finished scraping job 60 of 141
Finished scraping job 90 of 141
Finished scraping job 120 of 141
Finished scraping job 141 of 141
time: 16min 32s


In [672]:
# Montana: run the search, then scrape it into Job_Data/montana.json
initialize_glassdoor_search(browser, location='montana' + ', us')
scrape_jobs(browser, outfile="Job_Data/montana.json")

Beginning scrape of 13 jobs
Beginning industry 
Finished scraping job 13 of 13
time: 2min 25s


In [674]:
# Rhode Island: run the search, then scrape it into Job_Data/rhode_island.json
initialize_glassdoor_search(browser, location='rhode island' + ', us')
scrape_jobs(browser, outfile="Job_Data/rhode_island.json")

Beginning scrape of 63 jobs
Beginning industry 
Finished scraping job 30 of 63
Finished scraping job 60 of 63
Finished scraping job 63 of 63
time: 8min 23s


In [676]:
# New Hampshire: run the search, then scrape it into Job_Data/new_hampshire.json
initialize_glassdoor_search(browser, location='new hampshire' + ', us')
scrape_jobs(browser, outfile="Job_Data/new_hampshire.json")

Beginning scrape of 64 jobs
Beginning industry 
Finished scraping job 30 of 64
Finished scraping job 60 of 64
Finished scraping job 64 of 64
time: 8min 44s


In [678]:
# Maine: run the search, then scrape it into Job_Data/maine.json
initialize_glassdoor_search(browser, location='maine' + ', us')
scrape_jobs(browser, outfile="Job_Data/maine.json")

Beginning scrape of 51 jobs
Beginning industry 
Finished scraping job 30 of 51
Finished scraping job 51 of 51
time: 7min 9s


In [680]:
# Hawaii: run the search, then scrape it into Job_Data/hawaii.json
initialize_glassdoor_search(browser, location='hawaii' + ', us')
scrape_jobs(browser, outfile="Job_Data/hawaii.json")

Beginning scrape of 28 jobs
Beginning industry 
Finished scraping job 28 of 28
time: 4min 31s


In [682]:
# Idaho: run the search, then scrape it into Job_Data/idaho.json
initialize_glassdoor_search(browser, location='idaho' + ', us')
scrape_jobs(browser, outfile="Job_Data/idaho.json")

Beginning scrape of 50 jobs
Beginning industry 
Finished scraping job 30 of 50
Finished scraping job 50 of 50
time: 7min 8s


In [684]:
# West Virginia: run the search, then scrape it into Job_Data/west_virginia.json
initialize_glassdoor_search(browser, location='west virginia' + ', us')
scrape_jobs(browser, outfile="Job_Data/west_virginia.json")

Beginning scrape of 18 jobs
Beginning industry 
Finished scraping job 18 of 18
time: 3min 22s


In [686]:
# New Mexico: run the search, then scrape it into Job_Data/new_mexico.json
initialize_glassdoor_search(browser, location='new mexico' + ', us')
scrape_jobs(browser, outfile="Job_Data/new_mexico.json")

Beginning scrape of 88 jobs
Beginning industry 
Finished scraping job 30 of 88
Finished scraping job 60 of 88
Finished scraping job 88 of 88
time: 11min 35s


In [688]:
# Nevada: run the search, then scrape it into Job_Data/nevada.json
initialize_glassdoor_search(browser, location='nevada' + ', us')
scrape_jobs(browser, outfile="Job_Data/nevada.json")

Beginning scrape of 55 jobs
Beginning industry 
Finished scraping job 30 of 55
Finished scraping job 55 of 55
time: 8min 28s


In [693]:
# Kansas: run the search, then scrape it into Job_Data/kansas.json
initialize_glassdoor_search(browser, location='kansas' + ', us')
scrape_jobs(browser, outfile="Job_Data/kansas.json")

Beginning scrape of 73 jobs
Beginning industry 
Finished scraping job 30 of 73
Finished scraping job 60 of 73
Finished scraping job 73 of 73
time: 10min 3s


In [695]:
# Virginia: run the search, then scrape it into Job_Data/virginia.json
initialize_glassdoor_search(browser, location='virginia' + ', us')
scrape_jobs(browser, outfile="Job_Data/virginia.json")

Beginning scrape of 911 jobs
Beginning industry 1000
Finished scraping job 30 of 911
Finished scraping job 40 of 911
Beginning industry 1002
Finished scraping job 70 of 911
Finished scraping job 100 of 911
Finished scraping job 130 of 911
Finished scraping job 160 of 911
Finished scraping job 182 of 911
Beginning industry 1020
Finished scraping job 212 of 911
Finished scraping job 238 of 911
Beginning industry 1021
Finished scraping job 252 of 911
Beginning industry 1022
Finished scraping job 263 of 911
Beginning industry 1027
Finished scraping job 293 of 911
Finished scraping job 323 of 911
Finished scraping job 353 of 911
Finished scraping job 380 of 911
Beginning industry 1031
Finished scraping job 410 of 911
Finished scraping job 440 of 911
Finished scraping job 453 of 911
Beginning industry 1032
Finished scraping job 463 of 911
Beginning industry 1043
Finished scraping job 493 of 911
Finished scraping job 498 of 911
Beginning industry 1047
Finished scraping job 528 of 911
Finished

In [701]:
# Arkansas: run the search, then scrape it into Job_Data/arkansas.json
initialize_glassdoor_search(browser, location='arkansas' + ', us')
scrape_jobs(browser, outfile="Job_Data/arkansas.json")

Beginning scrape of 52 jobs
Beginning industry 
Finished scraping job 30 of 52
Finished scraping job 52 of 52
time: 7min 4s


In [702]:
# Colorado: run the search, then scrape it into Job_Data/colorado.json
initialize_glassdoor_search(browser, location='colorado' + ', us')
scrape_jobs(browser, outfile="Job_Data/colorado.json")

Beginning scrape of 367 jobs
Beginning industry 
Finished scraping job 30 of 367
Finished scraping job 60 of 367
Finished scraping job 90 of 367
Finished scraping job 120 of 367
Finished scraping job 150 of 367
Finished scraping job 180 of 367
Finished scraping job 210 of 367
Finished scraping job 240 of 367
Finished scraping job 270 of 367
Finished scraping job 300 of 367
Finished scraping job 330 of 367
Finished scraping job 360 of 367
Finished scraping job 366 of 367
time: 35min 8s


In [707]:
# Iowa: run the search, then scrape it into Job_Data/iowa.json
initialize_glassdoor_search(browser, location='iowa' + ', us')
scrape_jobs(browser, outfile="Job_Data/iowa.json")

Beginning scrape of 102 jobs
Beginning industry 
Finished scraping job 30 of 102
Finished scraping job 60 of 102
Finished scraping job 90 of 102
Finished scraping job 102 of 102
time: 11min 36s


In [710]:
# Arizona: run the search, then scrape it into Job_Data/arizona.json
initialize_glassdoor_search(browser, location='arizona' + ', us')
scrape_jobs(browser, outfile="Job_Data/arizona.json")

Beginning scrape of 252 jobs
Beginning industry 
Finished scraping job 30 of 252
Finished scraping job 60 of 252
Finished scraping job 90 of 252
Finished scraping job 120 of 252
Finished scraping job 150 of 252
Finished scraping job 180 of 252
Finished scraping job 210 of 252
Finished scraping job 240 of 252
Finished scraping job 252 of 252
time: 24min 56s


In [713]:
# Washington DC: run the search, then scrape it into Job_Data/washington_dc.json
initialize_glassdoor_search(browser, location='washington dc' + ', us')
scrape_jobs(browser, outfile="Job_Data/washington_dc.json")

Beginning scrape of 425 jobs
Beginning industry 
Finished scraping job 30 of 425
Finished scraping job 60 of 425
Finished scraping job 90 of 425
Finished scraping job 120 of 425
Finished scraping job 150 of 425
Finished scraping job 180 of 425
Finished scraping job 210 of 425
Finished scraping job 240 of 425
Finished scraping job 270 of 425
Finished scraping job 300 of 425
Finished scraping job 330 of 425
Finished scraping job 360 of 425
Finished scraping job 390 of 425
Finished scraping job 420 of 425
Finished scraping job 425 of 425
time: 41min 32s


In [718]:
# Utah: run the search, then scrape it into Job_Data/utah.json
initialize_glassdoor_search(browser, location='utah' + ', us')
scrape_jobs(browser, outfile="Job_Data/utah.json")

Beginning scrape of 136 jobs
Beginning industry 
Finished scraping job 30 of 136
Finished scraping job 60 of 136
Finished scraping job 90 of 136
Finished scraping job 120 of 136
Finished scraping job 136 of 136
time: 15min 31s


In [720]:
# North Carolina: run the search, then scrape it into Job_Data/north_carolina.json
initialize_glassdoor_search(browser, location='north carolina' + ', us')
scrape_jobs(browser, outfile="Job_Data/north_carolina.json")

Beginning scrape of 670 jobs
Beginning industry 
Finished scraping job 30 of 670
Finished scraping job 60 of 670
Finished scraping job 90 of 670
Finished scraping job 120 of 670
Finished scraping job 150 of 670
Finished scraping job 180 of 670
Finished scraping job 210 of 670
Finished scraping job 240 of 670
Finished scraping job 270 of 670
Finished scraping job 300 of 670
Finished scraping job 330 of 670
Finished scraping job 360 of 670
Finished scraping job 390 of 670
Finished scraping job 420 of 670
Finished scraping job 450 of 670
Finished scraping job 480 of 670
Finished scraping job 510 of 670
Finished scraping job 540 of 670
Finished scraping job 570 of 670
Finished scraping job 600 of 670
Finished scraping job 630 of 670
Finished scraping job 660 of 670
Finished scraping job 670 of 670
time: 1h 25min 23s


In [731]:
# South Carolina: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='south carolina' + ', us',
)
scrape_jobs(browser, "Job_Data/south_carolina.json")

Beginning scrape of 72 jobs
Beginning industry 
Finished scraping job 30 of 72
Finished scraping job 60 of 72
Finished scraping job 72 of 72
time: 8min 16s


In [732]:
# Oregon: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='oregon' + ', us',
)
scrape_jobs(browser, "Job_Data/oregon.json")

Beginning scrape of 190 jobs
Beginning industry 
Finished scraping job 30 of 190
Finished scraping job 60 of 190
Finished scraping job 90 of 190
Finished scraping job 120 of 190
Finished scraping job 150 of 190
Finished scraping job 180 of 190
Finished scraping job 190 of 190
time: 25min 30s


In [734]:
# Louisiana: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='louisiana' + ', us',
)
scrape_jobs(browser, "Job_Data/louisiana.json")

Beginning scrape of 79 jobs
Beginning industry 
Finished scraping job 30 of 79
Finished scraping job 60 of 79
Finished scraping job 79 of 79
time: 11min 12s


In [736]:
# Kentucky: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='kentucky' + ', us',
)
scrape_jobs(browser, "Job_Data/kentucky.json")

Beginning scrape of 63 jobs
Beginning industry 
Finished scraping job 30 of 63
Finished scraping job 60 of 63
Finished scraping job 63 of 63
time: 9min 25s


In [738]:
# Oklahoma: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='oklahoma' + ', us',
)
scrape_jobs(browser, "Job_Data/oklahoma.json")

Beginning scrape of 47 jobs
Beginning industry 
Finished scraping job 30 of 47
Finished scraping job 47 of 47
time: 7min 51s


In [742]:
# Indiana: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='indiana' + ', us',
)
scrape_jobs(browser, "Job_Data/indiana.json")

Beginning scrape of 211 jobs
Beginning industry 
Finished scraping job 30 of 211
Finished scraping job 60 of 211
Finished scraping job 90 of 211
Finished scraping job 120 of 211
Finished scraping job 150 of 211
Finished scraping job 180 of 211
Finished scraping job 210 of 211
Finished scraping job 211 of 211
time: 27min 21s


In [743]:
# Missouri: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='missouri' + ', us',
)
scrape_jobs(browser, "Job_Data/missouri.json")

Beginning scrape of 397 jobs
Beginning industry 
Finished scraping job 30 of 397
Finished scraping job 60 of 397
Finished scraping job 90 of 397
Finished scraping job 120 of 397
Finished scraping job 150 of 397
Finished scraping job 180 of 397
Finished scraping job 210 of 397
Finished scraping job 240 of 397
Finished scraping job 270 of 397
Finished scraping job 300 of 397
Finished scraping job 330 of 397
Finished scraping job 360 of 397
Finished scraping job 390 of 397
Finished scraping job 397 of 397
time: 51min 26s


In [748]:
# New Jersey: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='new jersey' + ', us',
)
scrape_jobs(browser, "Job_Data/new_jersey.json")

Beginning scrape of 858 jobs
Beginning industry 
Finished scraping job 30 of 858
Finished scraping job 60 of 858
Finished scraping job 90 of 858
Finished scraping job 120 of 858
Finished scraping job 150 of 858
Finished scraping job 180 of 858
Finished scraping job 210 of 858
Finished scraping job 240 of 858
Finished scraping job 270 of 858
Finished scraping job 300 of 858
Finished scraping job 330 of 858
Finished scraping job 360 of 858
Finished scraping job 390 of 858
Finished scraping job 420 of 858
Finished scraping job 450 of 858
Finished scraping job 480 of 858
Finished scraping job 510 of 858
Finished scraping job 540 of 858
Finished scraping job 570 of 858
Finished scraping job 600 of 858
Finished scraping job 630 of 858
Finished scraping job 660 of 858
Finished scraping job 690 of 858
Finished scraping job 720 of 858
Finished scraping job 750 of 858
Finished scraping job 780 of 858
Finished scraping job 810 of 858
Finished scraping job 840 of 858
Finished scraping job 866 of 858

In [750]:
# Maryland: run the Glassdoor search for this location (default keyword),
# then pass the same browser to scrape_jobs along with this location's JSON path.
initialize_glassdoor_search(
    browser,
    location='maryland' + ', us',
)
scrape_jobs(browser, "Job_Data/maryland.json")

Beginning scrape of 756 jobs
Beginning industry 
Finished scraping job 30 of 756
Finished scraping job 60 of 756
Finished scraping job 90 of 756
Finished scraping job 120 of 756
Finished scraping job 150 of 756
Finished scraping job 180 of 756
Finished scraping job 210 of 756
Finished scraping job 240 of 756
Finished scraping job 270 of 756
Finished scraping job 300 of 756
Finished scraping job 330 of 756
Finished scraping job 360 of 756
Finished scraping job 390 of 756
Finished scraping job 420 of 756
Finished scraping job 450 of 756
Finished scraping job 480 of 756
Finished scraping job 510 of 756
Finished scraping job 540 of 756
Finished scraping job 570 of 756
Finished scraping job 600 of 756
Finished scraping job 630 of 756
Finished scraping job 660 of 756
Finished scraping job 690 of 756
Finished scraping job 720 of 756
Finished scraping job 750 of 756
Finished scraping job 756 of 756
time: 1h 32min 28s


In [None]:
# 40.6 hours of total scraping