In [24]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
    WebDriverException,
)

In [27]:
# Fallbacks used when get_jobs() is called without ``path`` / ``url``.
_DEFAULT_CHROMEDRIVER_PATH = "/Users/cd/Desktop/Projects/ds_salary/chromedriver"
_DEFAULT_SEARCH_URL = 'https://www.glassdoor.com.au/Job/boston-data-science-jobs-SRCH_IL.0,6_IC1154532_KO7,19.htm?clickSource=searchBtn&typedKeyword=data+science&sc.keyword=Data+Science&locT=C&suggestCount=0&jobType=&locId=1154532&suggestChosen=true&countryRedirect=true'

# Value stored for any field that cannot be found on the page.
_NOT_FOUND = -1

# One entry per Company-tab field:
# (key in the output record, label printed in verbose mode, <label> text on the page)
_COMPANY_FIELDS = (
    ("Headquarters", "Headquarters", "Headquarters"),
    ("Size", "Size", "Size"),
    ("Founded", "Founded", "Founded"),
    ("Type of ownership", "Type of Ownership", "Type"),
    ("Industry", "Industry", "Industry"),
    ("Sector", "Sector", "Sector"),
    ("Revenue", "Revenue", "Revenue"),
    ("Competitors", "Competitors", "Competitors"),
)

# Markup being matched:
# <div class="infoEntity">
#     <label>Headquarters</label>
#     <span class="value">San Francisco, CA</span>
# </div>
_COMPANY_VALUE_XPATH = './/div[@class="infoEntity"]//label[text()="{}"]//following-sibling::*'


def _text_or_not_found(scope, xpath):
    """Return the text of the first element matching ``xpath`` under ``scope``, or -1 if none matches."""
    try:
        return scope.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return _NOT_FOUND


def _read_job_details(driver, max_attempts=5, retry_delay=5):
    """Read the mandatory fields of the currently opened listing.

    Retries up to ``max_attempts`` times, sleeping ``retry_delay`` seconds
    between attempts, because the detail pane may not be rendered yet.
    Returns a dict with the four mandatory fields, or ``None`` if they could
    not be read within the allowed attempts.
    """
    for attempt in range(max_attempts):
        try:
            return {
                "Company Name": driver.find_element_by_xpath('.//div[@class="employerName"]').text,
                "Location": driver.find_element_by_xpath('.//div[@class="location"]').text,
                "Job Title": driver.find_element_by_xpath('.//div[contains(@class, "title")]').text,
                "Job Description": driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text,
            }
        except WebDriverException:
            # Only Selenium errors are retried, so Ctrl-C (KeyboardInterrupt)
            # and programming errors are no longer swallowed.
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)
    return None


def get_jobs(keyword, num_jobs, verbose, path=None, url=None):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Uses the Selenium 3 style API (``executable_path``, ``find_element_by_*``)
    that the rest of this notebook relies on.

    Parameters
    ----------
    keyword : str
        Accepted for interface compatibility. NOTE: it is not used to build
        the search; the page that gets scraped is determined by ``url``.
    num_jobs : int
        Target number of listings. Scraping stops once this many have been
        collected, or earlier if there is no "next page" link.
    verbose : bool
        When true, print every collected field for each listing.
    path : str, optional
        Location of the chromedriver executable. Defaults to the path that
        was previously hard-coded in this function.
    url : str, optional
        Search-results page to start from. Defaults to the Boston
        "Data Science" search that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per listing. Fields missing from the page are stored as -1.

    Raises
    ------
    selenium.common.exceptions.WebDriverException
        For browser errors that are not handled explicitly below. The Chrome
        session is closed before the exception propagates.
    '''
    if path is None:
        path = _DEFAULT_CHROMEDRIVER_PATH
    if url is None:
        url = _DEFAULT_SEARCH_URL

    #Initializing the webdriver
    options = webdriver.ChromeOptions()
    #Add the 'headless' argument to ``options`` here to scrape without a visible window.
    driver = webdriver.Chrome(executable_path=path, options=options)

    #From here on the browser exists, so always close it: on success, on error and on Ctrl-C.
    try:
        driver.set_window_size(1120, 1000)
        driver.get(url)
        jobs = []

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.
            #Let the page load. Change this number based on your internet speed.
            #Or, wait until the webpage is loaded instead of hardcoding it.
            time.sleep(4)

            #Test for the "Sign Up" prompt and get rid of it.
            #The click is best-effort: the element may be covered or missing altogether.
            try:
                driver.find_element_by_class_name("selected").click()
            except (ElementClickInterceptedException, NoSuchElementException):
                pass
            time.sleep(.1)

            try:
                driver.find_element_by_class_name("ModalStyle_xBtn_29PT9").click()  #Clicking to the X.
            except NoSuchElementException:
                pass

            #Going through each job in this page
            job_buttons = driver.find_elements_by_class_name("jl")  #"jl" = Job Listing. These are the buttons we're going to click
            for job_button in job_buttons:
                if len(jobs) >= num_jobs:
                    break
                print("Progress: {}/{}".format(len(jobs), num_jobs))

                try:
                    job_button.click()
                except ElementNotInteractableException:
                    continue
                time.sleep(1)

                details = _read_job_details(driver)
                if details is None:
                    #Give up on this listing instead of waiting for it forever.
                    print("Skipping a listing: its details could not be read.")
                    continue
                company_name = details["Company Name"]
                location = details["Location"]
                job_title = details["Job Title"]
                job_description = details["Job Description"]

                #The salary lives inside the listing button, not in the detail pane.
                try:
                    salary_estimate = job_button.find_element_by_xpath('.//span[@class="gray salary"]').text
                    print(salary_estimate)
                except NoSuchElementException:
                    salary_estimate = _NOT_FOUND

                rating = _text_or_not_found(driver, './/span[@class="rating"]')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))

                #Going to the Company tab...
                #clicking on this:
                #<div class="tab" data-tab-type="overview"><span>Company</span></div>
                try:
                    driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()
                    has_company_tab = True
                except NoSuchElementException:  #Rarely, some job postings do not have the "Company" tab.
                    has_company_tab = False

                company = {}
                for record_key, _, page_label in _COMPANY_FIELDS:
                    if has_company_tab:
                        company[record_key] = _text_or_not_found(driver, _COMPANY_VALUE_XPATH.format(page_label))
                    else:
                        company[record_key] = _NOT_FOUND

                if verbose:
                    for record_key, shown_label, _ in _COMPANY_FIELDS:
                        print("{}: {}".format(shown_label, company[record_key]))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                #add job to jobs (key order defines the DataFrame column order)
                record = {
                    "Job Title": job_title,
                    "Salary Estimate": salary_estimate,
                    "Job Description": job_description,
                    "Rating": rating,
                    "Company Name": company_name,
                    "Location": location,
                }
                record.update(company)
                jobs.append(record)

            #Clicking on the "next page" button
            try:
                driver.find_element_by_xpath('.//li[@class="next"]//a').click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break

        return pd.DataFrame(jobs)  #Converts the list of per-job dicts into a pandas DataFrame
    finally:
        driver.quit()



In [31]:

# Open a new Chrome window and scrape until 500 listings are collected (or the
# result pages run out), without per-listing debug output.
# NOTE: get_jobs scrapes a fixed search URL, so the keyword passed here does
# not change which listings are returned.
df = get_jobs(keyword="data scientist", num_jobs=500, verbose=False)
df

num_jobs 500
job: []
Progress: 0/500
$96K-$101K (Glassdoor Est.)
Progress: 1/500
$154K-$248K (Glassdoor Est.)
Progress: 2/500
Progress: 3/500
$60K-$63K (Glassdoor Est.)
Progress: 4/500
$75K-$131K (Glassdoor Est.)
Progress: 5/500
$84K-$137K (Glassdoor Est.)
Progress: 6/500
Progress: 6/500
$99K-$174K (Glassdoor Est.)
Progress: 7/500
Progress: 7/500
$50K-$96K (Glassdoor Est.)
Progress: 8/500
Progress: 8/500
$86K-$125K (Glassdoor Est.)
Progress: 9/500
$72K-$124K (Glassdoor Est.)
Progress: 10/500
Progress: 10/500
$101K-$130K (Glassdoor Est.)
Progress: 11/500
Progress: 11/500
$93K-$127K (Glassdoor Est.)
Progress: 12/500
$83K-$100K (Glassdoor Est.)
Progress: 13/500
Progress: 13/500
$65K-$118K (Glassdoor Est.)
Progress: 14/500
Progress: 14/500
$147K-$195K (Glassdoor Est.)
Progress: 15/500
Progress: 15/500
Progress: 16/500
Progress: 16/500
$120K-$140K(Employer Est.)
Progress: 17/500
$46K-$56K (Glassdoor Est.)
Progress: 18/500
Progress: 18/500
Progress: 19/500
num_jobs 500
job: [{'Job Title': 'D

Progress: 19/500
$53K-$94K (Glassdoor Est.)
Progress: 20/500
$150K-$244K (Glassdoor Est.)
Progress: 21/500
$104K-$198K (Glassdoor Est.)
Progress: 22/500
$90K-$147K (Glassdoor Est.)
Progress: 23/500
$37K-$68K (Glassdoor Est.)
Progress: 24/500
$97K-$158K (Glassdoor Est.)
Progress: 25/500
$68K-$115K (Glassdoor Est.)
Progress: 26/500
Progress: 26/500
$59K-$111K (Glassdoor Est.)
Progress: 27/500
Progress: 27/500
$84K-$137K (Glassdoor Est.)
Progress: 28/500
Progress: 29/500
Progress: 29/500
$134K-$261K (Glassdoor Est.)
Progress: 30/500
$110K-$163K (Glassdoor Est.)
Progress: 31/500
$67K-$115K (Glassdoor Est.)
Progress: 32/500
$83K-$144K (Glassdoor Est.)
Progress: 33/500
$39K-$69K (Glassdoor Est.)
Progress: 34/500
$81K-$143K (Glassdoor Est.)
Progress: 35/500
$161K-$257K (Glassdoor Est.)
Progress: 36/500
Progress: 37/500
$39K-$73K (Glassdoor Est.)
Progress: 38/500
$47K-$57K (Glassdoor Est.)
Progress: 39/500
$99K-$174K (Glassdoor Est.)
Progress: 40/500
$75K-$133K (Glassdoor Est.)
Progress: 41/50

Progress: 45/500
$95K-$118K (Glassdoor Est.)
Progress: 46/500
$101K-$157K (Glassdoor Est.)
Progress: 47/500
$101K-$133K (Glassdoor Est.)
Progress: 48/500
$87K-$115K (Glassdoor Est.)
Progress: 49/500
$116K-$135K (Glassdoor Est.)
Progress: 50/500
$112K-$202K (Glassdoor Est.)
Progress: 51/500
Progress: 51/500
$65K-$125K (Glassdoor Est.)
Progress: 52/500
Progress: 53/500
$105K-$170K (Glassdoor Est.)
Progress: 54/500
$128K-$204K (Glassdoor Est.)
Progress: 55/500
Progress: 56/500
Progress: 57/500
$105K-$172K (Glassdoor Est.)
Progress: 58/500
Progress: 58/500
$65K-$124K (Glassdoor Est.)
Progress: 59/500
$90K-$103K (Glassdoor Est.)
Progress: 60/500
$101K-$133K (Glassdoor Est.)
Progress: 61/500
$60K-$117K (Glassdoor Est.)
Progress: 62/500
$117K-$182K (Glassdoor Est.)
Progress: 63/500
Progress: 64/500
$73K-$138K (Glassdoor Est.)
Progress: 65/500
$61K-$69K (Glassdoor Est.)
Progress: 66/500
$94K-$176K (Glassdoor Est.)
Progress: 67/500
Progress: 68/500
$115K-$140K (Glassdoor Est.)
Progress: 69/500


Progress: 73/500
Progress: 74/500
$70K-$124K (Glassdoor Est.)
Progress: 75/500
$68K-$132K (Glassdoor Est.)
Progress: 76/500
Progress: 77/500
$93K-$152K (Glassdoor Est.)
Progress: 78/500
$84K-$170K (Glassdoor Est.)
Progress: 79/500
$104K-$186K (Glassdoor Est.)
Progress: 80/500
$84K-$157K (Glassdoor Est.)
Progress: 81/500
$17-$27 Per Hour(Glassdoor Est.)
Progress: 82/500
$75K-$83K (Glassdoor Est.)
Progress: 83/500
$99K-$130K (Glassdoor Est.)
Progress: 84/500
$169K-$281K (Glassdoor Est.)
Progress: 85/500
$69K-$115K (Glassdoor Est.)
Progress: 86/500
Progress: 86/500
$87K-$127K (Glassdoor Est.)
Progress: 87/500
$63K-$81K (Glassdoor Est.)
Progress: 88/500
$78K-$125K (Glassdoor Est.)
Progress: 89/500
$89K-$147K (Glassdoor Est.)
Progress: 90/500
$129K-$164K (Glassdoor Est.)
Progress: 91/500
$99K-$128K (Glassdoor Est.)
Progress: 92/500
$73K-$123K (Glassdoor Est.)
Progress: 93/500
Progress: 94/500
$101K-$153K (Glassdoor Est.)
Progress: 95/500
Progress: 96/500
$98K-$142K (Glassdoor Est.)
Progress

Progress: 101/500
$121K-$130K (Glassdoor Est.)
Progress: 102/500
$76K-$138K (Glassdoor Est.)
Progress: 103/500
Progress: 104/500
$48K-$78K (Glassdoor Est.)
Progress: 105/500
Progress: 106/500
$118K-$187K (Glassdoor Est.)
Progress: 107/500
$85K-$106K (Glassdoor Est.)
Progress: 108/500
$47K-$85K (Glassdoor Est.)
Progress: 109/500
Progress: 109/500
$36K-$64K (Glassdoor Est.)
Progress: 110/500
Progress: 111/500
Progress: 112/500
$107K-$161K (Glassdoor Est.)
Progress: 113/500
$110K-$173K (Glassdoor Est.)
Progress: 114/500
$113K-$115K (Glassdoor Est.)
Progress: 115/500
Progress: 116/500
$86K-$159K (Glassdoor Est.)
Progress: 117/500
$48K-$92K (Glassdoor Est.)
Progress: 118/500
$86K-$176K (Glassdoor Est.)
Progress: 119/500
Progress: 120/500
$93K-$184K (Glassdoor Est.)
Progress: 121/500
Progress: 122/500
Progress: 123/500
Progress: 124/500
$115K-$135K (Glassdoor Est.)
Progress: 125/500
$70K-$115K (Glassdoor Est.)
Progress: 126/500
Progress: 127/500
Progress: 128/500
$95K-$164K (Glassdoor Est.)


Progress: 130/500
$93K-$149K (Glassdoor Est.)
Progress: 131/500
Progress: 132/500
$125K-$189K (Glassdoor Est.)
Progress: 133/500
$123K-$201K (Glassdoor Est.)
Progress: 134/500
$76K-$176K (Glassdoor Est.)
Progress: 135/500
$92K-$121K (Glassdoor Est.)
Progress: 136/500
$62K-$89K (Glassdoor Est.)
Progress: 137/500
Progress: 138/500
$41K-$76K (Glassdoor Est.)
Progress: 139/500
$38K-$81K (Glassdoor Est.)
Progress: 140/500
$101K-$135K (Glassdoor Est.)
Progress: 141/500
$56K-$93K (Glassdoor Est.)
Progress: 142/500
$68K-$113K (Glassdoor Est.)
Progress: 143/500
$153K-$306K (Glassdoor Est.)
Progress: 144/500
$55K-$101K (Glassdoor Est.)
Progress: 145/500
$101K-$133K (Glassdoor Est.)
Progress: 146/500
$72K-$117K (Glassdoor Est.)
Progress: 147/500
$95K-$168K (Glassdoor Est.)
Progress: 148/500
$71K-$118K (Glassdoor Est.)
Progress: 149/500
$101K-$112K (Glassdoor Est.)
Progress: 150/500
$44K-$87K (Glassdoor Est.)
Progress: 151/500
Progress: 152/500
Progress: 153/500
$112K-$138K (Glassdoor Est.)
Progre

Progress: 159/500


KeyboardInterrupt: 

In [32]:
# Save the scraped listings to disk. The DataFrame index is written as the
# first, unnamed CSV column.
df.to_csv(path_or_buf='dataScienceBoston.csv')

In [33]:
# Preview the first five rows of the scraped data.
df.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,Data Scientist / Machine Learning Expert,$96K-$101K (Glassdoor Est.),Posting Title\nData Scientist / Machine Learni...,3.9,Novartis\n3.9,"Cambridge, MA","Basel, Switzerland",10000+ employees,1996,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$10+ billion (AUD),-1
1,"Director II, Data Science - Office of Data Sci...",$154K-$248K (Glassdoor Est.),Help shape the future of Data Science across L...,3.4,Liberty Mutual Insurance\n3.4,"Boston, MA","Boston, MA",10000+ employees,1912,Company - Private,Insurance Operators,Insurance,$10+ billion (AUD),"Travelers, Allstate, State Farm"
2,Internship: Intelligent Coding,-1,The Signal Processing group at MERL is seeking...,3.8,Mitsubishi Electric Research Labs\n3.8,"Cambridge, MA","Cambridge, MA",51 to 200 employees,1991,Subsidiary or Business Segment,Research & Development,Business Services,$10 to $25 million (AUD),"Google, Amazon, NVIDIA"
3,Data Scientist,$75K-$131K (Glassdoor Est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (AUD),-1
4,Quantitative Data Analyst,$60K-$63K (Glassdoor Est.),"At Cogo Labs, we build startup companies from ...",3.6,Cogo Labs\n3.6,"Cambridge, MA","Cambridge, MA",51 to 200 employees,2005,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,-1
