In [2]:
import pandas as pd
import os
from bs4 import BeautifulSoup
import requests
import re
import pickle

## Functions

In [3]:
def get_job_urls(url, web_site):
    '''
    Returns urls for jobs posted on a search-results page.

    Arguments:
        url = address of one search-results page
        web_site = prefix that is prepended to every href found on the page
    Outputs:
        list of job posting urls (web_site + href), in the order they appear
        on the page
    '''
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    job_urls = []
    for title_div in soup.find_all('div', {'class': 'title'}):
        anchor = title_div.find('a')
        # A title block without a link (or a link without href) cannot yield
        # a job url; skip it instead of raising and aborting the whole scrape.
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            continue
        # Plain concatenation, as the hrefs on the page are site-relative.
        job_urls.append(web_site + href)

    return job_urls

In [4]:
def get_pages_urls(start_page, max_results=3000, page_size=10):
    '''
    Returns urls for indeed pages with multiple job postings.

    Arguments:
        start_page = url of the first results page
        max_results = exclusive upper bound for the `start` offset
        page_size = step between consecutive `start` offsets
    Outputs:
        list of page urls: start_page itself, followed by
        '<start_page>&start=<offset>' for every offset in
        range(page_size, max_results, page_size)
    '''
    page_urls = [start_page]
    # The offset is appended as a query parameter, so this is plain string
    # formatting rather than a filesystem path join.
    page_urls.extend('{}&start={}'.format(start_page, offset)
                     for offset in range(page_size, max_results, page_size))

    return page_urls

In [5]:
# Discontinued this function (no visible cell calls it)
def get_text(job_url):
    '''
    Returns all text from a job posting page.

    Arguments:
        job_url = url of the job posting page to download
    Outputs:
        text of the page with the contents of <script> and <style> tags removed
    '''
    # Fetch the url that was passed in, not an entry of a global url list.
    job_page = requests.get(job_url)
    # Name the parser explicitly, like the other functions in this notebook,
    # so the result does not depend on which parsers happen to be installed.
    job_soup = BeautifulSoup(job_page.content, 'html.parser')

    # Drop script/style elements so their source does not end up in the text.
    for tag in job_soup(['script', 'style']):
        tag.extract()

    job_text = job_soup.get_text()
    return job_text

In [6]:
# Job title lookup for a single posting url
def get_job_title(url):
    '''
    Downloads a job posting page and returns the text of its title header.

    Arguments:
        url = url of the job posting page
    Outputs:
        text of the first <h3> element carrying the job-header title classes
        (an AttributeError propagates when no such element is found, since
        the lookup then yields None)
    '''
    title_attrs = {'class':'icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title'}

    response = requests.get(url)
    parsed = BeautifulSoup(response.text, 'html.parser')
    header = parsed.find('h3', title_attrs)

    return header.text

In [7]:
# Company name lookup for a single posting url
def get_company_name(url):
    '''
    Downloads a job posting page and returns the hiring company name.

    Arguments:
        url = url of the job posting page
    Outputs:
        text of the first <div> with class 'icl-u-lg-mr--sm icl-u-xs-mr--xs'
        (an AttributeError propagates when no such element is found, since
        the lookup then yields None)
    '''
    company_attrs = {'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'}

    response = requests.get(url)
    parsed = BeautifulSoup(response.text, 'html.parser')
    company_div = parsed.find('div', company_attrs)

    return company_div.text

In [8]:
# Function for getting a salary from one url
def get_salary(url):
    '''
    Extracts a salary from a job posting page.

    Arguments:
        url = url of the job posting page
    Outputs:
        raw text of the first <span> with class 'icl-u-xs-mr--xs'; the text is
        returned as found on the page (no digits-only clean-up is applied).
        An AttributeError propagates when no such element is found, since the
        lookup then yields None.
    '''
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    # NOTE(review): the selector looks like a generic utility class; confirm
    # that the first matching span is always the salary.
    salary = soup.find('span', {'class':'icl-u-xs-mr--xs'}).text

    return salary

In [9]:
# Company location lookup for a single posting url
def get_company_location(url):
    '''
    Downloads a job posting page and returns the location part of its title.

    Arguments:
        url = url of the job posting page
    Outputs:
        the second '-'-separated segment of the page <title> text, returned
        unstripped (an IndexError propagates when the title has no '-')
    '''
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): presumably the title reads '<job> - <location> - ...';
    # a hyphen inside the job title would shift the segments - confirm.
    title_text = parsed.find('title').text
    segments = title_text.split('-')

    return segments[1]

In [10]:
# Job description lookup for a single posting url
def get_job_description(url):
    '''
    Downloads a job posting page and returns its job description text.

    Arguments:
        url = url of the job posting page
    Outputs:
        text of the first <div> with class 'jobsearch-jobDescriptionText'
        (an AttributeError propagates when no such element is found, since
        the lookup then yields None)
    '''
    description_attrs = {'class':'jobsearch-jobDescriptionText'}

    response = requests.get(url)
    parsed = BeautifulSoup(response.text, 'html.parser')
    description_div = parsed.find('div', description_attrs)

    return description_div.text

In [11]:
# Scraping function for job posting urls:
def scrape_urls(web_site, city, job_title_list, state):
    '''
    Collects urls for job postings.

    Arguments:
        web_site = job board/search engine website (base url, with or without
                   a trailing slash)
        city = city to perform the job search for
        job_title_list = list of position titles
        state = state to perform the job search for
    Outputs:
        set of unique job posting urls

    The title, city and state values are inserted into the query string as
    given, so they must already be url-safe (e.g. 'data+scientist').
    '''
    all_job_urls = []

    for title in job_title_list:

        # Starting page for scraping. This is a url, so it is assembled with
        # '/' explicitly instead of the platform-dependent os.path.join.
        query = 'jobs?q=%22{}%22&l={}%2C+{}&radius=50&sort=date'.format(title, city, state)
        start_page = web_site.rstrip('/') + '/' + query

        # Get urls for pages:
        pages_urls = get_pages_urls(start_page)

        # Get urls for jobs, page by page:
        for count, page_url in enumerate(pages_urls, start=1):
            print('collecting for {} from page {} of {} pages'.format(title, count, len(pages_urls)))
            all_job_urls.extend(get_job_urls(page_url, web_site))

    # Check for unique job urls:
    place_job_urls = set(all_job_urls)
    print('Number of unique urls: {}, number of all urls collected: {}'.format(len(place_job_urls),
                                                                               len(all_job_urls)))

    return place_job_urls

In [12]:
# Collects data from job posting urls:
def collect_data(urls, file_name, folder='/Users/greenapple/project4/data/interim/'):
    '''
    Collects following job posting data - description, title, company name, company location, salary if posted.

    Arguments:
        urls = iterable of job posting urls
        file_name = name (without extension) of the pickle file used for
                    checkpointing the collected data
        folder = directory the '<file_name>.pkl' checkpoint is written to
    Output:
        Dataframe with job postings as rows and collected data as features.
        A feature that could not be extracted for a posting is left missing
        (NaN) in that row.
    '''
    save_location = os.path.join(folder, '{}.pkl'.format(file_name))

    # Column name -> extractor. Each extractor downloads the page on its own.
    extractors = (
        ('description', get_job_description),
        ('job_title', get_job_title),
        ('company_name', get_company_name),
        ('location', get_company_location),
        ('salary', get_salary),
    )

    records = []
    data = pd.DataFrame()

    for count, url in enumerate(urls, start=1):

        # Start from an empty record for every url so that a feature which
        # fails to extract is missing, not inherited from the previous posting.
        record = {}
        for column, extract in extractors:
            try:
                record[column] = extract(url)
            except Exception:
                # Best effort: not every posting has every feature (e.g. salary).
                # Only Exception is caught so Ctrl-C can still stop a long run.
                pass
        record['link'] = url
        records.append(record)

        # Rebuild the frame from the plain records (DataFrame.append no longer
        # exists in current pandas).
        data = pd.DataFrame(records)

        # pickle dataframe after every url so an interrupted run keeps its progress
        with open(save_location, 'wb') as out_file:
            pickle.dump(data, out_file)

        # Print update
        print('data point {}'.format(count))

    return data

## Collect urls for WA

In [None]:
# Collect job urls for Seattle

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'Seattle'
job_title_list = ['data+scientist',
                  'data+analyst']
state = 'wa'

seattle_urls = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('seattle_urls_110519.pkl', 'wb') as pickling_in:
    pickle.dump(seattle_urls, pickling_in)

In [None]:
# Number of unique Seattle job urls (seattle_urls is the set returned by scrape_urls).
len(seattle_urls)

In [None]:
# Collect job urls for Bellevue:

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'Bellevue'
job_title_list = ['data+scientist',
                  'data+analyst']
state = 'wa'

bellevue_urls = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('bellevue_urls_110519.pkl', 'wb') as pickling_in:
    pickle.dump(bellevue_urls, pickling_in)

In [None]:
# Number of unique Bellevue job urls (bellevue_urls is the set returned by scrape_urls).
len(bellevue_urls)

In [None]:
# Collect non data scientist/analyst job urls for Seattle

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'Seattle'
job_title_list = ['clinical+trial',
                  'civil+engineer']
state = 'wa'

seattle_urls_neg = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('seattle_urls_neg_110519.pkl', 'wb') as pickling_in:
    pickle.dump(seattle_urls_neg, pickling_in)

In [13]:
# Collect clinical+trial job urls for Seattle

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'Seattle'
job_title_list = ['clinical+trial']
state = 'wa'

# NOTE(review): seattle_urls_neg is also assigned by other cells in this
# notebook; a later use of the name only sees whichever cell ran last.
seattle_urls_neg = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('seattle_urls_clinical_trial_111319.pkl', 'wb') as pickling_in:
    pickle.dump(seattle_urls_neg, pickling_in)

collecting for clinical+trial from page 1 of 300 pages
collecting for clinical+trial from page 2 of 300 pages
collecting for clinical+trial from page 3 of 300 pages
collecting for clinical+trial from page 4 of 300 pages
collecting for clinical+trial from page 5 of 300 pages
collecting for clinical+trial from page 6 of 300 pages
collecting for clinical+trial from page 7 of 300 pages
collecting for clinical+trial from page 8 of 300 pages
collecting for clinical+trial from page 9 of 300 pages
collecting for clinical+trial from page 10 of 300 pages
collecting for clinical+trial from page 11 of 300 pages
collecting for clinical+trial from page 12 of 300 pages
collecting for clinical+trial from page 13 of 300 pages
collecting for clinical+trial from page 14 of 300 pages
collecting for clinical+trial from page 15 of 300 pages
collecting for clinical+trial from page 16 of 300 pages
collecting for clinical+trial from page 17 of 300 pages
collecting for clinical+trial from page 18 of 300 pages
c

collecting for clinical+trial from page 147 of 300 pages
collecting for clinical+trial from page 148 of 300 pages
collecting for clinical+trial from page 149 of 300 pages
collecting for clinical+trial from page 150 of 300 pages
collecting for clinical+trial from page 151 of 300 pages
collecting for clinical+trial from page 152 of 300 pages
collecting for clinical+trial from page 153 of 300 pages
collecting for clinical+trial from page 154 of 300 pages
collecting for clinical+trial from page 155 of 300 pages
collecting for clinical+trial from page 156 of 300 pages
collecting for clinical+trial from page 157 of 300 pages
collecting for clinical+trial from page 158 of 300 pages
collecting for clinical+trial from page 159 of 300 pages
collecting for clinical+trial from page 160 of 300 pages
collecting for clinical+trial from page 161 of 300 pages
collecting for clinical+trial from page 162 of 300 pages
collecting for clinical+trial from page 163 of 300 pages
collecting for clinical+trial f

collecting for clinical+trial from page 291 of 300 pages
collecting for clinical+trial from page 292 of 300 pages
collecting for clinical+trial from page 293 of 300 pages
collecting for clinical+trial from page 294 of 300 pages
collecting for clinical+trial from page 295 of 300 pages
collecting for clinical+trial from page 296 of 300 pages
collecting for clinical+trial from page 297 of 300 pages
collecting for clinical+trial from page 298 of 300 pages
collecting for clinical+trial from page 299 of 300 pages
collecting for clinical+trial from page 300 of 300 pages
Number of unique urls: 1655, number of all urls collected: 4497


In [14]:
# Collect civil+engineer job urls for Seattle

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'Seattle'
job_title_list = ['civil+engineer']
state = 'wa'

# NOTE(review): seattle_urls_neg is also assigned by other cells in this
# notebook; a later use of the name only sees whichever cell ran last.
seattle_urls_neg = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('seattle_urls_civil_engineer_111319.pkl', 'wb') as pickling_in:
    pickle.dump(seattle_urls_neg, pickling_in)

collecting for civil+engineer from page 1 of 300 pages
collecting for civil+engineer from page 2 of 300 pages
collecting for civil+engineer from page 3 of 300 pages
collecting for civil+engineer from page 4 of 300 pages
collecting for civil+engineer from page 5 of 300 pages
collecting for civil+engineer from page 6 of 300 pages
collecting for civil+engineer from page 7 of 300 pages
collecting for civil+engineer from page 8 of 300 pages
collecting for civil+engineer from page 9 of 300 pages
collecting for civil+engineer from page 10 of 300 pages
collecting for civil+engineer from page 11 of 300 pages
collecting for civil+engineer from page 12 of 300 pages
collecting for civil+engineer from page 13 of 300 pages
collecting for civil+engineer from page 14 of 300 pages
collecting for civil+engineer from page 15 of 300 pages
collecting for civil+engineer from page 16 of 300 pages
collecting for civil+engineer from page 17 of 300 pages
collecting for civil+engineer from page 18 of 300 pages
c

collecting for civil+engineer from page 147 of 300 pages
collecting for civil+engineer from page 148 of 300 pages
collecting for civil+engineer from page 149 of 300 pages
collecting for civil+engineer from page 150 of 300 pages
collecting for civil+engineer from page 151 of 300 pages
collecting for civil+engineer from page 152 of 300 pages
collecting for civil+engineer from page 153 of 300 pages
collecting for civil+engineer from page 154 of 300 pages
collecting for civil+engineer from page 155 of 300 pages
collecting for civil+engineer from page 156 of 300 pages
collecting for civil+engineer from page 157 of 300 pages
collecting for civil+engineer from page 158 of 300 pages
collecting for civil+engineer from page 159 of 300 pages
collecting for civil+engineer from page 160 of 300 pages
collecting for civil+engineer from page 161 of 300 pages
collecting for civil+engineer from page 162 of 300 pages
collecting for civil+engineer from page 163 of 300 pages
collecting for civil+engineer f

collecting for civil+engineer from page 291 of 300 pages
collecting for civil+engineer from page 292 of 300 pages
collecting for civil+engineer from page 293 of 300 pages
collecting for civil+engineer from page 294 of 300 pages
collecting for civil+engineer from page 295 of 300 pages
collecting for civil+engineer from page 296 of 300 pages
collecting for civil+engineer from page 297 of 300 pages
collecting for civil+engineer from page 298 of 300 pages
collecting for civil+engineer from page 299 of 300 pages
collecting for civil+engineer from page 300 of 300 pages
Number of unique urls: 2761, number of all urls collected: 5547


In [None]:
# Combine WA urls and make sure they are unique:
# NOTE(review): seattle_urls_neg is assigned by several cells in this notebook
# (combined titles, clinical+trial only, civil+engineer only), so this union
# contains whichever of those results was bound last - confirm that is intended.
washington_job_urls = seattle_urls.union(bellevue_urls, seattle_urls_neg)
# Size of the combined, de-duplicated WA url set.
len(washington_job_urls)

In [None]:
# Pickle the combined WA urls; the with-block closes (and flushes) the file when done:
with open('/Users/greenapple/project4/data/interim/washington_job_urls_110519.pkl', 'wb') as pickling_in:
    pickle.dump(washington_job_urls, pickling_in)

## Collect urls for CA

In [13]:
# Collect job urls for San Jose:

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'San+Jose'
job_title_list = ['data+scientist',
                  'data+analyst']
state = 'ca'

san_jose_urls = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('/Users/greenapple/project4/data/urls/san_jose_urls_111019.pkl', 'wb') as pickling_in:
    pickle.dump(san_jose_urls, pickling_in)

collecting for data+scientist from page 1 of 300 pages
collecting for data+scientist from page 2 of 300 pages
collecting for data+scientist from page 3 of 300 pages
collecting for data+scientist from page 4 of 300 pages
collecting for data+scientist from page 5 of 300 pages
collecting for data+scientist from page 6 of 300 pages
collecting for data+scientist from page 7 of 300 pages
collecting for data+scientist from page 8 of 300 pages
collecting for data+scientist from page 9 of 300 pages
collecting for data+scientist from page 10 of 300 pages
collecting for data+scientist from page 11 of 300 pages
collecting for data+scientist from page 12 of 300 pages
collecting for data+scientist from page 13 of 300 pages
collecting for data+scientist from page 14 of 300 pages
collecting for data+scientist from page 15 of 300 pages
collecting for data+scientist from page 16 of 300 pages
collecting for data+scientist from page 17 of 300 pages
collecting for data+scientist from page 18 of 300 pages
c

collecting for data+scientist from page 147 of 300 pages
collecting for data+scientist from page 148 of 300 pages
collecting for data+scientist from page 149 of 300 pages
collecting for data+scientist from page 150 of 300 pages
collecting for data+scientist from page 151 of 300 pages
collecting for data+scientist from page 152 of 300 pages
collecting for data+scientist from page 153 of 300 pages
collecting for data+scientist from page 154 of 300 pages
collecting for data+scientist from page 155 of 300 pages
collecting for data+scientist from page 156 of 300 pages
collecting for data+scientist from page 157 of 300 pages
collecting for data+scientist from page 158 of 300 pages
collecting for data+scientist from page 159 of 300 pages
collecting for data+scientist from page 160 of 300 pages
collecting for data+scientist from page 161 of 300 pages
collecting for data+scientist from page 162 of 300 pages
collecting for data+scientist from page 163 of 300 pages
collecting for data+scientist f

collecting for data+scientist from page 291 of 300 pages
collecting for data+scientist from page 292 of 300 pages
collecting for data+scientist from page 293 of 300 pages
collecting for data+scientist from page 294 of 300 pages
collecting for data+scientist from page 295 of 300 pages
collecting for data+scientist from page 296 of 300 pages
collecting for data+scientist from page 297 of 300 pages
collecting for data+scientist from page 298 of 300 pages
collecting for data+scientist from page 299 of 300 pages
collecting for data+scientist from page 300 of 300 pages
collecting for data+analyst from page 1 of 300 pages
collecting for data+analyst from page 2 of 300 pages
collecting for data+analyst from page 3 of 300 pages
collecting for data+analyst from page 4 of 300 pages
collecting for data+analyst from page 5 of 300 pages
collecting for data+analyst from page 6 of 300 pages
collecting for data+analyst from page 7 of 300 pages
collecting for data+analyst from page 8 of 300 pages
collec

collecting for data+analyst from page 142 of 300 pages
collecting for data+analyst from page 143 of 300 pages
collecting for data+analyst from page 144 of 300 pages
collecting for data+analyst from page 145 of 300 pages
collecting for data+analyst from page 146 of 300 pages
collecting for data+analyst from page 147 of 300 pages
collecting for data+analyst from page 148 of 300 pages
collecting for data+analyst from page 149 of 300 pages
collecting for data+analyst from page 150 of 300 pages
collecting for data+analyst from page 151 of 300 pages
collecting for data+analyst from page 152 of 300 pages
collecting for data+analyst from page 153 of 300 pages
collecting for data+analyst from page 154 of 300 pages
collecting for data+analyst from page 155 of 300 pages
collecting for data+analyst from page 156 of 300 pages
collecting for data+analyst from page 157 of 300 pages
collecting for data+analyst from page 158 of 300 pages
collecting for data+analyst from page 159 of 300 pages
collecting

collecting for data+analyst from page 291 of 300 pages
collecting for data+analyst from page 292 of 300 pages
collecting for data+analyst from page 293 of 300 pages
collecting for data+analyst from page 294 of 300 pages
collecting for data+analyst from page 295 of 300 pages
collecting for data+analyst from page 296 of 300 pages
collecting for data+analyst from page 297 of 300 pages
collecting for data+analyst from page 298 of 300 pages
collecting for data+analyst from page 299 of 300 pages
collecting for data+analyst from page 300 of 300 pages
Number of unique urls: 5221, number of all urls collected: 10078


In [14]:
# Number of unique San Jose job urls (san_jose_urls is the set returned by scrape_urls).
len(san_jose_urls)

5221

In [None]:
# Collect job urls for South San Francisco:

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'South+San+Francisco'
job_title_list = ['data+scientist',
                  'data+analyst']
state = 'ca'

south_sf_urls = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('south_sf_urls_110519.pkl', 'wb') as pickling_in:
    pickle.dump(south_sf_urls, pickling_in)

In [None]:
# Number of unique South San Francisco job urls (south_sf_urls is the set returned by scrape_urls).
len(south_sf_urls)

In [None]:
# Collect job urls for Palo Alto:

# Scraping parameters:
web_site = 'https://www.indeed.com/'
city = 'Palo+Alto'
job_title_list = ['data+scientist',
                  'data+analyst']
state = 'ca'

palo_alto_urls = scrape_urls(web_site, city, job_title_list, state)

# Pickle url list; the with-block closes (and flushes) the file when done:
with open('palo_alto_urls_110519.pkl', 'wb') as pickling_in:
    pickle.dump(palo_alto_urls, pickling_in)

In [None]:
# Number of unique Palo Alto job urls (palo_alto_urls is the set returned by scrape_urls).
len(palo_alto_urls)

In [None]:
# Combine CA urls and make sure they are unique.
# All three CA result sets are merged: San Jose, South San Francisco and
# Palo Alto (unioning san_jose_urls with itself would drop South San Francisco).
california_job_urls = san_jose_urls.union(south_sf_urls, palo_alto_urls)
len(california_job_urls)