In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re

import requests
from bs4 import BeautifulSoup
import time

# sns.set() re-applies seaborn's own default style ('darkgrid') whenever it is
# called, so a preceding sns.set_style('whitegrid') is silently undone. Pass
# the style and the figure size in the same call so both actually take effect.
sns.set(style='whitegrid', rc={"figure.figsize": (15, 8)})

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

<div style="width:900px;background:#F9EECF;border:1px solid black;text-align:left;padding:8px;">



<p>
<span style="font-size:14pt"><b>Scraping from indeed.com</b></span>
</p>

</div>

In [2]:
from selenium import webdriver

# Start a Chrome session through the chromedriver binary at this path (the
# leading "./" makes it relative to the current working directory). The
# scraping cells below load pages with driver.get() and read driver.page_source.
driver = webdriver.Chrome(executable_path="./chromedriver/chromedriver")

In [3]:
def scrape_title(chunk):
    """Extract the job title from one search-result element.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element; the scraper cells pass the items returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        A one-element list: the text of the first ``<h2 class="jobtitle">``
        inside ``chunk``, or ``np.nan`` when it cannot be read.
    """
    try:
        return [chunk.find('h2', {'class': 'jobtitle'}).get_text()]
    except Exception:
        # Best-effort: a missing tag (find() -> None) or any other lookup
        # failure yields NaN. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt/SystemExit stop a long scrape.
        return [np.nan]

def scrape_company(chunk):
    """Extract the company name from one search-result element.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element; the scraper cells pass the items returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        A one-element list: the text of the first ``<span class="company">``
        inside ``chunk``, or ``np.nan`` when it cannot be read.
    """
    try:
        return [chunk.find('span', {'class': 'company'}).get_text()]
    except Exception:
        # Best-effort fallback; unlike a bare except this does not swallow
        # KeyboardInterrupt/SystemExit.
        return [np.nan]

def scrape_location(chunk):
    """Extract the job location from one search-result element.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element; the scraper cells pass the items returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        A one-element list: the text of the first ``<span class="location">``
        inside ``chunk``, or ``np.nan`` when it cannot be read.
    """
    try:
        return [chunk.find('span', {'class': 'location'}).get_text()]
    except Exception:
        # Best-effort fallback; unlike a bare except this does not swallow
        # KeyboardInterrupt/SystemExit.
        return [np.nan]

def scrape_review(chunk):
    """Extract the company's review count from one search-result element.

    The count is read from the first ``<span class="slNoUnderline">`` inside
    ``chunk``; its text is expected to look like ``"1,234 reviews"``.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element; the scraper cells pass the items returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        A one-element list: the review count as an ``int``, or ``np.nan``
        when the span is missing or its text is not a number.
    """
    try:
        label = chunk.find('span', {'class': 'slNoUnderline'}).get_text()
        # Drop the thousands separator and the trailing word. The singular
        # " review" is stripped as well so that "1 review" parses to 1
        # instead of falling through to NaN.
        digits = label.replace(',', '').replace(' reviews', '').replace(' review', '')
        return [int(digits)]
    except Exception:
        # Best-effort fallback (missing span, non-numeric text); unlike a
        # bare except this does not swallow KeyboardInterrupt/SystemExit.
        return [np.nan]

def scrape_rating(chunk):
    """Extract the company rating from one search-result element.

    The value is taken from the inline ``style`` attribute of the first
    ``<span class="rating">`` inside ``chunk`` (for example
    ``"width:51.6px"``). The returned number is therefore the width in
    pixels of that span, not a star score.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element; the scraper cells pass the items returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        A one-element list: the width as a ``float``, or ``np.nan`` when the
        span or its ``style`` attribute is missing or not parseable.
    """
    try:
        style = chunk.find('span', {'class': 'rating'}).get('style')
        width = style.replace('width:', '').replace('px', '')
        return [float(width)]
    except Exception:
        # Best-effort fallback; unlike a bare except this does not swallow
        # KeyboardInterrupt/SystemExit.
        return [np.nan]

def scrape_salary(chunk):
    """Extract the raw salary text from one search-result element.

    The first ``<span class="no-wrap">`` inside ``chunk`` is treated as a
    salary only when its text contains ``'$'``. The text is returned as-is
    (for example ``"$100,000 a year"``); it is not converted to a number.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element; the scraper cells pass the items returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        Always a one-element list: the span text with non-ASCII characters
        dropped, or ``np.nan`` when the span is missing or has no ``'$'``.
        Returning exactly one entry per posting keeps this column the same
        length as the columns produced by the other ``scrape_*`` helpers.
    """
    try:
        raw = chunk.find('span', {'class': 'no-wrap'}).get_text()
        # Drop non-ASCII characters, then normalise to the native str type so
        # the '$' membership test behaves the same on Python 2 and Python 3
        # (on Python 3, `'$' in <bytes>` raises TypeError).
        text = str(raw.encode('ascii', 'ignore').decode('ascii'))
    except Exception:
        # Best-effort fallback; unlike a bare except this does not swallow
        # KeyboardInterrupt/SystemExit.
        return [np.nan]

    if '$' in text:
        return [text]
    # The span exists but is not a salary: still emit a placeholder so the
    # caller's per-column lists stay aligned.
    return [np.nan]
            
def scrape_jobtype(chunk):
    """Extract the job-type text from one search-result element.

    The first ``<span class="no-wrap">`` inside ``chunk`` is treated as a job
    type only when its text does NOT contain ``'$'``; text containing ``'$'``
    is what ``scrape_salary`` picks up instead.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element, e.g. an item returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        A one-element list: the span text with non-ASCII characters dropped,
        or ``np.nan`` when the span is missing or its text contains ``'$'``.
    """
    try:
        raw = chunk.find('span', {'class': 'no-wrap'}).get_text()
        # Drop non-ASCII characters, then normalise to the native str type so
        # the '$' membership test behaves the same on Python 2 and Python 3
        # (on Python 3, `'$' in <bytes>` raises TypeError).
        text = str(raw.encode('ascii', 'ignore').decode('ascii'))
    except Exception:
        # Best-effort fallback; unlike a bare except this does not swallow
        # KeyboardInterrupt/SystemExit.
        return [np.nan]

    if '$' in text:
        return [np.nan]
    return [text]
            
def scrape_summary(chunk):
    """Extract the job summary snippet from one search-result element.

    Parameters
    ----------
    chunk : object exposing ``find(name, attrs)``
        One result element; the scraper cells pass the items returned by
        ``BeautifulSoup(...).find_all(...)``.

    Returns
    -------
    list
        A one-element list: the text of the first ``<span class="summary">``
        inside ``chunk``, or ``np.nan`` when it cannot be read.
    """
    try:
        return [chunk.find("span", {"class": "summary"}).get_text()]
    except Exception:
        # Best-effort fallback; unlike a bare except this does not swallow
        # KeyboardInterrupt/SystemExit.
        return [np.nan]

In [9]:
# Scraper

In [10]:
# Scrape Indeed search results for the query "data $<amt>,000".
#
# `pages` is a list of batches; each batch is a sequence of values for the
# `start` URL parameter (0, 10, 20, ...). One pickle file is written per
# (salary amount, batch index) pair.
pages = [range(0,1500,10)]
salary_amt = ['100']

for amt in salary_amt:

    for ind, batch in enumerate(pages):

        # One accumulator list per output column, reset for every batch. The
        # scrape_* helpers are expected to return one-element lists so that
        # all columns end up with the same length.
        job_titles_ = []
        companies_ = []
        locations_ = []
        reviews_ = []
        salaries_ = []
        ratings_ = []
        summaries_ = []

        for i in batch:

            driver.get('https://www.indeed.com/jobs?q=data+%24'+str(amt)+'%2C000&start='+str(i))
            time.sleep(3)  # give the page time to load before reading its source
            html = driver.page_source
            # Only elements tagged data-tn-component="organicJob" are parsed.
            soup = BeautifulSoup(html, 'lxml').find_all('div', {"data-tn-component":"organicJob"})

            for chunk in soup:

                # list.extend() returns None, so its result is not bound to a name.
                job_titles_.extend(scrape_title(chunk))
                companies_.extend(scrape_company(chunk))
                locations_.extend(scrape_location(chunk))
                reviews_.extend(scrape_review(chunk))
                salaries_.extend(scrape_salary(chunk))
                ratings_.extend(scrape_rating(chunk))
                summaries_.extend(scrape_summary(chunk))

            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3. The number is the result offset reached
            # (start + 10), not a count of rows actually collected.
            print('......... Number of jobs scraped ' + str(i + 10)+ ' .........')

            time.sleep(3)  # pause between page requests

        results = pd.DataFrame()

        results['job_titles'] = job_titles_
        results['companies'] = companies_
        results['locations'] = locations_
        results['reviews'] = reviews_
        results['salaries'] = salaries_
        results['ratings'] = ratings_
        results['summaries'] = summaries_

        results.to_pickle('results_'+str(amt)+'000_'+str(ind)+'.pickle')

        time.sleep(60)  # longer pause before the next batch / salary amount

......... Number of jobs scraped 10 .........
......... Number of jobs scraped 20 .........
......... Number of jobs scraped 30 .........
......... Number of jobs scraped 40 .........
......... Number of jobs scraped 50 .........
......... Number of jobs scraped 60 .........
......... Number of jobs scraped 70 .........
......... Number of jobs scraped 80 .........
......... Number of jobs scraped 90 .........
......... Number of jobs scraped 100 .........
......... Number of jobs scraped 110 .........
......... Number of jobs scraped 120 .........
......... Number of jobs scraped 130 .........
......... Number of jobs scraped 140 .........
......... Number of jobs scraped 150 .........
......... Number of jobs scraped 160 .........
......... Number of jobs scraped 170 .........
......... Number of jobs scraped 180 .........
......... Number of jobs scraped 190 .........
......... Number of jobs scraped 200 .........
......... Number of jobs scraped 210 .........
......... Number of jo

In [4]:
# Scrape Indeed search results for the query "data science $<amt>000" for
# several salary amounts.
#
# `pages` is a list of batches; each batch is a sequence of values for the
# `start` URL parameter (0, 10, 20, ...). One pickle file is written per
# (salary amount, batch index) pair.
pages = [range(0,1000,10)]
salary_amt = ['20','40','60','80','100']

for amt in salary_amt:

    for ind, batch in enumerate(pages):

        # One accumulator list per output column, reset for every batch. The
        # scrape_* helpers are expected to return one-element lists so that
        # all columns end up with the same length.
        job_titles_ = []
        companies_ = []
        locations_ = []
        reviews_ = []
        salaries_ = []
        ratings_ = []
        summaries_ = []

        for i in batch:

            driver.get('https://www.indeed.com/jobs?q=data+science+%24'+str(amt)+'000&start='+str(i))
            time.sleep(3)  # give the page time to load before reading its source
            html = driver.page_source
            # Only elements tagged data-tn-component="organicJob" are parsed.
            soup = BeautifulSoup(html, 'lxml').find_all('div', {"data-tn-component":"organicJob"})

            for chunk in soup:

                # list.extend() returns None, so its result is not bound to a name.
                job_titles_.extend(scrape_title(chunk))
                companies_.extend(scrape_company(chunk))
                locations_.extend(scrape_location(chunk))
                reviews_.extend(scrape_review(chunk))
                salaries_.extend(scrape_salary(chunk))
                ratings_.extend(scrape_rating(chunk))
                summaries_.extend(scrape_summary(chunk))

            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3. The number is the result offset reached
            # (start + 10), not a count of rows actually collected.
            print('......... Number of jobs scraped ' + str(i + 10)+ ' .........')

            time.sleep(1)  # pause between page requests

        results = pd.DataFrame()

        results['job_titles'] = job_titles_
        results['companies'] = companies_
        results['locations'] = locations_
        results['reviews'] = reviews_
        results['salaries'] = salaries_
        results['ratings'] = ratings_
        results['summaries'] = summaries_

        results.to_pickle('results_'+str(amt)+'000_'+str(ind)+'.pickle')

        time.sleep(60)  # longer pause before the next batch / salary amount

......... Number of jobs scraped 10 .........
......... Number of jobs scraped 20 .........
......... Number of jobs scraped 30 .........
......... Number of jobs scraped 40 .........
......... Number of jobs scraped 50 .........
......... Number of jobs scraped 60 .........
......... Number of jobs scraped 70 .........
......... Number of jobs scraped 80 .........
......... Number of jobs scraped 90 .........
......... Number of jobs scraped 100 .........
......... Number of jobs scraped 110 .........
......... Number of jobs scraped 120 .........
......... Number of jobs scraped 130 .........
......... Number of jobs scraped 140 .........
......... Number of jobs scraped 150 .........
......... Number of jobs scraped 160 .........
......... Number of jobs scraped 170 .........
......... Number of jobs scraped 180 .........
......... Number of jobs scraped 190 .........
......... Number of jobs scraped 200 .........
......... Number of jobs scraped 210 .........
......... Number of jo

......... Number of jobs scraped 760 .........
......... Number of jobs scraped 770 .........
......... Number of jobs scraped 780 .........
......... Number of jobs scraped 790 .........
......... Number of jobs scraped 800 .........
......... Number of jobs scraped 810 .........
......... Number of jobs scraped 820 .........
......... Number of jobs scraped 830 .........
......... Number of jobs scraped 840 .........
......... Number of jobs scraped 850 .........
......... Number of jobs scraped 860 .........
......... Number of jobs scraped 870 .........
......... Number of jobs scraped 880 .........
......... Number of jobs scraped 890 .........
......... Number of jobs scraped 900 .........
......... Number of jobs scraped 910 .........
......... Number of jobs scraped 920 .........
......... Number of jobs scraped 930 .........
......... Number of jobs scraped 940 .........
......... Number of jobs scraped 950 .........
......... Number of jobs scraped 960 .........
......... Num

......... Number of jobs scraped 510 .........
......... Number of jobs scraped 520 .........
......... Number of jobs scraped 530 .........
......... Number of jobs scraped 540 .........
......... Number of jobs scraped 550 .........
......... Number of jobs scraped 560 .........
......... Number of jobs scraped 570 .........
......... Number of jobs scraped 580 .........
......... Number of jobs scraped 590 .........
......... Number of jobs scraped 600 .........
......... Number of jobs scraped 610 .........
......... Number of jobs scraped 620 .........
......... Number of jobs scraped 630 .........
......... Number of jobs scraped 640 .........
......... Number of jobs scraped 650 .........
......... Number of jobs scraped 660 .........
......... Number of jobs scraped 670 .........
......... Number of jobs scraped 680 .........
......... Number of jobs scraped 690 .........
......... Number of jobs scraped 700 .........
......... Number of jobs scraped 710 .........
......... Num