In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re

import requests
from bs4 import BeautifulSoup
import time

# Configure seaborn in a single call. sns.set() applies its own default style
# ("darkgrid"), so a separate sns.set_style('whitegrid') issued before it is
# overridden; passing the style here makes the white grid actually take effect.
sns.set(style='whitegrid', rc={"figure.figsize": (15, 8)})

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

<div style="width:900px;background:#F9EECF;border:1px solid black;text-align:left;padding:8px;">



<p>
<span style="font-size:14pt"><b>Scraping from indeed.com</b></span>
</p>

</div>

In [2]:
# Scraping from indeed.com

# Job-search endpoint of each Indeed country site, keyed by country code.
country_url = {
    "SG": "https://www.indeed.com.sg/jobs",
    "US": "https://www.indeed.com/jobs",
    "MY": "https://www.indeed.com.my/jobs",
    "HK": "https://www.indeed.hk/jobs",
    "ID": "https://id.indeed.com/lowongan-kerja",
}

# Display name for each country code.
countries = {
    "SG": "Singapore",
    "US": "United States",
    "MY": "Malaysia",
    "HK": "Hong Kong",
    "ID": "Indonesia",
}

# Cities to search within each country, keyed by the same country codes.
target_cities = {
    'US': ['New York', 'Chicago', 'San Francisco', 'Austin', 'Seattle',
           'Los Angeles', 'Philadelphia', 'Atlanta', 'Dallas',
           'Pittsburgh', 'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami'],
    'SG': ["Singapore"],
    'MY': ['Kuala Lumpur', 'Johor Bahru', 'Shah Alam'],
    'HK': ['Hong Kong'],
    'ID': ['Jakarta', 'Batam', 'Surabaya'],
}

# Search terms. Every entry ends with a comma: adjacent string literals without
# one are silently concatenated ('machine learning' + 'data consultant' used to
# collapse into the single bogus term 'machine learningdata consultant').
job_titles = [
    'data scientist',
    'data analyst',
    'chief data officer',
    'chief information officer',
    'data engineer',
    'business intelligence',
    'artificial intelligence',
    'machine learning',
    'data consultant',
    'marketing analyst',
    'marketing intelligence',
    'deep learning',
    'chatbot',
    'system analyst',
    'data crawling',
    'data entry',
    'data administrator',
    'nlp',
    'data analytics',
    'data',
]

In [None]:
def scrape_title(chunk):
    """Return the job title of one result chunk as a one-element list.

    Looks up the first ``<h2 class="jobtitle">`` inside ``chunk`` and returns
    ``[text]``; returns ``[np.nan]`` when no such element exists, so the caller
    can ``extend`` a column list by exactly one entry per chunk.
    """
    node = chunk.find('h2', {'class': 'jobtitle'})

    # An explicit None check replaces the former bare ``except:`` so that
    # unrelated errors (and KeyboardInterrupt) are no longer turned into NaN.
    if node is None:
        return [np.nan]

    return [node.get_text()]

def scrape_company(chunk):
    """Return the company name of one result chunk as a one-element list.

    Looks up the first ``<span class="company">`` inside ``chunk`` and returns
    ``[text]``; returns ``[np.nan]`` when the element is absent.
    """
    node = chunk.find('span', {'class': 'company'})

    # Only a missing element maps to NaN; any other failure should surface
    # instead of being hidden by a bare ``except:``.
    if node is None:
        return [np.nan]

    return [node.get_text()]

def scrape_location(chunk):
    """Return the location text of one result chunk as a one-element list.

    Looks up the first ``<span class="location">`` inside ``chunk`` and returns
    ``[text]``; returns ``[np.nan]`` when the element is absent.
    """
    node = chunk.find('span', {'class': 'location'})

    # A missing element is the only case that should yield NaN.
    if node is None:
        return [np.nan]

    return [node.get_text()]

def scrape_review(chunk):
    """Return the review count of one result chunk as a one-element list.

    Reads the text of the first ``<span class="slNoUnderline">`` inside
    ``chunk``, strips the ``' reviews'`` suffix and thousands separators, and
    returns ``[int]``. Returns ``[np.nan]`` when the element is absent or the
    remaining text is not an integer.
    """
    node = chunk.find('span', {'class': 'slNoUnderline'})
    if node is None:
        return [np.nan]

    digits = node.get_text().replace(' reviews', '').replace(',', '')
    try:
        return [int(digits)]
    except ValueError:
        # Unparseable text is the only failure mapped to NaN here; the former
        # bare ``except:`` also swallowed unrelated errors and interrupts.
        return [np.nan]

def scrape_rating(chunk):
    """Return the rating width of one result chunk as a one-element list.

    Reads the ``style`` attribute of the first ``<span class="rating">`` inside
    ``chunk``, removes ``'width:'`` and ``'px'`` and returns ``[float]``.
    The value is the raw width taken from the style; converting it to a star
    rating is not done here. Returns ``[np.nan]`` when the element or its
    ``style`` attribute is absent, or when the remainder is not a number.
    """
    node = chunk.find('span', {'class': 'rating'})
    style = node.get('style') if node is not None else None
    if style is None:
        return [np.nan]

    try:
        return [float(style.replace('width:', '').replace('px', ''))]
    except ValueError:
        # Only a malformed width maps to NaN; other errors are no longer
        # hidden the way the previous bare ``except:`` hid them.
        return [np.nan]

def scrape_salary(chunk):
    """Return the raw salary text of one result chunk as a one-element list.

    The first ``<span class="no-wrap">`` inside ``chunk`` is treated as a
    salary when its text contains ``'$'``; the text (with non-ASCII characters
    dropped) is returned unparsed as ``[text]``. In every other case -- the
    span is absent, or its text has no ``'$'`` -- ``[np.nan]`` is returned.

    Exactly one element is always returned. Previously a span without ``'$'``
    produced an empty list, which left the salary column shorter than the
    other columns built from the same chunks.
    """
    node = chunk.find('span', {'class': 'no-wrap'})
    if node is None:
        return [np.nan]

    # Drop non-ASCII characters but decode back to text, so the '$' membership
    # test compares text with text on both Python 2 and Python 3 (on Python 3 a
    # str-in-bytes test raises TypeError, which the old bare ``except:`` turned
    # into NaN for every row).
    text = node.get_text().encode('ascii', 'ignore').decode('ascii')

    if '$' in text:
        return [text]
    return [np.nan]
            
def scrape_jobtype(chunk):
    """Return the job-type text of one result chunk as a one-element list.

    The first ``<span class="no-wrap">`` inside ``chunk`` is treated as a job
    type when its text does NOT contain ``'$'``; the text (with non-ASCII
    characters dropped) is returned as ``[text]``. Returns ``[np.nan]`` when
    the span is absent or its text contains ``'$'``.
    """
    node = chunk.find('span', {'class': 'no-wrap'})
    if node is None:
        return [np.nan]

    # Decode back to text after stripping non-ASCII so the '$' test works on
    # both Python 2 and Python 3; a str-in-bytes test raises TypeError on
    # Python 3 and the former bare ``except:`` silently mapped that to NaN.
    text = node.get_text().encode('ascii', 'ignore').decode('ascii')

    if '$' in text:
        return [np.nan]
    return [text]
            
def scrape_summary(chunk):
    """Return the job summary text of one chunk as a one-element list.

    Looks up the first ``<span id="job_summary">`` inside ``chunk`` and returns
    ``[text]``; returns ``[np.nan]`` when the element is absent.
    """
    node = chunk.find("span", {"id": "job_summary"})

    # Only a missing element yields NaN; the previous bare ``except:`` also
    # hid unrelated errors and keyboard interrupts.
    if node is None:
        return [np.nan]

    return [node.get_text()]

In [None]:
# find max pages

# The `start` query parameter is a result offset; the loop below advances it
# by this many results per request.
RESULTS_PER_PAGE = 10

# Base search URL; the result offset is appended to it for every page.
url = 'https://www.indeed.com/jobs?q=data&start='
response = requests.get(url)
response.raise_for_status()
html = response.text
max_soup = BeautifulSoup(html, 'lxml')

# The last space-separated token of the #searchCount element is read as the
# total number of matching jobs (thousands separators removed).
search_count = max_soup.find('div', {'id': 'searchCount'})
if search_count is None:
    raise ValueError('searchCount element not found in the response for ' + url)
total_jobs = int(search_count.text.split(' ')[-1].replace(',', ''))

# Number of result pages, rounded up. Integer arithmetic keeps this an int on
# both Python 2 and Python 3 (plain `/` would give a float on Python 3).
max_page = (total_jobs + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE


job_titles_ = []
companies_ = []
locations_ = []
reviews_ = []
salaries_ = []
job_types_ = []
ratings_ = []

# `i` is the result offset of each page: 0, 10, 20, ... covering every page.
# (The page count was previously used as the upper bound of this offset range,
# so only about a tenth of the pages were requested.)
for i in range(0, max_page * RESULTS_PER_PAGE, RESULTS_PER_PAGE):

    # Build the page URL from the unchanged base. Re-assigning `url` here would
    # accumulate offsets across iterations ("start=0", "start=010", ...).
    page_url = url + str(i)
    response = requests.get(page_url)
    html = response.text
    soup = BeautifulSoup(html, 'lxml').find_all('div', {"data-tn-component":"organicJob"})

    for chunk in soup:

        # list.extend() returns None, so its result is not assigned; assigning
        # it to `job_titles` would also overwrite the search-term list defined
        # in the configuration cell.
        job_titles_.extend(scrape_title(chunk))
        companies_.extend(scrape_company(chunk))
        locations_.extend(scrape_location(chunk))
        reviews_.extend(scrape_review(chunk))
        salaries_.extend(scrape_salary(chunk))
        job_types_.extend(scrape_jobtype(chunk))
        ratings_.extend(scrape_rating(chunk))

    print('......... Number of jobs scraped ' + str(i + RESULTS_PER_PAGE) + ' .........')

    # Pause between requests.
    time.sleep(3)

# Assemble all columns at once; `columns=` fixes the column order.
results = pd.DataFrame(
    {
        'job_titles': job_titles_,
        'companies': companies_,
        'locations': locations_,
        'reviews': reviews_,
        'salaries': salaries_,
        'job_types': job_types_,
        'ratings': ratings_,
    },
    columns=['job_titles', 'companies', 'locations', 'reviews',
             'salaries', 'job_types', 'ratings'],
)

results.head(20)

......... Number of jobs scraped 10 .........
......... Number of jobs scraped 20 .........
......... Number of jobs scraped 30 .........
......... Number of jobs scraped 40 .........
......... Number of jobs scraped 50 .........
......... Number of jobs scraped 60 .........
......... Number of jobs scraped 70 .........
......... Number of jobs scraped 80 .........
......... Number of jobs scraped 90 .........
......... Number of jobs scraped 100 .........
......... Number of jobs scraped 110 .........
......... Number of jobs scraped 120 .........
......... Number of jobs scraped 130 .........
......... Number of jobs scraped 140 .........
......... Number of jobs scraped 150 .........
......... Number of jobs scraped 160 .........
......... Number of jobs scraped 170 .........
......... Number of jobs scraped 180 .........
......... Number of jobs scraped 190 .........
......... Number of jobs scraped 200 .........
......... Number of jobs scraped 210 .........
......... Number of jo