In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re

import requests
from bs4 import BeautifulSoup
import time

# Configure seaborn in a single call. `sns.set()` re-applies every seaborn
# default (its `style` argument defaults to 'darkgrid'), so calling
# `sns.set_style('whitegrid')` first and `sns.set(rc=...)` afterwards would
# silently discard the whitegrid style.
sns.set(style='whitegrid', rc={"figure.figsize": (15, 8)})

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

<div style="width:900px;background:#F9EECF;border:1px solid black;text-align:left;padding:8px;">



<p>
<span style="font-size:14pt"><b>Scraping from indeed.com</b></span>
</p>

</div>

In [2]:
# Scraping from indeed.com

# Job-search endpoint of each regional Indeed site, keyed by country code.
country_url = {
    "SG": "https://www.indeed.com.sg/jobs",
    "US": "https://www.indeed.com/jobs",
    "MY": "https://www.indeed.com.my/jobs",
    "HK": "https://www.indeed.hk/jobs",
    "ID": "https://id.indeed.com/lowongan-kerja",
}

# Display name for each country code used above.
countries = {
    "SG": "Singapore",
    "US": "United States",
    "MY": "Malaysia",
    "HK": "Hong Kong",
    "ID": 'Indonesia',
}

# Cities to search within each country, keyed by the same country codes.
target_cities = {
    'US': ['New York', 'Chicago', 'San Francisco', 'Austin', 'Seattle',
           'Los Angeles', 'Philadelphia', 'Atlanta', 'Dallas',
           'Pittsburgh', 'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami'],
    'SG': ["Singapore"],
    'MY': ['Kuala Lumpur', 'Johor Bahru', 'Shah Alam'],
    'HK': ['Hong Kong'],
    'ID': ['Jakarta', 'Batam', 'Surabaya'],
}

# Search terms. One entry per line with a trailing comma, so a forgotten comma
# cannot silently fuse two adjacent string literals into one term (previously
# 'machine learning' and 'data consultant' were concatenated that way).
job_titles = [
    'data scientist',
    'data analyst',
    'chief data officer',
    'chief information officer',
    'data engineer',
    'business intelligence',
    'artificial intelligence',
    'machine learning',
    'data consultant',
    'marketing analyst',
    'marketing intelligence',
    'deep learning',
    'chatbot',
    'system analyst',
    'data crawling',
    'data entry',
    'data administrator',
    'nlp',
    'data analytics',
    'data',
]

In [3]:
def scrape_title(chunk):
    """Extract the job title text from one search-result element.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        A single job listing taken from a results page.

    Returns
    -------
    list
        A one-element list holding the text of the ``<h2 class="jobtitle">``
        element (whitespace is not stripped), or ``np.nan`` when the listing
        has no such element.
    """
    titles = []

    try:
        title = chunk.find('h2', {'class': 'jobtitle'}).get_text()
        titles.append(title)
    except AttributeError:
        # `find` returned None because the element is missing. Only this
        # expected failure is turned into NaN; anything else (including
        # KeyboardInterrupt) propagates instead of being swallowed.
        titles.append(np.nan)

    return titles

def scrape_company(chunk):
    """Extract the company name text from one search-result element.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        A single job listing taken from a results page.

    Returns
    -------
    list
        A one-element list holding the text of the ``<span class="company">``
        element (whitespace is not stripped), or ``np.nan`` when the listing
        has no such element.
    """
    companies = []

    try:
        company = chunk.find('span', {'class': 'company'}).get_text()
        companies.append(company)
    except AttributeError:
        # Missing element: `find` returned None. Other exceptions propagate.
        companies.append(np.nan)

    return companies

def scrape_location(chunk):
    """Extract the location text from one search-result element.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        A single job listing taken from a results page.

    Returns
    -------
    list
        A one-element list holding the text of the ``<span class="location">``
        element, or ``np.nan`` when the listing has no such element.
    """
    locations = []

    try:
        location = chunk.find('span', {'class': 'location'}).get_text()
        locations.append(location)
    except AttributeError:
        # Missing element: `find` returned None. Other exceptions propagate.
        locations.append(np.nan)

    return locations

def scrape_review(chunk):
    """Extract the number of company reviews from one search-result element.

    The count is parsed from the text of the ``<span class="slNoUnderline">``
    element, e.g. ``"4,325 reviews"`` -> ``4325``. The singular form
    (``"1 review"``) is handled as well; previously it failed integer parsing
    and was recorded as missing.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        A single job listing taken from a results page.

    Returns
    -------
    list
        A one-element list holding the review count as ``int``, or ``np.nan``
        when the element is missing or its text is not a number.
    """
    reviews = []

    try:
        text = chunk.find('span', {'class': 'slNoUnderline'}).get_text()
        # Strip the plural suffix first so that ' review' does not leave a
        # dangling 's' behind; then drop thousands separators.
        digits = text.replace(' reviews', '').replace(' review', '').replace(',', '')
        reviews.append(int(digits))
    except (AttributeError, ValueError):
        # AttributeError: element missing (`find` returned None).
        # ValueError: text present but not an integer.
        reviews.append(np.nan)

    return reviews

def scrape_rating(chunk):
    """Extract the rating value from one search-result element.

    The value is the number embedded in the ``style`` attribute of the
    ``<span class="rating">`` element, i.e. ``"width:52.2px"`` -> ``52.2``.
    It is returned as parsed; no conversion to another scale is applied.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        A single job listing taken from a results page.

    Returns
    -------
    list
        A one-element list holding the parsed ``float``, or ``np.nan`` when
        the element or its ``style`` attribute is missing, or when the
        remaining text is not a number.
    """
    ratings = []

    try:
        style = chunk.find('span', {'class': 'rating'}).get('style')
        width = float(style.replace('width:', '').replace('px', ''))
        ratings.append(width)
    except (AttributeError, ValueError):
        # AttributeError: span missing, or it has no `style` attribute
        # (`get` returned None). ValueError: style text is not a plain number.
        ratings.append(np.nan)

    return ratings

def scrape_salary(chunk):
    """Extract the raw salary text from one search-result element.

    The ``<span class="no-wrap">`` element is treated as a salary only when
    its text contains ``'$'``. The text is returned as found (with non-ASCII
    characters dropped); it is not parsed into a number or normalised to a
    yearly figure.

    Exactly one value is always returned, so the result stays aligned with
    the other per-listing columns. Previously a ``no-wrap`` span without
    ``'$'`` produced an empty list.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        A single job listing taken from a results page.

    Returns
    -------
    list
        A one-element list holding the salary text, or ``np.nan`` when the
        span is missing or does not contain ``'$'``.
    """
    salaries = []

    try:
        text = chunk.find('span', {'class': 'no-wrap'}).get_text()
    except AttributeError:
        # No such span in this listing (`find` returned None).
        salaries.append(np.nan)
        return salaries

    # Drop non-ASCII characters but decode back to text, so the '$' membership
    # test compares text with text on both Python 2 and Python 3 (on Python 3
    # `'$' in <bytes>` raises TypeError).
    text = text.encode('ascii', 'ignore').decode('ascii')

    salaries.append(text if '$' in text else np.nan)

    return salaries
            
def scrape_jobtype(chunk):
    """Extract the job-type text from one search-result element.

    The ``<span class="no-wrap">`` element is treated as a job type only when
    its text does NOT contain ``'$'``. Text containing ``'$'`` is what
    ``scrape_salary`` picks up instead.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        A single job listing taken from a results page.

    Returns
    -------
    list
        A one-element list holding the span text (non-ASCII characters
        dropped), or ``np.nan`` when the span is missing or contains ``'$'``.
    """
    job_types = []

    try:
        text = chunk.find('span', {'class': 'no-wrap'}).get_text()
    except AttributeError:
        # No such span in this listing (`find` returned None).
        job_types.append(np.nan)
        return job_types

    # Drop non-ASCII characters but decode back to text, so the '$' membership
    # test compares text with text on both Python 2 and Python 3 (on Python 3
    # `'$' in <bytes>` raises TypeError).
    text = text.encode('ascii', 'ignore').decode('ascii')

    job_types.append(np.nan if '$' in text else text)

    return job_types
            
def scrape_summary(chunk):
    """Extract the job summary text from a parsed page or element.

    Parameters
    ----------
    chunk : object with a BeautifulSoup-style ``find(name, attrs)`` method
        The element to search for ``<span id="job_summary">``.

    Returns
    -------
    list
        A one-element list holding the summary text, or ``np.nan`` when no
        such element exists.
    """
    summaries = []

    try:
        summaries.append(chunk.find("span", {"id": "job_summary"}).get_text())
    except AttributeError:
        # Missing element: `find` returned None. Other exceptions propagate.
        summaries.append(np.nan)

    return summaries

In [4]:
# find max pages

# Base search URL; the result offset is appended per request below.
url = 'https://www.indeed.com/jobs?q=data&start='
# A timeout keeps one stalled connection from hanging the whole cell.
response = requests.get(url, timeout=30)
html = response.text
max_soup = BeautifulSoup(html, 'lxml')

# The last space-separated token of the #searchCount text is parsed as an
# integer -- presumably the total number of matching jobs (TODO confirm
# against the live page).
total_jobs = int(max_soup.find('div',{'id':'searchCount'}).text.split(' ')[-1].replace(',',''))

# Number of result pages at 10 listings per page, rounded up. Integer
# arithmetic keeps this an int on Python 3 as well as Python 2.
# NOTE(review): max_page is computed but the loop below uses a fixed range.
max_page = (total_jobs + 9) // 10

# One accumulator per output column; every scrape_* helper contributes exactly
# one value per listing, so the lists stay the same length.
job_titles_ = []
companies_ = []
locations_ = []
reviews_ = []
salaries_ = []
job_types_ = []
ratings_ = []

for i in range(0,3490,10):

    # Build the page URL from the unchanged base. Re-assigning `url` here
    # would make the offsets pile up ('...start=0', '...start=010', ...).
    page_url = url + str(i)
    response = requests.get(page_url, timeout=30)
    html = response.text
    soup = BeautifulSoup(html, 'lxml').find_all('div', {"data-tn-component":"organicJob"})

    for chunk in soup:

        # `list.extend` returns None, so its result is not bound to a name;
        # binding it would also overwrite the `job_titles` search-term list.
        job_titles_.extend(scrape_title(chunk))
        companies_.extend(scrape_company(chunk))
        locations_.extend(scrape_location(chunk))
        reviews_.extend(scrape_review(chunk))
        salaries_.extend(scrape_salary(chunk))
        job_types_.extend(scrape_jobtype(chunk))
        ratings_.extend(scrape_rating(chunk))

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print('......... Number of jobs scraped ' + str(i + 10)+ ' .........')

    # Pause between requests to avoid hammering the site.
    time.sleep(3)

results = pd.DataFrame()

results['job_titles'] = job_titles_
results['companies'] = companies_
results['locations'] = locations_
results['reviews'] = reviews_
results['salaries'] = salaries_
results['job_types'] = job_types_
results['ratings'] = ratings_

results.head(20)

......... Number of jobs scraped 10 .........
......... Number of jobs scraped 20 .........
......... Number of jobs scraped 30 .........
......... Number of jobs scraped 40 .........
......... Number of jobs scraped 50 .........
......... Number of jobs scraped 60 .........
......... Number of jobs scraped 70 .........
......... Number of jobs scraped 80 .........
......... Number of jobs scraped 90 .........
......... Number of jobs scraped 100 .........
......... Number of jobs scraped 110 .........
......... Number of jobs scraped 120 .........
......... Number of jobs scraped 130 .........
......... Number of jobs scraped 140 .........
......... Number of jobs scraped 150 .........
......... Number of jobs scraped 160 .........
......... Number of jobs scraped 170 .........
......... Number of jobs scraped 180 .........
......... Number of jobs scraped 190 .........
......... Number of jobs scraped 200 .........
......... Number of jobs scraped 210 .........
......... Number of jo

......... Number of jobs scraped 1740 .........
......... Number of jobs scraped 1750 .........
......... Number of jobs scraped 1760 .........
......... Number of jobs scraped 1770 .........
......... Number of jobs scraped 1780 .........
......... Number of jobs scraped 1790 .........
......... Number of jobs scraped 1800 .........
......... Number of jobs scraped 1810 .........
......... Number of jobs scraped 1820 .........
......... Number of jobs scraped 1830 .........
......... Number of jobs scraped 1840 .........
......... Number of jobs scraped 1850 .........
......... Number of jobs scraped 1860 .........
......... Number of jobs scraped 1870 .........
......... Number of jobs scraped 1880 .........
......... Number of jobs scraped 1890 .........
......... Number of jobs scraped 1900 .........
......... Number of jobs scraped 1910 .........
......... Number of jobs scraped 1920 .........
......... Number of jobs scraped 1930 .........
......... Number of jobs scraped 1940 ..

......... Number of jobs scraped 3450 .........
......... Number of jobs scraped 3460 .........
......... Number of jobs scraped 3470 .........
......... Number of jobs scraped 3480 .........
......... Number of jobs scraped 3490 .........


Unnamed: 0,job_titles,companies,locations,reviews,salaries,job_types,ratings
0,\nData Scientist Intern\n,\n\nNIKE INC,"Beaverton, OR",4325.0,,,52.2
1,\nData Science Intern\n,\n\nExpedia,"Bellevue, WA 98004 (Downtown area)",467.0,,,51.0
2,"\nAnalyst, Reporting & Analytics\n",\n\nT-Mobile,"Bellevue, WA 98006 (Somerset area)",6697.0,,,51.0
3,\nAnalytics Intern - Central Technology - Boul...,\n\nActivision,"Boulder, CO",59.0,,,51.6
4,\nContract Support Assistant\n,\n\nCubic Corporation,Remote,138.0,,,42.6
5,\nData Analyst (Remote)\n,\nFirst San Francisco Partners,Remote,,,,
6,\nData Conversion Specialist\n,\n\nPaylocity,Remote,49.0,,,41.4
7,\nData Analytics Co-Op - Advanced Degrees\n,\n\nAT&T,California,19458.0,,,44.4
8,\nData Scientist Intern\n,\n\nIllumina,"San Diego, CA 92122",99.0,,,42.6
9,\nData Analyst (PCMH)\n,\n\nProspect Medical Systems,Rhode Island,32.0,,,31.8


In [5]:
# Persist the first scraped batch so it can be reloaded without re-scraping.
pd.to_pickle(results, 'results.pickle')

In [11]:
# Second batch: result offsets 3500-5990.

# Base search URL; the result offset is appended per request below.
url = 'https://www.indeed.com/jobs?q=data&start='
# NOTE(review): `max_soup` is not used anywhere in this cell; the fetch is
# kept only so the name is (re)bound exactly as before.
# A timeout keeps one stalled connection from hanging the whole cell.
response = requests.get(url, timeout=30)
html = response.text
max_soup = BeautifulSoup(html, 'lxml')

# One accumulator per output column; every scrape_* helper contributes exactly
# one value per listing, so the lists stay the same length.
job_titles_ = []
companies_ = []
locations_ = []
reviews_ = []
salaries_ = []
job_types_ = []
ratings_ = []

for i in range(3500,6000,10):

    # Build the page URL from the unchanged base. Re-assigning `url` here
    # would make the offsets pile up ('...start=3500', '...start=35003510', ...).
    page_url = url + str(i)
    response = requests.get(page_url, timeout=30)
    html = response.text
    soup = BeautifulSoup(html, 'lxml').find_all('div', {"data-tn-component":"organicJob"})

    for chunk in soup:

        # `list.extend` returns None, so its result is not bound to a name;
        # binding it would also overwrite the `job_titles` search-term list.
        job_titles_.extend(scrape_title(chunk))
        companies_.extend(scrape_company(chunk))
        locations_.extend(scrape_location(chunk))
        reviews_.extend(scrape_review(chunk))
        salaries_.extend(scrape_salary(chunk))
        job_types_.extend(scrape_jobtype(chunk))
        ratings_.extend(scrape_rating(chunk))

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print('......... Number of jobs scraped ' + str(i + 10)+ ' .........')

    # Pause between requests to avoid hammering the site.
    time.sleep(3)

# NOTE: this rebinds `results`; the first batch is only available from the
# pickle written earlier.
results = pd.DataFrame()

results['job_titles'] = job_titles_
results['companies'] = companies_
results['locations'] = locations_
results['reviews'] = reviews_
results['salaries'] = salaries_
results['job_types'] = job_types_
results['ratings'] = ratings_

results.head(20)

......... Number of jobs scraped 3510 .........
......... Number of jobs scraped 3520 .........
......... Number of jobs scraped 3530 .........
......... Number of jobs scraped 3540 .........
......... Number of jobs scraped 3550 .........
......... Number of jobs scraped 3560 .........
......... Number of jobs scraped 3570 .........
......... Number of jobs scraped 3580 .........
......... Number of jobs scraped 3590 .........
......... Number of jobs scraped 3600 .........
......... Number of jobs scraped 3610 .........
......... Number of jobs scraped 3620 .........
......... Number of jobs scraped 3630 .........
......... Number of jobs scraped 3640 .........
......... Number of jobs scraped 3650 .........
......... Number of jobs scraped 3660 .........
......... Number of jobs scraped 3670 .........
......... Number of jobs scraped 3680 .........
......... Number of jobs scraped 3690 .........
......... Number of jobs scraped 3700 .........
......... Number of jobs scraped 3710 ..

......... Number of jobs scraped 5220 .........
......... Number of jobs scraped 5230 .........
......... Number of jobs scraped 5240 .........
......... Number of jobs scraped 5250 .........
......... Number of jobs scraped 5260 .........
......... Number of jobs scraped 5270 .........
......... Number of jobs scraped 5280 .........
......... Number of jobs scraped 5290 .........
......... Number of jobs scraped 5300 .........
......... Number of jobs scraped 5310 .........
......... Number of jobs scraped 5320 .........
......... Number of jobs scraped 5330 .........
......... Number of jobs scraped 5340 .........
......... Number of jobs scraped 5350 .........
......... Number of jobs scraped 5360 .........
......... Number of jobs scraped 5370 .........
......... Number of jobs scraped 5380 .........
......... Number of jobs scraped 5390 .........
......... Number of jobs scraped 5400 .........
......... Number of jobs scraped 5410 .........
......... Number of jobs scraped 5420 ..

Unnamed: 0,job_titles,companies,locations,reviews,salaries,job_types,ratings
0,\nMCM Analytics and Reporting\n,\n\nQuintilesIMS,"Plymouth Meeting, PA",913.0,,,44.4
1,\nMES Application Analyst\n,\n\nFinisar,"Fremont, CA 94538 (Irvington area)",47.0,,,43.2
2,\nJunior Analyst\n,\n\nMorgan Borszcz Consulting,"Cherry Point, NC",11.0,,,51.0
3,\nData Analyst\n,\n\nPlanSource,"Orlando, FL",23.0,,,41.4
4,\nJr. Data Analyst and Programmer (Health Cons...,\n\nLogistics Management Institute,"Tysons, VA",22.0,,,51.0
5,\nData Analyst\n,\n\nGroup O,"San Antonio, TX",85.0,,,40.2
6,\nAssociate Data Scientist\n,\n\nLimeade,"Bellevue, WA 98004 (Downtown area)",6.0,,,30.0
7,\nData Processing Operator - Optics\n,\n\nHarris Corporation,"Kekaha, HI 96752",353.0,,,42.6
8,\nData Administrator\n,\n\nNextgen Technologies,United States,2.0,,,39.0
9,\nBusiness Analyst ? Capital Markets Analytics...,\n\nFirst Guaranty Mortgage Corporation,"Plano, TX 75023",22.0,,,32.4


In [12]:
# Persist the second scraped batch under its own file name.
pd.to_pickle(results, 'results_2.pickle')

In [None]:
# NOTE(review): this cell has no execution count and nothing visible here
# depends on it -- it looks like a leftover manual pause. It adds a 60-second
# stall to "Run All"; confirm whether it can be removed.
time.sleep(60)

In [14]:
# Number of listings in the current batch with no salary recorded.
results['salaries'].isnull().sum()

2496

In [15]:
# Column dtypes and non-null counts for the current batch.
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 7 columns):
job_titles    2500 non-null object
companies     2500 non-null object
locations     2500 non-null object
reviews       2247 non-null float64
salaries      4 non-null object
job_types     0 non-null float64
ratings       2247 non-null float64
dtypes: float64(3), object(4)
memory usage: 136.8+ KB
