## Scraping Business Intelligence Analyst jobs from Seek.com.au using Beautiful Soup

In [1]:
# Importing libraries
import re
import pandas as pd
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import requests
from bs4 import BeautifulSoup
import json
from time import sleep

In [2]:
# Three annual salary bands to query. Each band is scraped separately so that
# every listing can be labelled with the band it was found under.
salary_ranges = ['50000-80000', '80000-120000', '120000-150000']

# Maps each salary band to the list of job-title anchor tags collected for it
job_urls = {}

for salary_range in salary_ranges:
    job_urls[salary_range]=[]
    for page in range(1,40):
        # result pages 1 to 39 (the upper bound of range() is exclusive)
        r = requests.get("https://www.seek.com.au/business-intelligence-analyst-jobs/in-All-Sydney-NSW?page={0}&salaryrange={1}&salarytype=annual".format(str(page),salary_range))

        # turn into a BeautifulSoup object
        soup = BeautifulSoup(r.text, 'lxml')

        # every job title on the page is an element tagged data-automation="jobTitle"
        url = soup.find_all(attrs={'data-automation':"jobTitle"})

        # pages past the last result contain no such elements; nothing to add then
        if url:
            job_urls[salary_range].extend(url)

In [3]:
# Frame for the 50000-80000 band: one row per scraped job-title anchor,
# with the band repeated on every row.
df_1 = pd.DataFrame().assign(**{
    'Link': job_urls['50000-80000'],
    'Salary Range': '50000-80000',
})

In [4]:
# Preview the first rows of the 50000-80000 frame
df_1.head()

Unnamed: 0,Link,Salary Range
0,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000
1,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000
2,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000
3,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000
4,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000


In [5]:
# Frame for the 80000-120000 band: one row per scraped job-title anchor,
# with the band repeated on every row.
df_2 = pd.DataFrame().assign(**{
    'Link': job_urls['80000-120000'],
    'Salary Range': '80000-120000',
})

In [6]:
# Frame for the 120000-150000 band: one row per scraped job-title anchor,
# with the band repeated on every row.
df_3 = pd.DataFrame().assign(**{
    'Link': job_urls['120000-150000'],
    'Salary Range': '120000-150000',
})

In [7]:
# Stack the three per-band frames into one. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
# Row labels are kept as they are (each band restarts at 0), exactly as the
# chained append calls did.
url_pd = pd.concat([df_1, df_2, df_3])

In [8]:
# Inspect the last rows of the combined frame
url_pd.tail()

Unnamed: 0,Link,Salary Range
335,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",120000-150000
336,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",120000-150000
337,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",120000-150000
338,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",120000-150000
339,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",120000-150000


In [9]:
def _job_url(anchor):
    """Build the absolute Seek URL of a job from its title anchor tag.

    Parameters
    ----------
    anchor : object exposing ``attrs['href']`` (the scraped job-title tag)

    Returns
    -------
    str
        'https://www.seek.com.au' followed by the '/job/<digits>' part of the
        href; anything after the id (query string, tracking data) is dropped.

    Raises
    ------
    ValueError
        If the href contains no '/job/<digits>' path.
    """
    href = anchor.attrs['href']
    # Raw string: '\/' is an invalid escape in a normal literal. The id length
    # is not pinned to 8 digits so longer ids are not silently truncated.
    match = re.search(r'/job/[0-9]+', href)
    if match is None:
        raise ValueError('No /job/<id> path found in href: {!r}'.format(href))
    return 'https://www.seek.com.au' + match.group(0)


url_pd['Link_1'] = [_job_url(anchor) for anchor in url_pd['Link']]

In [10]:
# Check the newly added 'Link_1' column of job URLs
url_pd.head()

Unnamed: 0,Link,Salary Range,Link_1
0,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000,https://www.seek.com.au/job/40099652
1,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000,https://www.seek.com.au/job/40089444
2,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000,https://www.seek.com.au/job/40001520
3,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000,https://www.seek.com.au/job/40083452
4,"<a class=""_2iNL7wI"" data-automation=""jobTitle""...",50000-80000,https://www.seek.com.au/job/40099652


In [11]:
# The visible text of each job-title anchor is the advertised job title
url_pd['Title'] = list(map(lambda anchor: anchor.text, url_pd['Link']))

In [12]:
# Discard the 'Link' column now that later cells no longer need it
url_pd = url_pd.drop(columns=['Link'])

In [13]:
# Confirm the 'Link' column is gone and 'Title' is present
url_pd.head()

Unnamed: 0,Salary Range,Link_1,Title
0,50000-80000,https://www.seek.com.au/job/40099652,Business Intelligence Analyst - Australia's #1...
1,50000-80000,https://www.seek.com.au/job/40089444,Experienced Actuarial and Analytics Consultant...
2,50000-80000,https://www.seek.com.au/job/40001520,Business Analysis/Business Intelligence (BA/BI...
3,50000-80000,https://www.seek.com.au/job/40083452,Business Intelligence Reporting Analyst
4,50000-80000,https://www.seek.com.au/job/40099652,Business Intelligence Analyst - Australia's #1...


In [14]:
# Column layout expected at this point in the notebook
columns = ['Salary Range', 'Link', "Title"]

# Rename by name rather than by position: a positional relabel would silently
# attach the wrong labels if the column order ever changed upstream.
url_pd = url_pd.rename(columns={'Link_1': 'Link'})

# Fail loudly here if the frame does not have exactly the expected columns
assert list(url_pd.columns) == columns, (
    'Unexpected columns: {}'.format(list(url_pd.columns))
)

In [15]:
# Verify the column labels after renaming
url_pd.head()

Unnamed: 0,Salary Range,Link,Title
0,50000-80000,https://www.seek.com.au/job/40099652,Business Intelligence Analyst - Australia's #1...
1,50000-80000,https://www.seek.com.au/job/40089444,Experienced Actuarial and Analytics Consultant...
2,50000-80000,https://www.seek.com.au/job/40001520,Business Analysis/Business Intelligence (BA/BI...
3,50000-80000,https://www.seek.com.au/job/40083452,Business Intelligence Reporting Analyst
4,50000-80000,https://www.seek.com.au/job/40099652,Business Intelligence Analyst - Australia's #1...


In [16]:
# Helper for replacing several substrings in one pass (used when cleaning job descriptions)
import re

def replace(string, substitutions):
    """Replace every occurrence of each key of ``substitutions`` in ``string``.

    Keys are matched literally (regex metacharacters are escaped) in a single
    left-to-right pass, so text produced by one replacement is never scanned
    again. Longer keys are tried first so that a key which is a prefix of
    another key cannot shadow it.

    Parameters
    ----------
    string : str
        Text to process.
    substitutions : dict of str -> str
        Maps each substring to find to its replacement.

    Returns
    -------
    str
        ``string`` with all substitutions applied; returned unchanged when
        ``substitutions`` is empty.
    """
    if not substitutions:
        # With no keys the alternation would be the empty pattern, which
        # matches at every position and then fails the '' lookup with KeyError.
        return string

    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

In [17]:
# Result lists, filled in lock-step: after the loop each of them (except
# job_description_strong, which this cell only initialises) holds one entry
# per row of url_pd['Link'], in the same order.
job_listing_date=[]
job_expiry_date=[]
job_title = []
job_teaser=[]
job_advertiser=[]
job_area=[]
job_worktype=[]
job_classification=[]
job_salary=[]
job_salary_type=[]
job_description_details=[]
job_description_strong = []
job_location =[]

# Lists that receive the 'N/A' placeholder when a job page cannot be fetched
_text_fields = (
    job_listing_date, job_expiry_date, job_title, job_teaser, job_advertiser,
    job_area, job_worktype, job_classification, job_salary, job_salary_type,
    job_location,
)


def _or_na(value):
    """Return ``value``, or the 'N/A' placeholder when it is falsy (None, '', ...)."""
    return value if value else 'N/A'


def _get_with_retry(url, wait_seconds=5):
    """Fetch ``url``; on a request error wait ``wait_seconds`` and try once more.

    Returns the response, or None when both attempts fail.
    """
    for attempt in range(2):
        try:
            return requests.get(url)
        except requests.exceptions.RequestException:
            if attempt == 0:
                sleep(wait_seconds)
    return None


for job in url_pd['Link']:
    # iterate through all the urls
    r = _get_with_retry(job)
    if r is None:
        # Record placeholders instead of re-parsing the previous response, so
        # every result list stays aligned with the rows of url_pd.
        print('Could not fetch {}; recording N/A'.format(job))
        for field_values in _text_fields:
            field_values.append('N/A')
        job_description_details.append(None)
        continue

    # turn into a BeautifulSoup object
    soup = BeautifulSoup(r.text, 'lxml')

    # The job metadata is embedded as JSON in the element tagged
    # data-automation="server-state". The slicing below strips the text that
    # surrounds the JSON object; the offsets are tied to the page layout at
    # the time of writing -- NOTE(review): re-check them if parsing fails.
    jd_data = soup.find_all(attrs={'data-automation':'server-state'})[0]
    jd_raw= jd_data.text[3:].split('\n  ')[1][25:][:-1]
    jd_dict = json.loads(jd_raw)
    jd_dashboard = jd_dict['jobdetails']['result']

    # Missing or empty values are stored as 'N/A'
    job_listing_date.append(_or_na(jd_dashboard['listingDate']))
    job_expiry_date.append(_or_na(jd_dashboard['expiryDate']))
    job_title.append(_or_na(jd_dashboard['title']))
    job_teaser.append(_or_na(jd_dashboard['teaser']))
    job_advertiser.append(_or_na(jd_dashboard['advertiser']['description']))
    job_area.append(_or_na(jd_dashboard['locationHierarchy']['area']))
    job_worktype.append(_or_na(jd_dashboard['workType']))
    job_classification.append(_or_na(jd_dashboard['classification']['description']))
    job_salary.append(_or_na(jd_dashboard['salary']))
    job_salary_type.append(_or_na(jd_dashboard['salaryType']))
    job_location.append(_or_na(jd_dashboard['locationHierarchy']['city']))

    # Raw description element (None when the page has no such element)
    jd_detail = soup.find(attrs={'data-automation':'mobileTemplate'})
    job_description_details.append(jd_detail)
    

In [18]:
# Attach each scraped list to url_pd as a column. The lists are positional:
# entry i of every list belongs to row i of the frame.
scraped_columns = [
    ('Listing Date', job_listing_date),
    ('Expiry Date', job_expiry_date),
    ('Job Title', job_title),
    ('Job Teaser', job_teaser),
    ('Advertiser', job_advertiser),
    ('Area', job_area),
    ('Work Type', job_worktype),
    ('Classification', job_classification),
    ('Salary', job_salary),
    ('Salary Type', job_salary_type),
    ('JD', job_description_details),
    ('Location', job_location),
]
for column_label, column_values in scraped_columns:
    url_pd[column_label] = column_values

In [19]:
# Check the container type of the raw job-description results (the same list
# that was stored in the 'JD' column) before cleaning them
type(job_description_details)

list

In [20]:
# Outputs, one entry per element of job_description_details:
#   job_description_clean  - description text with the characters below collapsed
#   job_description_strong - list of the texts of all <strong> elements
# Entries with no description element get 'N/A' in both lists.
job_description_clean=[]
job_description_strong = []

# Substrings replaced by a double space in the cleaned text.
# NOTE(review): u'\xe2\x80\x9d' looks like the UTF-8 bytes of a right double
# quote read as Latin-1 -- confirm it still occurs in the fetched text.
repla = {u'\xa0':'  ',u'\xe2\x80\x9d':'  ', u'\n':'  '}

for detail in job_description_details:
    if detail is None:
        # No description element was found for this job
        job_description_strong.append('N/A')
        job_description_clean.append('N/A')
        continue

    # Compute both values before appending so the two lists can never end up
    # with different lengths if one of the steps fails.
    strong_word_list = [tag.get_text(strip=True) for tag in detail.find_all('strong')]
    clean_text = replace(detail.get_text(), repla)

    job_description_strong.append(strong_word_list)
    job_description_clean.append(clean_text)

In [21]:
# Store the cleaned description text and the emphasised (<strong>) phrases
url_pd['Job Description'], url_pd['Strong Words'] = (
    job_description_clean,
    job_description_strong,
)

In [22]:
# Work on an independent deep copy so url_pd itself is left untouched
business_intelligence_jobs = url_pd.copy(deep=True)

In [23]:
# Persist the dataset as CSV in the current working directory
business_intelligence_jobs.to_csv(path_or_buf='business_intelligence_jobs.csv')

In [24]:
# Inspect the last ten rows of the final dataset
business_intelligence_jobs.tail(10)

Unnamed: 0,Salary Range,Link,Title,Listing Date,Expiry Date,Job Title,Job Teaser,Advertiser,Area,Work Type,Classification,Salary,Salary Type,JD,Location,Job Description,Strong Words
330,120000-150000,https://www.seek.com.au/job/40097109,Consumer Insights Analyst,2019-10-04T04:50:24.000Z,2019-11-03T12:59:59.000Z,Consumer Insights Analyst,Exciting Commercial role business partnering w...,Talent Web Recruitment,North Shore & Northern Beaches,Full Time,Marketing & Communications,$110000.00 - $120450.00 p.a.,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,North SydneyInfluence Marketing Campaigns with...,"[COMPANY, ROLE, Requirements:]"
331,120000-150000,https://www.seek.com.au/job/40147122,"Senior Insights Analyst - SAS, R or SQL",2019-10-11T06:20:38.000Z,2019-11-10T12:59:59.000Z,"Senior Insights Analyst - SAS, R or SQL",We are currently seeking a Senior Insights Ana...,Bluefin Resources Pty Limited,"CBD, Inner West & Eastern Suburbs",Full Time,Banking & Financial Services,,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,We are currently seeking Insight Analysts & Se...,[]
332,120000-150000,https://www.seek.com.au/job/40120897,"Senior Analyst, Financial Risk Analytics",2019-10-08T23:37:27.000Z,2019-11-07T23:37:26.000Z,"Senior Analyst, Financial Risk Analytics",This pivotal role will see you actively facili...,QBE,"CBD, Inner West & Eastern Suburbs",Full Time,Banking & Financial Services,,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,Financial Risk Analytics| Investment Risk | Gl...,"[Senior Analyst, Financial Risk Analytics, Abo..."
333,120000-150000,https://www.seek.com.au/job/40063017,Consumer Insights Analyst,2019-10-01T00:26:15.000Z,2019-10-31T12:59:59.000Z,Consumer Insights Analyst,Exciting Commercial role business partnering w...,Talent Web Recruitment,North Shore & Northern Beaches,Full Time,Consulting & Strategy,$110000.00 - $120450.00 p.a.,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,North SydneyInfluence Marketing Campaigns with...,"[COMPANY, ROLE, Requirements:]"
334,120000-150000,https://www.seek.com.au/job/40063931,FP&A Analyst,2019-10-01T01:15:42.000Z,2019-10-31T12:59:59.000Z,FP&A Analyst,An exciting opportunity has arisen for an expe...,Talenza,"CBD, Inner West & Eastern Suburbs",Full Time,Banking & Financial Services,$100k - $120k p.a. + Super,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,The role will assist in monitoring of division...,"[Key Responsibilities:, Key Requirements:]"
335,120000-150000,https://www.seek.com.au/job/40158236,AUEXP173061- IAM Technical Support Specialist,2019-10-14T04:47:41.000Z,2019-11-13T12:59:59.000Z,AUEXP173061- IAM Technical Support Specialist,Experienced identity and access management pro...,Deloitte,"CBD, Inner West & Eastern Suburbs",Full Time,Information & Communication Technology,,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,Risk Advisory - Cyber Intelligence Centre - IA...,[]
336,120000-150000,https://www.seek.com.au/job/40098049,Power System Engineer/Analyst/Software Develop...,2019-10-04T05:39:42.000Z,2019-11-03T05:39:41.000Z,Power System Engineer/Analyst/Software Develop...,AEMO are responsible for operating Australia’s...,Australian Energy Market Operator (AEMO),"CBD, Inner West & Eastern Suburbs",Full Time,"Mining, Resources & Energy",,AnnualCommission,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,About the Role: Our Operations team is looking...,"[About the Role, Key accountabilities:, Requir..."
337,120000-150000,https://www.seek.com.au/job/40096110,Superannuation Consultant,2019-10-04T03:54:31.000Z,2019-11-03T12:59:59.000Z,Superannuation Consultant,You will wear 2 hats in this role - providing ...,B & K Consulting,"CBD, Inner West & Eastern Suburbs",Full Time,Insurance & Superannuation,$120k - $140k p.a. + Superannuation,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,Our client is a leading provider of Superannua...,[]
338,120000-150000,https://www.seek.com.au/job/40249953,Bid and Proposal Manager - Cyber Security Serv...,2019-10-25T05:28:00.000Z,2019-11-24T05:27:58.000Z,Bid and Proposal Manager - Cyber Security Serv...,New role for proven bid and proposal manager t...,Enosys Solutions Pty Ltd,"CBD, Inner West & Eastern Suburbs",Full Time,Information & Communication Technology,,AnnualCommission,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,Only Australian Citizens or Permanent Resident...,[Only Australian Citizens or Permanent Residen...
339,120000-150000,https://www.seek.com.au/job/40125574,Marketing Executive - Conversions & Expansion ...,2019-10-09T04:48:58.000Z,2019-11-08T04:48:56.000Z,Marketing Executive - Conversions & Expansion ...,Growing Hormone Clinic looking for Marketing S...,The Lucy Rose Clinic,"CBD, Inner West & Eastern Suburbs",Full Time,Marketing & Communications,,AnnualPackage,"<div class=""_2e4Pi2B"" data-automation=""mobileT...",Sydney,The Lucy Rose Clinic is a leading integrative ...,[]
