# Web Scraping using BeautifulSoup: Scrape job advertisements in JobsDB

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [2]:
# Switch to the local project folder so that relative file paths used later
# (such as CSV files written or read by name) resolve inside it.
directory = r'C:\Users\lairx78\Desktop\python'
os.chdir(directory)

Create a class whose methods scrape job advertisements from JobsDB for a chosen occupation and number of result pages

In [3]:
class web_scraping:
    """Scrape job advertisements from JobsDB (hk.jobsdb.com) for one occupation.

    Attributes:
        occupation: Search term, e.g. ``'data analyst'``. Spaces are replaced
            by dashes when the search URL is built.
        pages: Number of search-result pages to request, starting at page 1.
        timeout: Seconds to wait for each HTTP response.
        df_cols: Columns that are always present in the resulting DataFrame.
            Additional columns are appended for every extra field found on an
            advertisement's detail page.
    """

    BASE_URL = 'https://hk.jobsdb.com'

    # CSS class strings used to locate elements in the JobsDB markup.
    # NOTE(review): these look like build-generated class names and will
    # presumably stop matching when the site is redeployed - verify before use.
    _TITLE_CLASS = 'z1s6m00'
    _COMPANY_CLASS = 'z1s6m00 bev08l1 _1hbhsw64y _1hbhsw60 _1hbhsw6r'
    _SALARY_CLASS = 'z1s6m00 _1hbhsw64y y44q7i0 y44q7i3 y44q7i21 y44q7ih'
    _HIGHLIGHT_CLASS = 'z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i21 _1d0g9qk4 y44q7i7'
    _LINK_DIV_CLASS = 'z1s6m00 _1hbhsw6ce'
    _INFO_CONTAINER_CLASS = 'z1s6m00 _5135ge0 _5135ge7'
    _INFO_ITEM_CLASS = 'z1s6m00 _1hbhsw6r pmwfa50 pmwfa57'

    def __init__(self, occupation, pages, timeout=30):
        """Store the search settings.

        Args:
            occupation: Occupation / keyword to search for.
            pages: Number of result pages to scrape.
            timeout: Per-request timeout in seconds (default 30), so a stalled
                connection cannot block the scrape forever.
        """
        self.occupation = occupation
        self.pages = pages
        self.timeout = timeout
        self.df_cols = ['job_title', 'company', 'job_description', 'salary', 'location', 'job_highlights', 'job_link']

    def _get_soup(self, url):
        """Download ``url`` and return its HTML parsed with the lxml parser."""
        response = requests.get(url, timeout=self.timeout)
        return BeautifulSoup(response.text, 'lxml')

    def jobsdb_link(self, job_dict):
        """Enrich ``job_dict`` with data from the advertisement's detail page.

        Adds ``'job_description'`` plus one key per "additional information"
        item found on the page (the item's first string is used as the key,
        the remaining strings joined by spaces as the value).

        Args:
            job_dict: Dict holding at least ``'job_link'``, a path relative to
                the JobsDB host.

        Returns:
            The same dict, updated in place. If there is no link, or the
            expected elements are absent from the page, the affected fields
            are left as ``None`` / omitted instead of raising.
        """
        job_link = job_dict.get('job_link')
        if not job_link:
            # Nothing to fetch; keep the record but make the key explicit.
            job_dict.setdefault('job_description', None)
            return job_dict

        soup_link = self._get_soup(self.BASE_URL + job_link)

        # find job description
        description_tag = soup_link.find('div', {'data-automation': 'jobDescription'})
        if description_tag is None:
            job_dict['job_description'] = None
        else:
            job_description = ' '.join(description_tag.find_all(string=True))
            # Drop non-breaking spaces and keep the trailing space the
            # original output format carried.
            job_dict['job_description'] = job_description.replace('\xa0', '') + ' '

        # find additional information
        info_container = soup_link.find('div', {'class': self._INFO_CONTAINER_CLASS})
        if info_container is not None:
            for item in info_container.find_all('div', {'class': self._INFO_ITEM_CLASS}):
                strings = item.find_all(string=True)
                if len(strings) < 2:
                    # A label without a value (or an empty item) carries no data.
                    continue
                # str() detaches the values from the parse tree.
                job_dict[str(strings[0])] = ' '.join(strings[1:])

        return job_dict

    def _parse_listing(self, card):
        """Extract the search-result fields of one advertisement card.

        Args:
            card: Tag of a single child element of the job-listing container.

        Returns:
            Dict with every key of ``self.df_cols`` (``None`` where the value
            is not present), or ``None`` when the element has no title span
            and therefore is not treated as an advertisement.
        """
        title_tag = card.find('span', {'class': self._TITLE_CLASS})
        if title_tag is None:
            return None

        job_dict = dict.fromkeys(self.df_cols)
        job_dict['job_title'] = title_tag.text

        # The first span with the company class is the company; a second one,
        # when present, is read as the location.
        company_spans = card.find_all('span', {'class': self._COMPANY_CLASS})
        if company_spans:
            job_dict['company'] = company_spans[0].text
        if len(company_spans) == 2:
            job_dict['location'] = company_spans[1].text

        # find salary
        salary_spans = card.find_all('span', {'class': self._SALARY_CLASS})
        if len(salary_spans) == 1:
            # A single span is only taken as the salary when the card shows
            # no location span (i.e. exactly one company-class span).
            if len(company_spans) == 1:
                job_dict['salary'] = salary_spans[0].text
        elif len(salary_spans) == 2:
            job_dict['salary'] = salary_spans[1].text

        # find job highlights: ' first, second, third' (leading space kept
        # for compatibility with previously exported files)
        highlights = [span.text for span in card.find_all('span', {'class': self._HIGHLIGHT_CLASS})]
        if highlights:
            job_dict['job_highlights'] = ' ' + ', '.join(highlights)

        # find job_link: with exactly two candidate divs the link sits in the
        # second one, otherwise in the first.
        link_divs = card.find_all('div', {'class': self._LINK_DIV_CLASS})
        if link_divs:
            div_link = link_divs[1] if len(link_divs) == 2 else link_divs[0]
            anchor = div_link.find('a')
            if anchor is not None:
                job_dict['job_link'] = anchor.attrs.get('href')

        return job_dict

    def jobsdb(self, save=False):
        """Scrape the configured result pages and return them as a DataFrame.

        Pages are requested in order starting at 1; scraping stops early at
        the first page that has no (or an empty) job-listing container. For
        every advertisement the detail page is fetched as well.

        Args:
            save: When true, also write the result to
                ``'jobsdb_<occupation>.csv'`` in the current directory.

        Returns:
            DataFrame with one row per advertisement. Columns are
            ``self.df_cols`` followed by any additional-information fields in
            order of first appearance; missing values are ``None``.
        """
        job_ad_count = 0
        rows = []
        occupation_dash = self.occupation.replace(' ', '-')

        for p in range(1, self.pages + 1):
            url = self.BASE_URL + '/hk/search-jobs/' + occupation_dash + '/' + str(p)
            soup = self._get_soup(url)

            outer_most_point = soup.find('div', {'class': 'z1s6m00', 'data-automation': 'jobListing'})

            # stop paging if there is no job advertisement
            if outer_most_point is None or outer_most_point.text == '':
                break

            # Only direct child tags are cards; bare text nodes between them
            # do not support the Tag lookups used below.
            for card in outer_most_point.find_all(True, recursive=False):
                job_dict = self._parse_listing(card)
                if job_dict is None:
                    continue
                job_ad_count += 1

                # find job description and additional info
                rows.append(self.jobsdb_link(job_dict))

        # Fixed columns first, then extra fields in order of first appearance.
        columns = list(self.df_cols)
        known = set(columns)
        for row in rows:
            for key in row:
                if key not in known:
                    known.add(key)
                    columns.append(key)

        # Build the frame once instead of concatenating row by row.
        df_job_ad = pd.DataFrame(
            [[row.get(col) for col in columns] for row in rows],
            columns=columns,
            dtype=object,
        )

        print('Number of job advertisement: ', job_ad_count)

        if save:
            df_job_ad.to_csv('jobsdb_' + self.occupation + '.csv', index=False)

        return df_job_ad


Run the scraping method of the class after supplying two arguments to the __init__ function

There are two arguments:
1. occupation: the type of occupation you want to scrape
2. pages: the number of pages you want to scrape (each page contains 30 job advertisements)

In [4]:
# Scrape a single results page of "data analyst" postings
# (30 job posts in the recorded run) and save them to a CSV file.
web_scraping(occupation='data analyst', pages=1).jobsdb(save=True)

Number of job advertisement:  30


Unnamed: 0,job_title,company,job_description,salary,location,job_highlights,job_link,Career Level,Qualification,Years of Experience,Job Type,Job Functions,Company Website
0,Senior Data Analyst (Business Planning & Analy...,Hutchison Telecommunications (Hong Kong) Limited,"Responsibilities: Perform various design, deve...",,Tsing Yi,"Knowledge with programming / data analysis, D...",/hk/en/job/senior-data-analyst-business-planni...,Entry Level,Non-Degree Tertiary,3 years,"Full Time, Permanent","Information Technology (IT) , DBA , Product ...",
1,Business Analyst / Assistant Business Analyst,BridgeBuilder Company Limited,"To cope with the rapid growth, we are looking...",,Kwai Hing,"Project Implementation, Good analytical-mind,...",/hk/en/job/business-analyst-assistant-business...,Middle,Non-Degree Tertiary,,"Full Time, Permanent","Information Technology (IT) , Application Spe...",
2,(Senior) Data Scientist / Engineer - Operation...,"TCL Corporate Research (Hong Kong) Co., Limited",Job Description This role would participate in...,,Shatin Area,"Perform mathematical modeling, Master Product...",/hk/en/job/senior-data-scientist-engineer-oper...,Middle,,,Full Time,"Engineering , Electrical / Electronics , Inf...",
3,(Senior) Data Scientist,Hong Kong Industrial Artificial Intelligence a...,Hong Kong Industrial Artificial Intelligence a...,,Tai Po Area,Contribute whole life cycle of product develo...,/hk/en/job/senior-data-scientist-1000030106150...,Middle,Postgraduate,7 years,"Full Time, Permanent","Sales, CS & Business Devpt , Business Develop...",
4,Analyst - 2023 Graduate Position,AVISTA Valuation Advisory Limited,Service: Business Valuation Location: Hong Kon...,,Wan Chai,Valuations for financial reporting and transa...,/hk/en/job/analyst-2023-graduate-position-1000...,Entry Level,Degree,,"Full Time, Permanent","Accounting , Financial Analyst , Banking / F...",http://www.avaval.com
5,Business Analyst,Citistore (Hong Kong) Limited,KEY RESPONSIBILITIES: Act as a bridge between ...,,Tsuen Wan Area,"Manage and monitor various IT projects, Min. ...",/hk/en/job/business-analyst-100003010610793?to...,Middle,Degree,5 years,"Full Time, Permanent","Information Technology (IT) , IT Management ,...",
6,System Analyst,"Panasonic Hong Kong Co., Limited",What you’ll be doing? Manage and participate i...,,Mong Kok,Stable MNC in-house exposure with flex work h...,/hk/en/job/system-analyst-100003010617784?toke...,Middle,Non-Degree Tertiary,3 years,"Full Time, Permanent","Information Technology (IT) , Support , Others",http://www.panasonic.com/about/overview.asp
7,Digital Analyst (Ref: MG),Hutchison Telecommunications (Hong Kong) Limited,Responsibilities: Take charge of digital data ...,,Tsing Yi,"Degree, 4+ yrs e-Commerce & digital marketing...",/hk/en/job/digital-analyst-ref%3A-mg-100003010...,Middle,Degree,5 years,"Full Time, Permanent","Marketing / Public Relations , Digital Market...",
8,Business Analyst (Financial Services),Hudson,Working as a Business Analyst/ Senior business...,,,"Business Analyst, Project Manager, Transforma...",/hk/en/job/business-analyst-financial-services...,Entry Level,Degree,,"Full Time, Permanent","Information Technology (IT) , Product Managem...",https://www.hudson.hk
9,System Analyst (F&B),Kabushikigaisha Limited,Responsibilities: Implements computer system r...,,Kowloon Bay,"MS SQL database, VB.net and Python, Business ...",/hk/en/job/system-analyst-f-b-100003010607521?...,Entry Level,Non-Degree Tertiary,4 years,Full Time,"Information Technology (IT) , Testing / QA , ...",


After scraping, the function exports a CSV file containing all the job advertisements (when called with save=True). Read that file back in as a DataFrame:

In [5]:
# Load the exported CSV; its first row holds the column names.
df_data_analyst = pd.read_csv(
    filepath_or_buffer='jobsdb_data analyst.csv',
    header=0,
)

df_data_analyst

Unnamed: 0,job_title,company,job_description,salary,location,job_highlights,job_link,Career Level,Qualification,Years of Experience,Job Type,Job Functions,Company Website
0,Senior Data Analyst (Business Planning & Analy...,Hutchison Telecommunications (Hong Kong) Limited,"Responsibilities: Perform various design, deve...",,Tsing Yi,"Knowledge with programming / data analysis, D...",/hk/en/job/senior-data-analyst-business-planni...,Entry Level,Non-Degree Tertiary,3 years,"Full Time, Permanent","Information Technology (IT) , DBA , Product ...",
1,Business Analyst / Assistant Business Analyst,BridgeBuilder Company Limited,"To cope with the rapid growth, we are looking...",,Kwai Hing,"Project Implementation, Good analytical-mind,...",/hk/en/job/business-analyst-assistant-business...,Middle,Non-Degree Tertiary,,"Full Time, Permanent","Information Technology (IT) , Application Spe...",
2,(Senior) Data Scientist / Engineer - Operation...,"TCL Corporate Research (Hong Kong) Co., Limited",Job Description This role would participate in...,,Shatin Area,"Perform mathematical modeling, Master Product...",/hk/en/job/senior-data-scientist-engineer-oper...,Middle,,,Full Time,"Engineering , Electrical / Electronics , Inf...",
3,(Senior) Data Scientist,Hong Kong Industrial Artificial Intelligence a...,Hong Kong Industrial Artificial Intelligence a...,,Tai Po Area,Contribute whole life cycle of product develo...,/hk/en/job/senior-data-scientist-1000030106150...,Middle,Postgraduate,7 years,"Full Time, Permanent","Sales, CS & Business Devpt , Business Develop...",
4,Analyst - 2023 Graduate Position,AVISTA Valuation Advisory Limited,Service: Business Valuation Location: Hong Kon...,,Wan Chai,Valuations for financial reporting and transa...,/hk/en/job/analyst-2023-graduate-position-1000...,Entry Level,Degree,,"Full Time, Permanent","Accounting , Financial Analyst , Banking / F...",http://www.avaval.com
5,Business Analyst,Citistore (Hong Kong) Limited,KEY RESPONSIBILITIES: Act as a bridge between ...,,Tsuen Wan Area,"Manage and monitor various IT projects, Min. ...",/hk/en/job/business-analyst-100003010610793?to...,Middle,Degree,5 years,"Full Time, Permanent","Information Technology (IT) , IT Management ,...",
6,System Analyst,"Panasonic Hong Kong Co., Limited",What you’ll be doing? Manage and participate i...,,Mong Kok,Stable MNC in-house exposure with flex work h...,/hk/en/job/system-analyst-100003010617784?toke...,Middle,Non-Degree Tertiary,3 years,"Full Time, Permanent","Information Technology (IT) , Support , Others",http://www.panasonic.com/about/overview.asp
7,Digital Analyst (Ref: MG),Hutchison Telecommunications (Hong Kong) Limited,Responsibilities: Take charge of digital data ...,,Tsing Yi,"Degree, 4+ yrs e-Commerce & digital marketing...",/hk/en/job/digital-analyst-ref%3A-mg-100003010...,Middle,Degree,5 years,"Full Time, Permanent","Marketing / Public Relations , Digital Market...",
8,Business Analyst (Financial Services),Hudson,Working as a Business Analyst/ Senior business...,,,"Business Analyst, Project Manager, Transforma...",/hk/en/job/business-analyst-financial-services...,Entry Level,Degree,,"Full Time, Permanent","Information Technology (IT) , Product Managem...",https://www.hudson.hk
9,System Analyst (F&B),Kabushikigaisha Limited,Responsibilities: Implements computer system r...,,Kowloon Bay,"MS SQL database, VB.net and Python, Business ...",/hk/en/job/system-analyst-f-b-100003010607521?...,Entry Level,Non-Degree Tertiary,4 years,Full Time,"Information Technology (IT) , Testing / QA , ...",
