In [None]:
# Import libraries

import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [None]:
# Start web scraping the Jobsdb website: download the first page of search
# results for "data analyst" and parse it.

url = 'https://hk.jobsdb.com/hk/search-jobs/data-analyst/1'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser')
# Show only the page title as a sanity check; echoing the whole document floods the output.
soup.title

In [None]:
# A single Jobsdb results page shows many job ads.
# Gather the <div> of every job ad on the page into the list "sectors".

sectors = soup.find_all(name='div', class_='z1s6m00 _1hbhsw6n rqoqz1')
sectors

In [None]:
# Take the first job ad on the page as a sample for working out the field lookups below

sector = sectors[0]

In [None]:
# Text of the job-title element of the first job ad

sector.find('div', class_='z1s6m00 l3gun70 l3gun74 l3gun72').get_text()

In [None]:
# Text of the company-name element of the first job ad

sector.find('span', class_='z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i21 y44q7ih').get_text()

In [None]:
# Text of the location element of the first job ad

sector.find('span', class_='z1s6m00 _1hbhsw64y y44q7i0 y44q7i3 y44q7i21 y44q7ih').get_text()

In [None]:
# Description of the first job ad
# Clean up the extracted text with a helper function

import re
def add_space(text):
    """Insert a space where a capitalised word was glued onto the preceding text.

    A space is added before an uppercase letter only when the character just
    before it is a lowercase letter, a digit, or closing punctuation
    (e.g. 'Analyse dataBuild reports' -> 'Analyse data Build reports').
    Uppercase letters that follow another uppercase letter or whitespace are
    left alone, so acronyms such as 'SQL' stay intact and existing spaces are
    not doubled.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The text with the missing spaces inserted.
    """
    # Both parts of the pattern are zero-width: the lookbehind checks the
    # previous character, the lookahead checks the next one, and the space is
    # inserted between them without consuming either.
    return re.sub(r'(?<=[a-z0-9.,;:!?)])(?=[A-Z])', ' ', text)

add_space(sector.find('ul', class_='z1s6m00 z1s6m03 _5135ge0 _5135ge5').get_text())

In [None]:
# Text of the <time> element (publishing date) of the first job ad

sector.find('time', class_='z1s6m00 _1hbhsw64y').get_text()

In [None]:
# Date on which the scrape is run (data extract date), formatted as YYYY-MM-DD

today = format(datetime.today(), '%Y-%m-%d')

In [None]:
# Create a function that bundles all the web scraping lookups mentioned above.
# Not every job ad carries every piece of information (e.g. job location), so a
# missing element yields an empty string instead of raising an error.

def _text_or_blank(sector, tag_name, css_class):
    """Return the text of the first matching element inside ``sector``, or '' if there is none."""
    element = sector.find(tag_name, class_=css_class)
    return element.getText() if element is not None else ''


def get_record(sector):
    """Extract the fields of one job ad.

    Parameters
    ----------
    sector : object with ``find(name, class_=...)``
        The parsed element wrapping a single job ad (as produced by the
        ``find_all`` call that builds ``sectors``).

    Returns
    -------
    tuple
        ``(job_title, company, location, description, post_date, extract_date)``.
        Any field whose element is absent from the ad is ''. ``extract_date``
        is today's date formatted as YYYY-MM-DD.
    """
    job_title = _text_or_blank(sector, 'div', 'z1s6m00 l3gun70 l3gun74 l3gun72')
    company = _text_or_blank(sector, 'span', 'z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i21 y44q7ih')
    location = _text_or_blank(sector, 'span', 'z1s6m00 _1hbhsw64y y44q7i0 y44q7i3 y44q7i21 y44q7ih')

    # Only run the spacing clean-up when there is a description to clean.
    raw_description = _text_or_blank(sector, 'ul', 'z1s6m00 z1s6m03 _5135ge0 _5135ge5')
    description = add_space(raw_description) if raw_description else ''

    post_date = _text_or_blank(sector, 'time', 'z1s6m00 _1hbhsw64y')
    extract_date = datetime.today().strftime('%Y-%m-%d')

    return (job_title, company, location, description, post_date, extract_date)

In [None]:
# Build one record per job ad found on the current page

records = []

for sector in sectors:
    records.append(get_record(sector))

In [None]:
# Inspect the records collected so far

records

In [None]:
# Scrape every remaining page of the Jobsdb results by following the "Next page" button.
# Leave the loop when a page has no usable "Next page" link (i.e. the final page is reached).

visited_urls = {url}  # pages already fetched; guards against a link that loops back

while True:
    next_button = soup.find('div', class_='z1s6m00 _1hbhsw6ce _1hbhsw6p')
    next_link = next_button.a if next_button is not None else None
    next_href = next_link.get('href') if next_link is not None else None
    if not next_href:
        # No button, no <a> inside it, or no href on the <a>: nothing left to follow.
        break

    url = 'https://hk.jobsdb.com' + next_href
    if url in visited_urls:
        # The "next" link points at a page that was already scraped; stop rather than loop forever.
        break
    visited_urls.add(url)

    # A timeout avoids hanging on a stalled connection; raise_for_status() stops
    # the run on an HTTP error instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html_text = response.text
    soup = BeautifulSoup(html_text, 'html.parser')
    sectors = soup.find_all('div', class_='z1s6m00 _1hbhsw6n rqoqz1')

    for sector in sectors:
        record = get_record(sector)
        records.append(record)

In [None]:
# Write the header row followed by every collected record to result.csv

header = ['Job_title','Company','Location','Description','Post_date','Extract_date']
with open('result.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows([header, *records])