# Web Scraping
Web scraping Seek.co.nz for job listings

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

Below, I've created a function which draws on the BeautifulSoup library.

The function I've defined scrapes multiple search-result pages for raw HTML, parses each job posting into fields such as role, company, sector, region and salary, and collects the postings as a list of dictionaries (one per job) so that they can easily be converted into a CSV file.

In [4]:
# Defining a function named 'extract' which parses job data (role, company, sector, industry, region, summary, etc.) from the HTML of
# multiple pages and organises it into a list of dictionaries (one per job) for easy conversion to CSV.

# Output column name -> (tag, data-automation value) of the element holding that
# field inside a job card. The order here is the key order of every job record.
_JOB_FIELDS = (
    ('role', 'a', 'jobTitle'),
    ('company', 'a', 'jobCompany'),
    ('when_posted', 'span', 'jobListingDate'),
    ('sector', 'a', 'jobClassification'),
    ('industry', 'a', 'jobSubClassification'),
    ('region', 'a', 'jobLocation'),
    ('district', 'a', 'jobArea'),
    ('salary', 'span', 'jobSalary'),
    ('summary', 'span', 'jobShortDescription'),
)


def _field_text(card, tag, automation):
    """Return the text of the first matching element in ``card``, or '' if there is none."""
    node = card.find(tag, {'data-automation': automation})
    # find() returns None when nothing matches; test for that explicitly instead
    # of relying on a bare ``except`` to swallow the resulting AttributeError.
    return '' if node is None else node.text


def extract(current: int, max: int) -> list:
    """Scrape seek.co.nz data-analyst search-result pages into a list of job records.

    Pages ``current`` through ``max`` (both inclusive) are requested in order and
    each page's URL is printed as it is requested.

    Args:
        current: Number of the first result page to fetch.
        max: Number of the last result page to fetch. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        A list with one dict per job card found, with the keys ``role``,
        ``company``, ``when_posted``, ``sector``, ``industry``, ``region``,
        ``district``, ``salary`` and ``summary``. A field whose element is
        missing from the card is the empty string. Bullet characters are
        stripped from ``summary``. The list is empty when ``current > max``.

    Raises:
        Errors raised by ``requests.get`` (connection failures, timeouts) are
        not caught. Pages answered with a non-OK HTTP status are reported and
        skipped rather than raised.
    """
    joblist = []
    for page in range(current, max + 1):
        current_url = f'https://www.seek.co.nz/data-analyst-jobs?page={page}'
        print(current_url)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(current_url, timeout=30)
        if not response.ok:
            # Do not parse an error page as if it were search results.
            print(f'Skipping page {page}: HTTP {response.status_code}')
            continue
        soup = BeautifulSoup(response.content, 'lxml')
        # NOTE(review): this matches on what look like generated CSS class names;
        # if the site's markup changes the selector will silently match nothing.
        cards = soup.find_all('article', class_ = 'yvsb870 yvsb871 h3f08h1 _14uh9946i _14uh9947i _14uh9949m _14uh9948m _14uh9945a h3f08h5')
        for card in cards:
            job = {
                column: _field_text(card, tag, automation)
                for column, tag, automation in _JOB_FIELDS
            }
            job['summary'] = job['summary'].replace('•', '')
            joblist.append(job)

    return joblist


In [5]:
# Scrape the data analyst job listings from result pages 1 through 10 (both inclusive).
# (The URLs shown in blue below are printed as each page is loaded.)
df = extract(current=1, max=10)

https://www.seek.co.nz/data-analyst-jobs?page=1
https://www.seek.co.nz/data-analyst-jobs?page=2
https://www.seek.co.nz/data-analyst-jobs?page=3
https://www.seek.co.nz/data-analyst-jobs?page=4
https://www.seek.co.nz/data-analyst-jobs?page=5
https://www.seek.co.nz/data-analyst-jobs?page=6
https://www.seek.co.nz/data-analyst-jobs?page=7
https://www.seek.co.nz/data-analyst-jobs?page=8
https://www.seek.co.nz/data-analyst-jobs?page=9
https://www.seek.co.nz/data-analyst-jobs?page=10


In [6]:
# Build a DataFrame from the job records returned by extract and save it as a CSV file
df = pd.DataFrame(data=df)
df.to_csv(path_or_buf='seek_data_analyst_10.csv')