# Import the packages

We will use csv so we can write our results to a csv file later 

In [1]:
import csv
import requests
from bs4 import BeautifulSoup

# Setup the URL

The template URL for Reed job searches is https://www.reed.co.uk/jobs/{position}-jobs-in-{location}. We need to define a function that lets us choose which job position and location we want to search for.

In [2]:
def get_url(position, location):
    """Build the Reed search URL for a job position and an optional location.

    Spaces in either argument are replaced with hyphens, matching the
    hyphen-separated words used in Reed search URLs.

    Args:
        position: Job title to search for, e.g. ``'data analyst'``.
        location: Place to search in, e.g. ``'milton keynes'``. Pass an empty
            string to search without a location filter.

    Returns:
        The search URL as a string, e.g.
        ``https://www.reed.co.uk/jobs/data-analyst-jobs-in-milton-keynes``.
    """
    template = 'https://www.reed.co.uk/jobs/{}-jobs'  # web address template
    position = position.strip().replace(' ', '-')  # spaces become hyphens
    location = location.strip().replace(' ', '-')
    url = template.format(position)
    # Only add the "-in-<location>" suffix when a location was given, so an
    # empty location still produces the plain "<position>-jobs" search.
    if location:
        url += '-in-' + location
    return url

We need to call our function and assign its result to the variable url, because inside the function
url is only a local name and is not available globally.

In [3]:
# Build the search URL for "data analyst"; the location argument is left empty.
url = get_url('data analyst','')

In [4]:
# Download the first results page. The timeout stops the cell hanging forever
# if the site does not respond.
response = requests.get(url, timeout=30)

In [5]:
response

<Response [200]>

In [6]:
# Parse the downloaded HTML with Python's built-in parser so it can be searched.
soup = BeautifulSoup(response.text, 'html.parser')


# Getting the information from the HTML

Reed.co.uk has promoted posts that appear at the top of each search page. To avoid having these
appear multiple times in our results, we must remove them. The promoted posts have two classes: job-result-card--promoted and job-result-card. We can use decompose to get rid of every element with the job-result-card--promoted class, as this class is only used for promoted posts.

In [7]:
# Remove every promoted card from the parsed page so that a later search for
# "job-result-card" no longer returns them.
for article in soup.find_all("article", class_="job-result-card--promoted"):
    article.decompose()

Now when we search for the class job-result-card we should only get non-promoted jobs. We can use len() to check that the number of job cards we have matches the number of jobs on the page.

In [8]:
# Collect the remaining job cards on the page.
cards = soup.find_all("article", class_="job-result-card")
   
# Show how many cards were found.
len(cards)


25

The number of cards matches the number of non-promoted job listings per page

# Prototype the model with a single record

Now we can try getting all the information we need for just one card 

In [9]:
# Use the first card on the page as the sample record for prototyping.
card=(cards[0])


First we get the job title

In [10]:
# The title is read from the `title` attribute of the link inside the card's <h3>.
job_title = card.h3.a.get('title')
print(job_title)

Data Analyst


Then we will get the company name

In [11]:
# The company name is the text of the link with class "gtmJobListingPostedBy";
# strip() removes the surrounding whitespace.
company_tag = card.find('a','gtmJobListingPostedBy')
company = company_tag.text.strip()
print(company)

Remit Resources


Next we must get the salary. Salary isn't available for all jobs, so we must handle the case where it is missing to
avoid getting an error

In [12]:
# Salary is optional on a listing: use the tag's text when the tag exists and
# fall back to an empty string otherwise.
salary_tag = card.find('li', 'job-metadata__item--salary')
salary = salary_tag.text.strip() if salary_tag else ''
print(salary)

£38,000 - £42,000 per annum


Next we will get the job location

In [13]:
location_tag = card.find('li', 'job-metadata__item--location')
if location_tag:
    # The location text appears to be spread over several indented lines.
    # Split it on line breaks and on runs of spaces, drop the empty pieces and
    # join the rest with ", ". This does not depend on the exact amount of
    # indentation in the page source.
    location_parts = []
    for line in location_tag.text.splitlines():
        location_parts.extend(chunk.strip() for chunk in line.split('  '))
    location = ', '.join(part for part in location_parts if part)
else:
    location = ''
print(location)

Hatfield, Hertfordshire


We will get the job type if it is available

In [14]:
# The job type is optional too, so default to an empty string when its tag is
# not present on the card.
jobtype_tag = card.find('li', 'job-metadata__item--type')
jobtype = jobtype_tag.text.strip() if jobtype_tag else ''
print(jobtype)

Permanent, full-time


Next we check if the job is work from home. Like the salary tag, this is not always present, so we must handle the case where it is missing.

In [15]:
# Only the presence of the "remote" tag matters here, not its text.
wfh_tag = card.find('li', 'job-metadata__item--remote')
wfh = 'Work from home' if wfh_tag else 'Not Work from home'
print(wfh)

Not Work from home


Next we get the url for the job

In [16]:

# Build the job URL by prefixing the site domain to the href of the card's
# first link.
job_url = 'https://www.reed.co.uk' + card.a.get('href')
print(job_url)

https://www.reed.co.uk/jobs/data-analyst/47376202?source=searchResults&filter=%2fjobs%2fdata-analyst-jobs


Finally we can get the beginning of the job description

In [17]:
# The description preview is the text of the paragraph with class
# "job-result-description__details", with surrounding whitespace removed.
description = card.find('p', 'job-result-description__details').text.strip()
print(description)

Data Analyst needed for this global tech business, in their North London / Hertfordshire Office (Hybrid working 2 days in the office). They are a high performance, data driven business who pride themselves on being the best at what they do. They are growing...


Now we must combine all of this information into a record

In [18]:
# Bundle the extracted fields into one tuple: one record per job card.
record = (job_title, company, location, salary, wfh, jobtype, description, job_url)

Next we check that all the information is present and displaying correctly

In [19]:
record

('Data Analyst',
 'Remit Resources',
 'Hatfield, Hertfordshire',
 '£38,000 - £42,000 per annum',
 'Not Work from home',
 'Permanent, full-time',
 'Data Analyst needed for this global tech business, in their North London / Hertfordshire Office (Hybrid working 2 days in the office). They are a high performance, data driven business who pride themselves on being the best at what they do. They are growing...',
 'https://www.reed.co.uk/jobs/data-analyst/47376202?source=searchResults&filter=%2fjobs%2fdata-analyst-jobs')

# Generalise model with a function

Now we will use a function to generalise the model

In [20]:
def _tag_text(card, name, css_class):
    """Return the stripped text of the first matching tag, or '' if absent."""
    tag = card.find(name, css_class)
    return tag.text.strip() if tag else ''


def _clean_location(raw_text):
    """Join the non-empty pieces of a location tag's text with ', '.

    The text is split on line breaks and on runs of spaces, so the result
    does not depend on how far the page source happens to be indented.
    """
    pieces = []
    for line in raw_text.splitlines():
        pieces.extend(chunk.strip() for chunk in line.split('  '))
    return ', '.join(piece for piece in pieces if piece)


def get_record(card):
    """Extract job data from a single record.

    Args:
        card: The parsed ``<article>`` element of one job listing.

    Returns:
        A tuple ``(job_title, company, location, description, salary,
        job_url)``. Company, location, description and salary are empty
        strings when the corresponding tag is not present on the card.
    """
    job_title = card.h3.a.get('title')
    company = _tag_text(card, 'a', 'gtmJobListingPostedBy')
    salary = _tag_text(card, 'li', 'job-metadata__item--salary')

    location_tag = card.find('li', 'job-metadata__item--location')
    location = _clean_location(location_tag.text) if location_tag else ''

    description = _tag_text(card, 'p', 'job-result-description__details')
    # The href on the card's first link is prefixed with the site domain.
    job_url = 'https://www.reed.co.uk' + card.a.get('href')

    return (job_title, company, location, description, salary, job_url)

Now we run the function and store the results in records

In [21]:
# Start with an empty list and add one record per job card on the first page.
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

Now we will write our results to a CSV file

In [22]:
    # Write a header row followed by one row per record. The header follows the
    # order of the tuple returned by get_record: title, company, location,
    # description, salary, url.
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'Job Description', 'Salary', 'JobUrl'])
        writer.writerows(records)

# Getting the next page

Now that we have the results for the first page, we need to check if there are more pages

In [23]:
# Follow the "next page" link until a results page no longer has one.
while True:
    next_link = soup.find('a', {'data-qa': 'nextPageLnk'})
    if next_link is None:
        break
    url = 'https://www.reed.co.uk' + next_link.get('href')
    # The timeout stops the loop hanging forever if the site stops responding.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Drop the promoted cards again on every page.
    for article in soup.find_all("article", class_="job-result-card--promoted"):
        article.decompose()
    cards = soup.find_all("article", class_="job-result-card")

    for card in cards:
        record = get_record(card)
        records.append(record)

# Putting it all together 

Now we can combine all of our code together

In [24]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

def get_url(position, location, min_salary, max_salary):
    """Build the Reed search URL for a position, location and salary range.

    Spaces in ``position`` and ``location`` are replaced with hyphens. Parts
    that are left empty are omitted from the URL rather than leaving a
    dangling ``-in`` or empty query values.

    Args:
        position: Job title to search for, e.g. ``'data analyst'``.
        location: Place to search in; an empty string means no location.
        min_salary: Lower salary bound; ``''`` or ``None`` means no bound.
        max_salary: Upper salary bound; ``''`` or ``None`` means no bound.

    Returns:
        The search URL as a string, e.g. ``https://www.reed.co.uk/jobs/
        data-analyst-jobs-in-london?salaryfrom=30000&salaryto=50000``
        (shown wrapped here; the real value has no line break).
    """
    from urllib.parse import urlencode  # only needed to build the query string

    position = position.strip().replace(' ', '-')
    location = location.strip().replace(' ', '-')

    url = 'https://www.reed.co.uk/jobs/{}-jobs'.format(position)
    if location:
        url += '-in-' + location

    salary_filters = {'salaryfrom': min_salary, 'salaryto': max_salary}
    query = urlencode(
        {key: value for key, value in salary_filters.items() if value not in ('', None)}
    )
    if query:
        url += '?' + query
    return url


def _tag_text(card, name, css_class):
    """Return the stripped text of the first matching tag, or '' if absent."""
    tag = card.find(name, css_class)
    return tag.text.strip() if tag else ''


def _clean_location(raw_text):
    """Join the non-empty pieces of a location tag's text with ', '.

    The text is split on line breaks and on runs of spaces, so the result
    does not depend on how far the page source happens to be indented.
    """
    pieces = []
    for line in raw_text.splitlines():
        pieces.extend(chunk.strip() for chunk in line.split('  '))
    return ', '.join(piece for piece in pieces if piece)


def get_record(card):
    """Extract job data from a single record.

    Args:
        card: The parsed ``<article>`` element of one job listing.

    Returns:
        A tuple ``(job_title, company, location, salary, wfh, jobtype,
        description, job_url)``. Text fields whose tag is not present on the
        card are returned as empty strings; ``wfh`` is ``'Work from home'``
        when the remote tag is present and ``'Not work from home'`` otherwise.
    """
    job_title = card.h3.a.get('title')
    company = _tag_text(card, 'a', 'gtmJobListingPostedBy')

    location_tag = card.find('li', 'job-metadata__item--location')
    location = _clean_location(location_tag.text) if location_tag else ''

    # The href on the card's first link is prefixed with the site domain.
    job_url = 'https://www.reed.co.uk' + card.a.get('href')

    # These tags do not exist for all jobs, so each falls back to a default.
    salary = _tag_text(card, 'li', 'job-metadata__item--salary')
    if card.find('li', 'job-metadata__item--remote'):
        wfh = 'Work from home'
    else:
        wfh = 'Not work from home'
    jobtype = _tag_text(card, 'li', 'job-metadata__item--type')

    description = _tag_text(card, 'p', 'job-result-description__details')

    return (job_title, company, location, salary, wfh, jobtype, description, job_url)



def main(position, location, min_salary, max_salary, output_path='results.csv'):
    """Scrape every results page of a Reed job search and save it as CSV.

    Starting from the URL built by ``get_url``, each results page is
    downloaded, its promoted cards are removed, and one record per remaining
    job card is collected with ``get_record``. Paging continues while the
    page contains a "next page" link. If a page cannot be fetched, paging
    stops and the records collected so far are still written.

    Args:
        position: Job title to search for.
        location: Place to search in; an empty string means no location.
        min_salary: Lower salary bound passed to ``get_url``.
        max_salary: Upper salary bound passed to ``get_url``.
        output_path: Path of the CSV file to write. Defaults to
            ``'results.csv'``.
    """
    site = 'https://www.reed.co.uk'
    records = []
    url = get_url(position, location, min_salary, max_salary)
    while url:
        try:
            # The timeout stops the crawl hanging on a stalled connection, and
            # raise_for_status turns HTTP error codes into exceptions instead
            # of letting an error page be parsed as if it were results.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            # Keep what has been collected so far rather than losing it.
            print('Stopping early, could not fetch {}: {}'.format(url, error))
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove promoted cards first so they are not collected as results.
        for article in soup.find_all("article", class_="job-result-card--promoted"):
            article.decompose()
        cards = soup.find_all("article", class_="job-result-card")
        records.extend(get_record(card) for card in cards)

        # A page without a "next page" link ends the loop.
        next_link = soup.find('a', {'data-qa': 'nextPageLnk'})
        next_href = next_link.get('href') if next_link else None
        url = site + next_href if next_href else None

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'Salary', 'Work From Home', 'Job Type', 'Job Description', 'JobUrl'])
        writer.writerows(records)

We can run the function now

In [25]:
# Run the scraper for "Data analyst" with the location and both salary bounds
# left empty.
main('Data analyst','','','')