# IRE jobs

The goal: Scrape [the list of job postings on IRE's website](https://www.ire.org/find-a-job) into a CSV.

In [2]:
# for writing the data to CSV
import csv

# for handling HTTP traffic
import requests
# for parsing HTML
from bs4 import BeautifulSoup

In [3]:
# the User-Agent string of a desktop Firefox browser -- sending it with the
# outgoing request makes the request look like it comes from that browser
# https://useragents.io
firefox_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/119.0'

# the dictionary of custom headers to hand to requests
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
new_headers = {'User-Agent': firefox_user_agent}

In [4]:
# using the new headers, fetch the page -- requests does not time out on its
# own, so pass a timeout (in seconds) to keep this cell from hanging forever
# if the server never answers
# https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts
req = requests.get(
    'https://www.ire.org/find-a-job',
    headers=new_headers,
    timeout=30
)

# and check for HTTP errors -- raises requests.HTTPError on a 4xx/5xx response
req.raise_for_status()

In [None]:
# take a quick peek at the text of the response -- as the last expression
# in the notebook cell, its value is shown as the cell's output
req.text

In [6]:
# parse the HTML text of the response into a BeautifulSoup object, naming
# Python's built-in 'html.parser' as the parsing engine
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
soup = BeautifulSoup(markup=req.text, features='html.parser')

In [None]:
# double-check that we're working with a BeautifulSoup object --
# the cell output shows the type of `soup`
type(soup)

In [8]:
# find the div with the ID 'ire-jobs'
div = soup.find('div', {'id': 'ire-jobs'})

# a shortcut method for searching by ID:
# div = soup.find(id='ire-jobs')

# .find() returns None when nothing matches, so stop here with a clear
# message instead of failing later with an AttributeError on None
if div is None:
    raise RuntimeError("Could not find a div with the ID 'ire-jobs' in the page")

In [None]:
# take a quick peek at the div -- as the last expression in the notebook
# cell, the element found above is shown as the cell's output
div

In [10]:
# collect every anchor (`a`) tag inside the div -- each link is the
# starting point for grabbing the rest of the content in its row
links = div.find_all(name='a')

In [None]:
# the parsed rows accumulate here, one list of values per job posting
data = []

# every link is the starting point for one posting, so handle them one by one
for link in links:
    # the `href` attribute of the `a` tag holds the posting's URL
    url = link['href']

    # the text within the link is the job title --
    # .strip() guards against leading/trailing whitespace
    job_title = link.text.strip()

    # the other pieces of data sit in the elements that follow the link's
    # parent element, so step from one sibling to the next and read the
    # text of each in turn: first the newsroom ...
    cell = link.parent.next_sibling
    newsroom = cell.text.strip()

    # ... then the location, one sibling further along ...
    cell = cell.next_sibling
    location = cell.text.strip()

    # ... and finally the date posted, one more sibling along
    cell = cell.next_sibling
    date_posted = cell.text.strip()

    # 🤌 a shortcut method for accomplishing the same thing in one line using
    # the .next_siblings generator, a list comprehension and multiple assignment:
    # newsroom, location, date_posted = [x.text.strip() for x in link.parent.next_siblings]
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#next-siblings-and-previous-siblings

    # add the row to the tracking list, with the values in the order that
    # they should land in the data file
    data.append([job_title, newsroom, location, date_posted])

In [None]:
# check the results -- as the last expression in the notebook cell, the
# list of parsed rows is shown as the cell's output
data

In [None]:
# finally, write the results to file
# https://docs.python.org/3/library/csv.html

# define the CSV headers, in the same order as the values in each row of `data`
csv_headers = [
    'job_title',
    'newsroom',
    'location',
    'date_posted'
]

# newline='' leaves line endings to the csv module, as its docs recommend.
# The encoding has to name a real codec -- an empty string makes open() raise
# LookupError -- so write the file as UTF-8.
with open('ire-jobs.csv', 'w', newline='', encoding='utf-8') as outfile:

    # create a writer object attached to this open file object
    writer = csv.writer(outfile)

    # write the first row of headers
    writer.writerow(csv_headers)

    # write the list of data we just built
    # note that the method is `writerows`, plural
    writer.writerows(data)