In [28]:
import csv

# will use this to pause briefly between requests, as a courtesy
from time import sleep

# pattern expansion -- will use this to get a handle to the HTML files
# we download later
from glob import glob

import requests
from bs4 import BeautifulSoup

In [11]:
# variable for the base URL, without query parameters
base_url = 'https://www.journalismjobs.com/job-listings'

# set up some initial query parameters -- start on page 1,
# show 100 results for each page
parameters = {
    'page': 1,
    'count': 100
}

In [12]:
# first task: figure out how many pages to iterate through
# by grabbing the last number in the pagination buttons at the bottom
req = requests.get(
    base_url,
    params=parameters
)

req.raise_for_status()

In [13]:
soup = BeautifulSoup(req.text, 'html.parser')

In [14]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<!-- Required meta tags -->
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<meta content="JournalismJobs.com has journalism job and media job listings for online media, newspapers, tv, radio, magazines, nonprofits, and academia." name="description">
<meta content="journalism jobs, media jobs, reporter jobs, writing jobs, coy editing jobs, news producer jobs, online producer jobs" name="keywords">
<link href="/assets/journalismjobs.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="/assets/journalismjobs.com/favicon.ico" rel="icon" type="image/x-icon"/>
<!-- Bootstrap CSS -->
<link crossorigin="anonymous" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" rel="stylesheet"/>
<link href="/css/fontawesome.min.css" rel="stylesheet">
<link href="/css/brands

In [15]:
pagination = soup.find('ul', {'class': 'pagination'})

In [16]:
pagination

<ul class="pagination">
<li class="page-item active"><a class="page-link" onclick="changePage(1)">1</a></li>
<li class="page-item"><a class="page-link" onclick="changePage(2)">2</a></li>
<li class="page-item"><a class="page-link" onclick="changePage(3)">3</a></li>
<li class="page-item"><a class="page-link bg-green" onclick="changePage(2)">Next <i class="fas fa-chevron-right" style="font-size: 0.85rem;"></i></a></li>
</ul>

In [17]:
# grab a list of `li` ("list element") elements
pages = pagination.find_all('li')

In [18]:
pages

[<li class="page-item active"><a class="page-link" onclick="changePage(1)">1</a></li>,
 <li class="page-item"><a class="page-link" onclick="changePage(2)">2</a></li>,
 <li class="page-item"><a class="page-link" onclick="changePage(3)">3</a></li>,
 <li class="page-item"><a class="page-link bg-green" onclick="changePage(2)">Next <i class="fas fa-chevron-right" style="font-size: 0.85rem;"></i></a></li>]

In [21]:
# for this website, the button for the last page is always the second-to-last `li` element
# (the last one is the "Next" button)
# use negative list indexing to grab the second-to-last item
# https://stackoverflow.com/a/11367936
last_page_button = pages[-2]

In [22]:
last_page_button

<li class="page-item"><a class="page-link" onclick="changePage(3)">3</a></li>

In [23]:
# grab just the text, and we're going to use it as a number, so grab the text and
# coerce the string to an integer
last_page = int(last_page_button.text)

In [24]:
last_page

3

In [27]:
# now we know how many pages to iterate through! set up a range and start graabbing pages
# note that the range excludes the last number, so we need to add 1 to the top end of the range
# https://docs.python.org/3/library/functions.html#func-range

# the goal is to fetch each page and write the content to file
for page_num in range(1, last_page+1):

    # use the page number to build the request
    req = requests.get(
        base_url,
        params={
            'page': page_num,
            'count': 100
        }
    )

    # write the HTML to file -- use an f-string to format the filename
    # https://docs.python.org/3/tutorial/inputoutput.html#tut-f-strings
    filename = f'journo-jobs-page-{page_num}.html'
    with open(filename, 'w') as outfile:
        outfile.write(req.text)

    # a little status message
    print(f'Wrote {filename}')

    # pause for half a second
    sleep(0.5)

Wrote journo-jobs-page-1.html
Wrote journo-jobs-page-2.html
Wrote journo-jobs-page-3.html


In [32]:
# use the glob module to get a handle to the HTML files we just wrote
# https://docs.python.org/3/library/glob.html
# the asterisk is a wildcard meaning "zero or more characters of any kind"
html_files = glob('*.html')

In [33]:
html_files

['journo-jobs-page-2.html',
 'journo-jobs-page-3.html',
 'journo-jobs-page-1.html']

In [60]:
# loop over those filenames, open each file and parse out the contents

# first, set up a tracking list for the parsed data
data = []

# loop over the file names
for file in html_files:

    # open the file in read mode and get a handle to the file contents
    with open(file, 'r') as infile:
        html = infile.read()

    # turn the HTML into soup
    soup = BeautifulSoup(html, 'html.parser')

    # get the main div
    div = soup.find('div', {'class': 'main'})

    # each row div is wrapped in an `a` tag, so grab those
    links = div.find_all('a', {'class': 'job-item'})

    # loop over each of those links
    for link in links:

        # grab the relative URL
        url = link['href']

        # turn it into a fully qualified URL using an f-string
        url = f'https://journalismjobs.com{url}'

        # grab the job title
        job_title = link.find('h3').text.strip()

        # newsroom
        newsroom = link.find('div', {'class': 'job-item-company'}).text.strip()

        # location, job type and date posted are all findable by looking up the icon,
        # then finding the text of the parent -- only if it exists (need to test)
        location = link.find('i', {'class': 'fa-map-marker-alt'})
        if location:
            location = location.parent.text.strip()

        job_type = link.find('i', {'class': 'fa-clock'})
        if job_type:
            job_type = job_type.parent.text.strip()

        date_posted = link.find('i', {'class': 'fa-calendar'})
        if date_posted:
            # remove "Posted " while we're at it
            date_posted = date_posted.parent.text.strip().replace('Posted ', '')

        # 💪 extra credit: figure out how to use the `datetime` module to add the
        # current year to the `date_posted` string
        
        # grab the list of industries
        industries = link.find('ul', {'class': 'job-item-industries'})
        
        # start an empty list to keep track of industries (or will remain empty if no list exists)
        industry_list = []
        if industries:
            for li in industries.find_all('li'):
                industry_list.append(li.text.strip())

        # join the list of industries with a comma and a space
        # https://docs.python.org/3/library/stdtypes.html#str.join
        industry_list = ', '.join(industry_list)
        
        row_data = [
            url,
            job_title,
            newsroom,
            location,
            job_type,
            date_posted,
            industry_list            
        ]

        data.append(row_data)

In [62]:
# write to file

# CSV headers, in order
csv_headers = [
    'url',
    'job_title',
    'newsroom',
    'location',
    'job_type',
    'date_posted',
    'industry_list'
]

with open('journo-jobs.csv', 'w') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(csv_headers)
    writer.writerows(data)