In [0]:
# import all dependencies at the top
from time import time
from time import sleep
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from bs4 import BeautifulSoup
import requests


# define a function to process the page
def process_page(soup, jobs):
  """Extract every job listing found in *soup* and append it to *jobs*.

  Parameters:
    soup: parsed results page; must support ``select()`` with CSS selectors
      (the caller passes a BeautifulSoup object).
    jobs: list that is extended in place with one dictionary per listing,
      using the keys 'title', 'company' and 'tags'.

  Returns:
    None; the results are delivered through *jobs*.

  A listing that lacks a title or company element gets None for that field
  rather than raising AttributeError, so a single malformed listing cannot
  abort the whole scrape.
  """
  # every listing on the page lives in an element with class *-job-summary*
  for raw_job in soup.select('.-job-summary'):
    # select_one() yields None when nothing matches, so look before calling get_text()
    title_node = raw_job.select_one('.-title > h2 > a')
    company_node = raw_job.select_one('.-company > span:nth-of-type(1)')

    title = title_node.get_text() if title_node is not None else None
    company = company_node.get_text().strip() if company_node is not None else None
    tags = [tag.get_text() for tag in raw_job.select('.-tags a')] # may be empty

    jobs.append({'title': title, 'company': company, 'tags': tags})

    
# Scraping driver: walks the paginated job listing one page at a time,
# collects every listing into `jobs`, and prints progress after each request.

# prepare for the monitoring logic
start_time = time() # note the system time when the program starts
request_count = 0 # track the number of requests made
elapsed_time = 0.0 # defined up front so the final summary works even if the loop never runs
frequency = 0.0 # requests per second, refreshed after every request

# create a list to store the data in
jobs = []

# variables to handle the request loop
has_next_page = True
MAX_REQUESTS = 100 # do not request more than 100 pages
REQUEST_TIMEOUT = 10 # seconds; without a timeout requests.get() can block indefinitely
page_number = 1
url = 'https://stackoverflow.com/jobs'
headers = {'user-agent': 'jobscraper - school project (myeamail@gmail.com)'}

# strict '<' so that at most MAX_REQUESTS requests are sent
while has_next_page and request_count < MAX_REQUESTS:
  # keep the output clear
  clear_output(wait = True)

  # request the current page of results
  query = {'sort':'i', 'pg': page_number}
  response = None
  try:
    response = requests.get(url, params=query, headers=headers, timeout=REQUEST_TIMEOUT)
  except requests.RequestException as error:
    # network-level failure (timeout, connection error, ...): report it and
    # move on, exactly as for a bad status code, instead of losing the
    # jobs collected so far to an unhandled exception
    warn('Request #: {}, Failed with error: {}'.format(request_count, error))

  if response is not None and response.ok:
    # get the full data from the response
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    process_page(soup, jobs)

    # check for the next page
    # look for the presence of element with class *test-pagination-next*
    next_button = soup.select('.test-pagination-next')
    has_next_page = len(next_button) > 0

  elif response is not None:
    # display a warning if there are any problems
    warn('Request #: {}, Failed with status code: {}'.format(request_count, response.status_code))

  request_count += 1

  # go to sleep for a bit
  # we wait a random 1 to 3 seconds between requests so that
  # we do not hammer the server

  sleep(randint(1,3))

  # output some logs for monitoring
  elapsed_time = time() - start_time
  frequency = request_count / elapsed_time if elapsed_time > 0 else 0.0
  print('Requests: {}, Frequency: {} requests/s, {} jobs processed.'.format(request_count, frequency, len(jobs)))

  # prepare for next iteration
  page_number += 1

print('Scraping complete')
print('Requests: {}, Frequency: {} requests/s, {} jobs processed.'.format(request_count, frequency, len(jobs)))