# S.D. WARN notices

Goal: Scrape [the table of WARN notices in South Dakota](https://dlr.sd.gov/workforce_services/businesses/warn_notices.aspx) into a CSV (including links to PDFs, if available).

In [None]:
# generally want to import libraries at the top of your scripts or notebooks

# part of the standard library - we'll use it to write the data into a CSV file
# https://docs.python.org/3/library/csv.html
import csv

# also part of the standard library - we'll use it to turn the links
# to the PDFs into fully qualified URLs
# https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
from urllib.parse import urljoin

# now import the third-party libraries installed separately

# for making HTTP requests
# https://requests.readthedocs.io/
import requests

# for parsing HTML into Python data types
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup

In [None]:
# column names for the CSV, in the order the values get written for each row
headers = ['company', 'city', 'date', 'num_employees', 'pdf_link']

In [None]:
# fetch the page -- and always pass a timeout (in seconds), because without
# one requests will wait indefinitely if the server stops responding
# https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts
req = requests.get(
    'https://dlr.sd.gov/workforce_services/businesses/warn_notices.aspx',
    timeout=30
)

# check for HTTP errors -- this raises an exception if the server
# answered with a 4xx or 5xx status code, and does nothing otherwise
# https://requests.readthedocs.io/en/latest/api/#requests.Response.raise_for_status
req.raise_for_status()

In [None]:
# take a quick peek at the HTML -- the .text attribute of the response
# object that requests.get() handed back (the variable is called `req`,
# but it's the response, not the request)
req.text

In [None]:
# parse the HTML string into a BeautifulSoup object, naming the arguments
# explicitly: `markup` is the HTML to parse, `features` picks the parser
# (here, the HTML parser that ships with Python)
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use
soup = BeautifulSoup(markup=req.text, features='html.parser')

In [None]:
# check what kind of object the BeautifulSoup() call gave us back
type(soup)

In [None]:
# grab the table element -- there's only one on the page (Ctrl+F for <table
# in the source code to confirm)
# find() hands back the first matching element, or None if nothing matches
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find
table = soup.find(name='table')

In [None]:
# display the element that find() returned, to eyeball that it's the right table
table

In [None]:
# collect every row (`tr`, or "table row" element) inside the table
# find_all() hands back a list of the matching elements, or an empty
# list if nothing matches
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
rows = table.find_all(name='tr')

In [None]:
# display the list of rows that find_all() returned
rows

In [None]:
# count the number of rows -- the number of items in the
# list returned by calling the find_all() method -- using the
# built-in Python function len()
# (note that this count includes the table's own header row, which
# gets skipped when we loop over the rows below)
# https://docs.python.org/3/library/functions.html#len
len(rows)

In [None]:
# start an empty list to collect the parsed data
data = []

# use a for loop to iterate over the rows and extract the data
# https://docs.python.org/3/tutorial/controlflow.html#for-statements
# https://realpython.com/python-for-loop/

# also we're going to use list indexing to skip the first row of headers
# in the table (we're supplying our own)
# https://realpython.com/lessons/indexing-and-slicing/
for row in rows[1:]:

    # all of the code indented to this level will run for
    # each item in the `rows` list, and we're using the
    # variable `row` to refer to the current item in the iteration --
    # in other words, this code will be applied to the first item,
    # then to the second, etc. until the list is exhausted
    # https://www.askpython.com/python/python-indentation

    # within this row, find all of the cells
    # (td, or "table data" elements)
    cells = row.find_all('td')

    # we're about to reach into the first four cells by position, which
    # would throw an IndexError on any row that has fewer than that (a
    # spacer row, say, or an extra row of `th` header cells) -- so skip
    # those rows and move on to the next one
    # https://docs.python.org/3/tutorial/controlflow.html#break-and-continue-statements
    if len(cells) < 4:
        continue

    # the company info is in the first [0] cell
    # (counting in python starts at 0)
    company_cell = cells[0]

    # ... but we just want the text for the name, so: company_cell.text
    # ... but we also want to do a little cleaning with a common
    # trick to strip all extraneous whitespace from a string -- splitting and
    # then rejoining it on a single space, so: ' '.join(company_cell.text.split())
    # https://stackoverflow.com/a/1546251
    company = ' '.join(company_cell.text.split())

    # some but not all of these cells contain an anchor tag linking to the PDF,
    # so start by assuming there's no link -- an empty string -- and only
    # overwrite that value if we actually find one
    pdf_link = ''

    pdf_tag = company_cell.find('a')

    # we need to use an if statement to check whether the anchor tag exists
    # https://docs.python.org/3/tutorial/controlflow.html#if-statements
    if pdf_tag:

        # note the indentation level -- all code at this indentation level
        # will fire only if the condition in the top line resolves to True
        # (if calling the find() method to search for an anchor tag (`a`)
        # returns an element rather than None)
        # https://www.freecodecamp.org/news/truthy-and-falsy-values-in-python/

        # if that anchor tag exists, grab the link stored in its `href` attribute ...
        # using the get() method rather than square brackets, so that an anchor
        # tag with no `href` gives us an empty string instead of a KeyError
        pdf_href = pdf_tag.get('href', '').strip()

        # annoyingly, some but not all of these URLs have a leading slash, so
        # rather than gluing strings together, let urljoin() resolve the link
        # against the root of the site -- with or without the leading slash we
        # end up with the same fully qualified URL, and a link that's *already*
        # fully qualified is left alone instead of getting the site root
        # stuck on the front of it
        # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
        if pdf_href:
            pdf_link = urljoin('https://dlr.sd.gov/', pdf_href)

    # city is in the next [1] cell
    city = ' '.join(cells[1].text.split())

    # date is in the next [2] cell
    date = ' '.join(cells[2].text.split())

    # 💪 extra credit: figure out how to use the `datetime` module to validate
    # this date string, then reformat it as a YYYY-MM-DD date string, using the methods
    # strftime(), strptime() and isoformat()
    # https://docs.python.org/3/library/datetime.html

    # number of employees is the last [3] cell
    num_employees = ' '.join(cells[3].text.split())

    # build a list of data for this row, making sure to maintain the same order
    # as the CSV headers defined above
    row_data = [
        company,
        city,
        date,
        num_employees,
        pdf_link
    ]

    # append this list of data to the tracking list above
    # https://docs.python.org/3/tutorial/datastructures.html#more-on-lists
    data.append(row_data)

In [None]:
# take a quick peek at our list of lists -- which conveniently is exactly
# what the csv.writer object we're about to create is expecting
# https://docs.python.org/3/library/csv.html#writer-objects
data

In [None]:
# https://docs.python.org/3/library/csv.html#examples
# open 'warn-sd-data.csv' for writing inside a with block, so the file gets
# closed automatically once the indented code finishes
#   - mode 'w' means write, replacing the file if it already exists
#   - newline='' leaves line endings to the csv module, which keeps
#     blank lines from showing up between rows on PCs
#   - encoding='utf-8' so non-ASCII characters are written correctly
# the open file object is referred to as "outfile" (an arbitrary name)
with open('warn-sd-data.csv', mode='w', newline='', encoding='utf-8') as outfile:

    # create a csv.writer object attached to the file handler
    writer = csv.writer(outfile)

    # the writerows() method (plural!) expects a list of lists, so build one
    # with the list of headers defined above as the first item, followed by
    # each row of parsed data -- the * "unpacks" the items in `data` into
    # this new list -- and write the whole thing to file in one go
    # https://docs.python.org/3/tutorial/controlflow.html#unpacking-argument-lists
    writer.writerows([headers, *data])