In [None]:
import csv
from datetime import datetime
from urllib.parse import urljoin

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup

from fixes import journo_fixes

In [None]:
# output file for the scraped records, and the TDCJ page they come from
CSV_FILE = 'tx-death-row-media-list.csv'
URL = 'http://www.tdcj.state.tx.us/death_row/dr_media_witness_list.html'

# the page is fetched without certificate verification, which would
# otherwise emit an InsecureRequestWarning -- silence that warning up front
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [None]:
# download the listing page; verify=False skips TLS certificate validation,
# and the timeout (seconds) keeps a stalled server from hanging the notebook
r = requests.get(URL, verify=False, timeout=30)

# stop here on any 4xx/5xx response instead of parsing an error page
r.raise_for_status()

In [None]:
data = []

# CSV column order, declared up front (rather than read from the first
# saved record) so the export still works when no records were scraped
csv_headers = [
    'execution_number',
    'execution_date',
    'journalist_name_last',
    'journalist_name_rest',
    'journalist_affiliation',
    'inmate_number',
    'inmate_name_last',
    'inmate_name_rest',
    'url',
]


def _split_journo(journo):
    """Split a "First Last, Affiliation" entry into its pieces.

    Returns a ``(last, rest, affiliation)`` tuple. The name is the text
    before the first comma and the affiliation is the text after the last
    comma; anything between two commas is dropped. An entry with no comma
    is returned whole as ``last`` with ``rest`` and ``affiliation`` set to
    ``None``. A one-word name keeps its affiliation and gets ``rest=None``.
    """
    if ',' not in journo:
        return journo, None, None

    affiliation = journo.rsplit(',', 1)[1].strip()
    name = journo.split(',')[0].strip()

    # rpartition never raises: a one-word name comes back as ('', '', name)
    rest, _, last = name.rpartition(' ')
    return last, rest or None, affiliation


# find the table
table = BeautifulSoup(r.text, 'html.parser').find('table')
if table is None:
    raise RuntimeError(f'No <table> found at {URL}')

# get the rows (minus the headers)
rows = table.find_all('tr')[1:]

# loop over the rows
for row in rows:

    # get all the TD tags
    # (cell 1 is never read directly -- presumably it holds the link; confirm)
    cells = row.find_all('td')

    execution_no = cells[0].text.strip()
    url = ''

    # see if there's a link -- find() returns None when the row has no <a>
    anchor = row.find('a')
    link = anchor.get('href') if anchor is not None else None
    if link:

        # build a fully qualified URL
        # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
        url = urljoin(
            URL,
            link
        )

    inmate_last = cells[2].text.strip()
    inmate_rest = cells[3].text.strip()
    inmate_no = cells[4].text.strip()

    # validate the execution date and format YYYY-MM-DD
    # (strptime raises ValueError on anything that isn't MM/DD/YYYY)
    execution_date = datetime.strptime(
        cells[5].text.strip(),
        '%m/%d/%Y'
    ).date().isoformat()

    # grab the list of media (semicolon-delimited); .text is used instead
    # of .string because .string is None for empty or multi-child cells
    media_list = [x.strip() for x in cells[6].text.split(';') if x.strip()]  # noqa

    for journo in media_list:

        # see if there's a fix listed
        journo = journo_fixes.get(journo, journo)

        # separate out the name pieces + affiliation
        journo_last, journo_rest, journo_affiliation = _split_journo(journo)

        # add the data to the tracking list
        data.append({
            'execution_number': execution_no,
            'execution_date': execution_date,
            'journalist_name_last': journo_last,
            'journalist_name_rest': journo_rest,
            'journalist_affiliation': journo_affiliation,
            'inmate_number': inmate_no,
            'inmate_name_last': inmate_last,
            'inmate_name_rest': inmate_rest,
            'url': url
        })

In [None]:
# dump the collected records to disk: header row first, then one line each
with open(CSV_FILE, mode="w", newline="", encoding="utf-8") as csv_out:
    csv_writer = csv.DictWriter(csv_out, fieldnames=csv_headers)
    csv_writer.writeheader()
    for record in data:
        csv_writer.writerow(record)

# report how many records were written
print(f"Wrote {len(data):,} records to file.")