In [1]:
import bs4, waybackpy, requests, pytz, html2csv

In [2]:
arcapi = waybackpy.WaybackMachineCDXServerAPI(
    url="https://aspr.hhs.gov/SNS/Pages/JYNNEOS-Distribution.aspx",
    user_agent="Jynneos scraper [https://github.com/jlumbroso/us-2022-jynneos-distribution-by-day]"
)

In [3]:
x = arcapi.oldest()

In [4]:
x.archive_url

'https://web.archive.org/web/20220706223613/https://aspr.hhs.gov/SNS/Pages/JYNNEOS-Distribution.aspx'

In [5]:
s = bs4.BeautifulSoup(requests.get(x.archive_url).content)

In [6]:
s.find("div", {"class": "table-responsive"}).find("table")

<table class="table table-bordered table-condensed table-sticky">
<thead>
<tr>
<th scope="col">Jurisdiction</th>
<th scope="col">Total Distribution</th>
</tr>
</thead>
<tbody>
<tr class="lightblue">
<th class="lightblue" scope="row">
<strong>All Jurisdictions (patient courses)</strong></th>
<td class="lightblue">
<b>41,520</b></td>
</tr>
<tr>
<td align="left" style="text-align: left;">Alaska</td>
<td align="right">20</td>
</tr>
<tr>
<td align="left" style="text-align: left;">Arizona</td>
<td align="right">118</td>
</tr>
<tr>
<td align="left" style="text-align: left;">California</td>
<td align="right">12,156</td>
</tr>
<tr>
<td align="left" style="text-align: left;">Chicago</td>
<td align="right">5,409</td>
</tr>
<tr>
<td align="left" style="text-align: left;">Colorado</td>
<td align="right">1,340</td>
</tr>
<tr>
<td align="left" style="text-align: left;">Connecticut</td>
<td align="right">10</td>
</tr>
<tr>
<td align="left" style="text-align: left;">Delaware</td>
<td align="right">16</

In [7]:
import locale

STRIP_NUMERIC_CHARS = "*\u200b"

locale.setlocale(locale.LC_NUMERIC, "en_US.UTF-8")

def convert_if_numeric(s):
    try:
        return locale.atoi(s.strip(STRIP_NUMERIC_CHARS))
    except ValueError:
        return s.strip()

def make_table_numeric(list_of_lists):

    convd_list_of_lists = [
        [
            convert_if_numeric(cell)
            for cell in row
        ]
        for row in list_of_lists
    ]

    return convd_list_of_lists

In [8]:
singl_fields

NameError: name 'singl_fields' is not defined

In [14]:
BASE_URL = "https://aspr.hhs.gov/SNS/Pages/JYNNEOS-Distribution.aspx"

def convert_table(list_of_list):
    [
        [ cell for cell in row ]
        for row in list_of_list
    ]

SUBSTITUTIONS = {
    'Allocation 2022-06-29 To 2022-07-08': 'Allocation 2022-06-29 to 2022-07-08',
    'Allocation 2022-07-08 To 2022-07-15': 'Allocation 2022-07-08 to 2022-07-15',
    'Allocation 2022-07-16 To 2022-07-29': 'Allocation 2022-07-16 to 2022-07-29',
    'Allocation 2022-07-29 To Onwards': 'Allocation 2022-07-29 to onwards',
    'Allocation July 29' : 'Allocation 2022-07-29 to onwards',
    'AllocationJune 28-July 27' : 'Allocation 2022-06-28 to 2022-07-27',
    'Jurisdiction': 'Jurisdiction',
    'Total    Allocation': 'Total allocation',
    'Total Allocation': 'Total allocation',
    'Total Distribution': 'Total distribution',
    'Total Distribution (Doses)\u200b': 'Total distribution',
    'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b': 'Total shipped as of 2022-07-27',
    'Total Requested': 'Total requested',
    'Total Requested as of July 27 at 8AM': 'Total requested as of 2022-07-27',
    'Total Requestedas of Aug 03 2022, 12pm': 'Total requested as of 2022-08-03',
    'Total Requestedas of Aug 08 2022, 12pm': 'Total requested as of 2022-08-08',
    'Total Requestedas of Aug 10 2022, 12pm': 'Total requested as of 2022-08-10',
    'Total Requestedas of Aug 12\u200b 2022, 12pm': 'Total requested as of 2022-08-12',
    'Total Shippedas of Aug 03 2022, 12pm': 'Total shipped as of 2022-08-03',
    'Total Shippedas of Aug 08 2022, 12pm': 'Total shipped as of 2022-08-08',
    'Total Shippedas of Aug 10 2022, 12pm': 'Total shipped as of 2022-08-10',
    'Total Shippedas of Aug 12 2022, 12\u200bpm': 'Total shipped as of 2022-08-12',
    'Total Shipped\xa0(Doses)\u200b': 'Total shipped',
}

def fetch_jynneos_table_near(year, month, day):
    archive = waybackpy.WaybackMachineCDXServerAPI(
        url=BASE_URL,
        user_agent="Jynneos scraper [https://github.com/jlumbroso/us-2022-jynneos-distribution-by-day]"
    )

    near = archive.near(year=year, month=month, day=day)
    
    r = requests.get(near.archive_url)
    s = bs4.BeautifulSoup(r.content)
    
    table_element = s.find("div", {"class": "table-responsive"}).find("table")

    raw_tables = Converter().convert_to_list(str(table_element))
    
    if len(raw_tables) == 0:
        return

    raw_table = raw_tables[0]
    num_table = make_table_numeric(raw_table)

    header_row = num_table[0]
    body_rows = num_table[1:]
    entries = [
        dict(zip(header_row, row))
        for row in body_rows
    ]

    # clean entries
    for entry in entries:

        # normalize the "All Jurisdiction" caption
        if "all " in entry["Jurisdiction"].lower():
            entry["Jurisdiction"] = "All"

        # substitute some key names
        for old, new in SUBSTITUTIONS.items():
            if old == new:
                continue
            if old in entry:
                entry[new] = entry[old]
                del entry[old]
    
    return entries


In [None]:
table_element = bs4.BeautifulSoup(requests.get("https://web.archive.org/web/20220706223613/https://aspr.hhs.gov/SNS/Pages/JYNNEOS-Distribution.aspx").content).find("div", {"class": "table-responsive"}).find("table")

In [None]:
make_table_numeric(Converter().convert_to_list(str(table_element))[0])

[['Jurisdiction', 'Total Distribution'],
 ['All Jurisdictions (patient courses)', 41520],
 ['Alaska', 20],
 ['Arizona', 118],
 ['California', 12156],
 ['Chicago', 5409],
 ['Colorado', 1340],
 ['Connecticut', 10],
 ['Delaware', 16],
 ['District of Columbia', 1706],
 ['Florida', 503],
 ['Georgia', 238],
 ['Hawaii', 557],
 ['Illinois', 126],
 ['Indiana', 520],
 ['Iowa', 26],
 ['Kentucky', 70],
 ['Los Angeles', 7346],
 ['Louisiana', 20],
 ['Maine', 40],
 ['Maryland', 158],
 ['Massachusetts', 2204],
 ['Michigan', 24],
 ['Minnesota', 40],
 ['Mississippi', 10],
 ['Missouri', 4],
 ['Nebraska', 50],
 ['Nevada', 4],
 ['New    Jersey', 300],
 ['New York', 31],
 ['New    York City', 7169],
 ['North Carolina', 60],
 ['Ohio', 4],
 ['Oklahoma', 36],
 ['Oregon', 219],
 ['Pennsylvania', 98],
 ['Puerto    Rico', 40],
 ['Rhode Island', 20],
 ['South    Carolina', 34],
 ['Utah', 20],
 ['Texas', 166],
 ['Virginia', 512],
 ['Washington', 68],
 ['Wisconsin', 28]]

In [11]:
import script
import importlib
importlib.reload(script)

<module 'script' from '/Users/jlumbroso/Programming/scrapers/us-2022-jynneos-distribution-by-day/script.py'>

In [15]:
start = (2022, 7, 1)
year, month, day = start
prev_data = None

data = dict()


KEY = "Jurisdiction"

while not script.in_future(year, month, day):
    new_data = fetch_jynneos_table_near(year, month, day)
    
    if prev_data is None or [i for i in new_data if i not in prev_data] != []:

        for entry in new_data:

            # skip corrupt entries
            if KEY not in entry:
                continue
            
            data_key = entry[KEY]
            data[data_key] = data.get(data_key, dict())

            for key, value in entry.items():
                if key == KEY:
                    continue
                
                data[data_key]["{} ({}-{:02}-{:02})".format(key, year, month, day)] = value
            

    year, month, day = script.next_day(year, month, day)
    prev_data = new_data
    

In [17]:
data["New Jersey"]

{'Total distribution (2022-07-14)': 2813,
 'Total distribution (2022-07-18)': 2813,
 'Total allocation (2022-07-22)': 5449,
 'Total requested (2022-07-22)': 5509,
 'Total shipped (2022-07-22)': 5509,
 'Total allocation (2022-07-27)': 5449,
 'Total requested (2022-07-27)': 5509,
 'Total shipped (2022-07-27)': 5509,
 'Allocation 2022-07-29 to onwards (2022-07-29)': 14520,
 'Allocation 2022-06-28 to 2022-07-27 (2022-07-29)': 5449,
 'Total allocation (2022-07-29)': 19969,
 'Total deployed as of 2022-07-27 (2022-07-29)': 5509,
 'Total requested (2022-07-29)': 5509,
 'Allocation 2022-07-29 to onwards (2022-08-02)': 14520,
 'Allocation 2022-06-28 to 2022-07-27 (2022-08-02)': 5449,
 'Total allocation (2022-08-02)': 19969,
 'Total deployed as of 2022-07-27 (2022-08-02)': 5509,
 'Total requested as of 2022-07-27 (2022-08-02)': 5509,
 'Allocation 2022-06-29 to 2022-07-08 (2022-08-05)': 347,
 'Allocation 2022-07-08 to 2022-07-15 (2022-08-05)': 2406,
 'Allocation 2022-07-16 to 2022-07-29 (2022-08-0

In [None]:
fetch_jynneos_table_near(2022, 7, 31)

[{'Jurisdiction': 'All',
  'Allocation 2022-07-29 to onwards': 736620,
  'Allocation 2022-06-28 to 2022-07-27': 333218,
  'Total allocation': 1069838,
  'Total deployed as of 2022-07-27': 336710,
  'Total requested': 337016},
 {'Jurisdiction': 'Alabama',
  'Allocation 2022-07-29 to onwards': 4640,
  'Allocation 2022-06-28 to 2022-07-27': 1283,
  'Total allocation': 5923,
  'Total deployed as of 2022-07-27': 1283,
  'Total requested': 1283},
 {'Jurisdiction': 'Alaska',
  'Allocation 2022-07-29 to onwards': 480,
  'Allocation 2022-06-28 to 2022-07-27': 120,
  'Total allocation': 600,
  'Total deployed as of 2022-07-27': 102,
  'Total requested': 102},
 {'Jurisdiction': 'Arizona',
  'Allocation 2022-07-29 to onwards': 10820,
  'Allocation 2022-06-28 to 2022-07-27': 3495,
  'Total allocation': 14315,
  'Total deployed as of 2022-07-27': 3613,
  'Total requested': 3613},
 {'Jurisdiction': 'Arkansas',
  'Allocation 2022-07-29 to onwards': 2300,
  'Allocation 2022-06-28 to 2022-07-27': 724,
 

In [None]:
for i in range(1,32):
    print(i, ":")
    data = fetch_jynneos_table_near(2022, 7, i)
    print(data[:3])

1 :
[{'Jurisdiction': 'All', 'Total distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total distribution': 20}, {'Jurisdiction': 'Arizona', 'Total distribution': 118}]
2 :
[{'Jurisdiction': 'All', 'Total distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total distribution': 20}, {'Jurisdiction': 'Arizona', 'Total distribution': 118}]
3 :
[{'Jurisdiction': 'All', 'Total distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total distribution': 20}, {'Jurisdiction': 'Arizona', 'Total distribution': 118}]
4 :
[{'Jurisdiction': 'All', 'Total distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total distribution': 20}, {'Jurisdiction': 'Arizona', 'Total distribution': 118}]
5 :
[{'Jurisdiction': 'All', 'Total distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total distribution': 20}, {'Jurisdiction': 'Arizona', 'Total distribution': 118}]
6 :
[{'Jurisdiction': 'All', 'Total distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total distribution': 20}, {'Jurisdiction': 'Arizona', 'Total distrib

In [None]:
fields = list()
for i in range(1,32):
    print(i, ":")
    data = fetch_jynneos_table_near(2022, 7, i)
    keys = list(data[0].keys())
    fields += keys
    print(fetch_jynneos_table_near(2022, 7, i)[:3])
print(fields)

1 :
[{'Jurisdiction': 'All Jurisdictions (patient courses)', 'Total Distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total Distribution': 20}, {'Jurisdiction': 'Arizona', 'Total Distribution': 118}]
2 :
[{'Jurisdiction': 'All Jurisdictions (patient courses)', 'Total Distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total Distribution': 20}, {'Jurisdiction': 'Arizona', 'Total Distribution': 118}]
3 :
[{'Jurisdiction': 'All Jurisdictions (patient courses)', 'Total Distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total Distribution': 20}, {'Jurisdiction': 'Arizona', 'Total Distribution': 118}]
4 :
[{'Jurisdiction': 'All Jurisdictions (patient courses)', 'Total Distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total Distribution': 20}, {'Jurisdiction': 'Arizona', 'Total Distribution': 118}]
5 :
[{'Jurisdiction': 'All Jurisdictions (patient courses)', 'Total Distribution': 41520}, {'Jurisdiction': 'Alaska', 'Total Distribution': 20}, {'Jurisdiction': 'Arizona', 'Total Distribution': 1

In [None]:
for i in range(1,32):
    print(i, ":")
    data = fetch_jynneos_table_near(2022, 8, i)
    keys = list(data[0].keys())
    fields += keys
    print(data[:3])
print(fields)

1 :
[{'Jurisdiction': 'All    Jurisdictions', 'AllocationJune 28-July 27': 333218, 'Allocation July 29': '736,620*\u200b', 'Total    Allocation': 1069838, 'Total Requested': 337016, 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b': 336710}, {'Jurisdiction': 'Alabama', 'AllocationJune 28-July 27': 1283, 'Allocation July 29': 4640, 'Total    Allocation': 5923, 'Total Requested': 1283, 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b': 1283}, {'Jurisdiction': 'Alaska', 'AllocationJune 28-July 27': 120, 'Allocation July 29': 480, 'Total    Allocation': 600, 'Total Requested': 102, 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b': 102}]
2 :
[{'Jurisdiction': 'All    Jurisdictions', 'AllocationJune 28-July 27': 333218, 'Allocation July 29': '736,620*\u200b', 'Total    Allocation': 1069838, 'Total Requested as of July 27 at 8AM': 337016, 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b': 336710}, {'Jurisdiction': 'Alabama', 'AllocationJune 28-July

In [None]:
singl_fields = set(fields)

In [None]:
singl_fields

{'Allocation 2022-06-29 To 2022-07-08',
 'Allocation 2022-07-08 To 2022-07-15',
 'Allocation 2022-07-16 To 2022-07-29',
 'Allocation 2022-07-29 To Onwards',
 'Allocation July 29',
 'AllocationJune 28-July 27',
 'Jurisdiction',
 'Total    Allocation',
 'Total Allocation',
 'Total Distribution',
 'Total Distribution (Doses)\u200b',
 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b',
 'Total Requested',
 'Total Requested as of July 27 at 8AM',
 'Total Requestedas of Aug 03 2022, 12pm',
 'Total Requestedas of Aug 08 2022, 12pm',
 'Total Requestedas of Aug 10 2022, 12pm',
 'Total Requestedas of Aug 12\u200b 2022, 12pm',
 'Total Shippedas of Aug 03 2022, 12pm',
 'Total Shippedas of Aug 08 2022, 12pm',
 'Total Shippedas of Aug 10 2022, 12pm',
 'Total Shippedas of Aug 12 2022, 12\u200bpm',
 'Total Shipped\xa0(Doses)\u200b'}

In [None]:
for i in range(1,32):
    print(i, ":")
    print(fetch_jynneos_table_near(2022, 8, i)[:3])

1 :
[['Jurisdiction', 'AllocationJune 28-July 27', 'Allocation July 29', 'Total    Allocation', 'Total Requested', 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b'], ['All    Jurisdictions', '333,218', '736,620*\u200b', '1,069,838', '337,016', '336,710'], ['Alabama', '1,283', '4,640', '5,923', '1,283', '1,283']]
2 :
[['Jurisdiction', 'AllocationJune 28-July 27', 'Allocation July 29', 'Total    Allocation', 'Total Requested as of July 27 at 8AM', 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b'], ['All    Jurisdictions', '333,218', '736,620*\u200b', '1,069,838', '337,016', '336,710'], ['Alabama', '1,283', '4,640', '5,923', '1,283', '1,283']]
3 :
[['Jurisdiction', 'AllocationJune 28-July 27', 'Allocation July 29', 'Total    Allocation', 'Total Requested as of July 27 at 8AM', 'Total Doses    Shipped or Deployedas of July 27, 8AM\u200b'], ['All    Jurisdictions', '333,218', '736,620*\u200b', '1,069,838', '337,016', '336,710'], ['Alabama', '1,283', '4,640', '5,923',

In [None]:
open("test.csv","w").write(html2csv.Converter().convert(s.find("div", {"class": "table-responsive"}).find("table").prettify())[0][0])

713

In [None]:
Converter().convert_to_list(s.find("div", {"class": "table-responsive"}).find("table").prettify())

[[['Jurisdiction', 'Total Distribution'],
  ['All Jurisdictions (patient courses)', '41,520'],
  ['Alaska', '20'],
  ['Arizona', '118'],
  ['California', '12,156'],
  ['Chicago', '5,409'],
  ['Colorado', '1,340'],
  ['Connecticut', '10'],
  ['Delaware', '16'],
  ['District of Columbia', '1,706'],
  ['Florida', '503'],
  ['Georgia', '238'],
  ['Hawaii', '557'],
  ['Illinois', '126'],
  ['Indiana', '520'],
  ['Iowa', '26'],
  ['Kentucky', '70'],
  ['Los Angeles', '7,346'],
  ['Louisiana', '20'],
  ['Maine', '40'],
  ['Maryland', '158'],
  ['Massachusetts', '2,204'],
  ['Michigan', '24'],
  ['Minnesota', '40'],
  ['Mississippi', '10'],
  ['Missouri', '4'],
  ['Nebraska', '50'],
  ['Nevada', '4'],
  ['New    Jersey', '300'],
  ['New York', '31'],
  ['New    York City', '7,169'],
  ['North Carolina', '60'],
  ['Ohio', '4'],
  ['Oklahoma', '36'],
  ['Oregon', '219'],
  ['Pennsylvania', '98'],
  ['Puerto    Rico', '40'],
  ['Rhode Island', '20'],
  ['South    Carolina', '34'],
  ['Utah', '20'

In [13]:
import csv
import io

import bs4


def detect_engine():
    try:
        import lxml
    except ImportError:
        engine = 'html.parser'
    else:
        engine = 'lxml'
    return engine


class Converter:

    def __init__(self, **kwargs):
        engine = kwargs.get('engine')
        if engine is None:
            self.engine = detect_engine()
        else:
            self.engine = engine
        self.params = kwargs

    def convert_to_list(self, html_doc):
        soup = bs4.BeautifulSoup(html_doc, self.engine)
        tables = []
        for table_element in soup.find_all('table'):
            table = []
            for tr in table_element.find_all('tr'):
                row = [''.join(cell.stripped_strings)
                       for cell in tr.find_all(['td', 'th'])]
                table.append(row)
            tables.append(table)
        return tables

    def convert_to_csv(self, html_doc):
        tables = self.convert_to_list(html_doc)
        output = []
        for table_num, table in enumerate(tables):
            csv_string = io.StringIO()
            csv_writer = csv.writer(csv_string)
            for row in table:
                csv_writer.writerow(row)
            table_attrs = dict(num=table_num)
            output.append((csv_string.getvalue(), table_attrs))
        return output

    def convert(self, html_doc):
        return self.convert_to_csv(html_doc)

