In [1]:
from datetime import datetime
from uuid import uuid4

from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import requests

In [2]:
url = "https://www.blm.gov/or/resources/recreation/rogue/rogue_river.php"

In [3]:
dir_prj = Path.cwd().parent
dir_data = dir_prj / 'data'
dir_raw = dir_data / 'raw'

out_dir = dir_raw / 'rogue_retrieved_availability'

In [10]:
def get_row_values(tr):
    td_lst = tr.find_all('td')
    td_val_lst = [td.decode_contents() for td in td_lst]
    return td_val_lst

def get_table_values(tbl):
    val_lst = [get_row_values(tr) for tr in tbl.find_all('tr')]
    val_lst = [r for r in val_lst if len(r) == 3]
    val_lst = [r for r in val_lst if r != ['Day', 'Date', 'Spaces']]
    return val_lst

def process_table(tbl):

    mth_str = tbl.find('th').decode_contents()
    
    row_lst = get_table_values(tbl)

    out_lst = [[datetime.strptime(f'{int(r[1]):02d} {mth_str} {yr}', '%d %B %Y').date().isoformat(), r[2]] for r in row_lst]
    
    out_df = pd.DataFrame(out_lst, columns=['launch_date', 'available_user_days'])
    
    return out_df

In [5]:
res = requests.get(url)

res

<Response [200]>

In [6]:
soup = BeautifulSoup(res.text, 'html.parser')

In [11]:
tbl_lst = [tbl for tbl in soup.find_all('table') if tbl.attrs['summary'].startswith("Available Float Space Openings")]

In [12]:
today = datetime.today()
yr = today.year

yr

2023

In [13]:
df = pd.concat([process_table(tbl) for tbl in tbl_lst])

df

Unnamed: 0,launch_date,available_user_days
0,2023-05-31,64
0,2023-06-01,12
1,2023-06-02,22
2,2023-06-03,10
3,2023-06-04,11
4,2023-06-08,22
5,2023-06-11,1
6,2023-06-15,1


In [14]:
partition_dir_nm = f"retrieval_datetime={today.isoformat()}"

partition_dir_nm

'retrieval_datetime=2023-05-31T09:41:16.071591'

In [15]:
out_pth = out_dir / partition_dir_nm

out_pth

PosixPath('/Users/joel5174/projects/rogue-river-spaces-monitor/data/raw/rogue_retrieved_availability/retrieval_datetime=2023-05-31T09:41:16.071591')

In [16]:
if not out_pth.exists():
    out_pth.mkdir(parents=True)

In [17]:
out_prt = out_pth / f'part-{uuid4().hex}.csv'

out_prt

PosixPath('/Users/joel5174/projects/rogue-river-spaces-monitor/data/raw/rogue_retrieved_availability/retrieval_datetime=2023-05-31T09:41:16.071591/part-4242dc8a611d435cb337d09eb1adac5c.csv')

In [19]:
df.to_csv(out_prt, index=False)