In [102]:
from datetime import datetime
from uuid import uuid4

from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import requests

In [2]:
url = "https://www.blm.gov/or/resources/recreation/rogue/rogue_river.php"

In [98]:
dir_prj = Path.cwd().parent
dir_data = dir_prj / 'data'
dir_raw = dir_data / 'raw'

out_dir = dir_raw / 'rogue_retrieved_availability'

In [95]:
def get_row_values(tr):
    td_lst = tr.find_all('td')
    td_val_lst = [td.decode_contents() for td in td_lst]
    return td_val_lst

def get_table_values(tbl):
    val_lst = [get_row_values(tr) for tr in tbl.find_all('tr')]
    val_lst = [r for r in val_lst if len(r) == 3]
    val_lst = [r for r in val_lst if r != ['Day', 'Date', 'Spaces']]
    return val_lst

def process_table(tbl):

    mth_str = tbl.find('th').decode_contents()
    
    row_lst = get_table_values(tbl_lst[0])

    out_lst = [[datetime.strptime(f'{int(r[1]):02d} {mth_str} {yr}', '%d %B %Y').date().isoformat(), r[2]] for r in row_lst]
    
    out_df = pd.DataFrame(out_lst, columns=['launch_date', 'available_user_days'])
    
    return out_df

In [3]:
res = requests.get(url)

res

<Response [200]>

In [4]:
soup = BeautifulSoup(res.text, 'html.parser')

In [5]:
tbl_lst = [tbl for tbl in soup.find_all('table') if tbl.attrs['summary'].startswith("Available Float Space Openings")]

In [68]:
today = datetime.today()
yr = today.year

yr

2023

In [96]:
df = pd.concat([process_table(tbl) for tbl in tbl_lst])

df

Unnamed: 0,launch_date,available_user_days
0,2023-05-15,73
1,2023-05-16,79
2,2023-05-17,35
3,2023-05-18,21
4,2023-05-19,7
5,2023-05-20,26
0,2023-06-15,73
1,2023-06-16,79
2,2023-06-17,35
3,2023-06-18,21


In [97]:
partition_dir_nm = f"retrieval_datetime={today.isoformat()}"

partition_dir_nm

'retrieval_datetime=2023-05-09T06:20:42.765761'

In [100]:
out_pth = out_dir / partition_dir_nm

out_pth

PosixPath('/Users/joel5174/projects/rogue-river-spaces-monitor/data/raw/rogue_retrieved_availability/retrieval_datetime=2023-05-09T06:20:42.765761')

In [101]:
if not out_pth.exists():
    out_pth.mkdir(parents=True)

In [104]:
out_prt = out_pth / f'part-{uuid4().hex}.csv'

out_prt

PosixPath('/Users/joel5174/projects/rogue-river-spaces-monitor/data/raw/rogue_retrieved_availability/retrieval_datetime=2023-05-09T06:20:42.765761/part-a29ba852f9864afa9e9a4c026346a6c9.csv')

In [105]:
df.to_csv(out_prt)