In [25]:
import requests 
from bs4 import BeautifulSoup as bsoup
import datetime
import pandas as pd
import re
import numpy as np

In [2]:
littlefalls = '01646500' # DEPENDENT
senecacreek = '01645000' # not used
pointofrocks = '01638500'
edwards_ferry = '01644148' # not used
shepherdstown = '01618000'
pawpaw = '01610000' # not used
hancock = '01613000'
springfield = '01608500'
moorfield = '01608070' # not used

# list to hold variables
gage_list = [littlefalls, pointofrocks, shepherdstown, hancock, springfield]

In [3]:
def get_date_time():
    d = dict() # build dict
    
    d['today'] = str(datetime.datetime.now().date()) # get today date
    d['earlier_day'] = str(datetime.datetime.now().date() - datetime.timedelta(days=2)) # get today date
    
    
    d['time_now'] = str(datetime.datetime.now().time())[:-3] # get time, remove unused digits
    
    return d    

In [4]:
def clean_HTML_data(html_data):
    location_of_data_start_in_html = re.search('10s', html_data) # find location of start of data
    location_of_data_start_in_html = location_of_data_start_in_html.span()
    html_data = html_data[(location_of_data_start_in_html[1]+2):] # add 2 to skip some slashes
    html_data = html_data[2:]
    html_data = html_data.replace('\\t' , ',')
    html_data = html_data.split('\\n')
    html_data = [sub.split(',') for sub in html_data]
    html_data = html_data[:-1]
    return html_data

In [5]:
# https://waterservices.usgs.gov/nwis/iv/?sites=01646500&parameterCd=00065&startDT=2023-09-06T10:14:30.299-05:00&endDT=2023-09-13T10:14:30.299-05:00&siteStatus=all&format=rdb

def get_current_gage_heights(gage): # for gage in gage_list loop here
    date_time = get_date_time()
    url = 'https://waterservices.usgs.gov/nwis/iv/?sites=' + gage +\
        '&parameterCd=00065&startDT=' + date_time.get('earlier_day') + 'T' + date_time.get('time_now') + '-05:00' +\
        '&endDT=' + date_time.get('today') + 'T' + date_time.get('time_now') + '-05:00&siteStatus=all&format=rdb'                
    req = requests.get(url)
    if req.status_code == 200: # if webpage request is good...
        datahtml = str(req.content)
        if re.search('No sites found', datahtml): # if no gage or parameter found...
            print('No parameter found')
            df_func = pd.DataFrame()
        else:
            datahtml = clean_HTML_data(datahtml) # clean HTML function
            df_func = pd.DataFrame(datahtml, columns=['USGS','ID','Date','Tz', gage,'P']) # name columns
            df_func = df_func.drop(columns=['USGS','ID','Tz','P']) # drop unwanted variables
    else:
        print('request not good')
        
    return df_func
                

In [6]:
df = pd.DataFrame()
df = get_current_gage_heights(gage_list[0])
df = df.drop(df.columns[1], axis=1) # removes data and leave just Dates to merge onto
df

Unnamed: 0,Date
0,2023-10-04 11:45
1,2023-10-04 12:00
2,2023-10-04 12:15
3,2023-10-04 12:30
4,2023-10-04 12:45
...,...
180,2023-10-06 08:45
181,2023-10-06 09:00
182,2023-10-06 09:15
183,2023-10-06 09:30


In [18]:
df = pd.DataFrame()
df = get_current_gage_heights(gage_list[0])
df = df.drop(df.columns[1], axis=1) # removes data and leave just Dates to merge onto
df

#Loop to cycle through every listed gage and parameter measurement
for gage in gage_list: 
    print('Trying to merge '+ gage)
    temp_df = get_current_gage_heights(gage)
    if temp_df.empty:
        pass
    else:
        df = df.merge(temp_df, how='outer', left_on='Date', right_on='Date')

date_time = get_date_time()
print('Saving...')
df.to_csv(date_time.get('earlier_day') + '_throught_'+ date_time.get('today') + '_retrieve_' + '.csv')
print('Done')       

Trying to merge 01646500
Trying to merge 01638500
Trying to merge 01618000
Trying to merge 01613000
Trying to merge 01608500
Saving...
Done


In [None]:
# check all variables and see if they all have the the needed values
# if not, move the obs up one, until they all fit
# this would adjust time the prediction time as well since each shift back is 15mins

In [19]:
def reduce_to_needed_obs(df, shift):
    # shift needs positive value loop to retrieve data farther back in past
    gage_index_list = [[-69, -70, -71, -72], [-61], [-33], [-17]]
    data_list = [[],[],[],[]] # empty ist of list data will go into
    
    # Springfield gage
    data_list[0].append(df['01608500'].values[(gage_index_list[0][0]) - shift]) # 17hrs
    data_list[0].append(df['01608500'].values[(gage_index_list[0][1]) - shift])
    data_list[0].append(df['01608500'].values[(gage_index_list[0][2]) - shift])
    data_list[0].append(df['01608500'].values[(gage_index_list[0][3]) - shift])
    
    # Hancock gage
    data_list[1].append(df['01613000'].values[(gage_index_list[1][0]) - shift])
    
    # Shepardstown gage
    data_list[2].append(df['01618000'].values[(gage_index_list[2][0]) - shift])
    
    # Point of Rocks gage
    data_list[3].append(df['01638500'].values[(gage_index_list[3][0]) - shift])
    
    
    return data_list

In [22]:
predict_data_lol = reduce_to_needed_obs(df,0)
print(predict_data_lol)

[['1.16', '1.16', '1.16', '1.16'], ['2.91'], ['1.80'], ['1.19']]


In [35]:
# for creating np array for use in model predict

predict_input_17hr = np.array(predict_data_lol[0])
predict_input_15hr = np.array([predict_data_lol[0][0], predict_data_lol[1][0]])
predict_input_8hr = np.array([predict_data_lol[0][0], predict_data_lol[1][0], predict_data_lol[2][0]])
predict_input_4hr = np.array([predict_data_lol[0][0], predict_data_lol[1][0], predict_data_lol[2][0], predict_data_lol[3][0]])

In [36]:
predict_input_17hr
predict_input_15hr
predict_input_8hr
predict_input_4hr

array(['1.16', '2.91', '1.80', '1.19'], dtype='<U4')

In [27]:
# for converting data to array
y=np.array([np.array(xi) for xi in predict_data_lol])


  y=np.array([np.array(xi) for xi in predict_data_lol])


In [28]:
y

array([array(['1.16', '1.16', '1.16', '1.16'], dtype='<U4'),
       array(['2.91'], dtype='<U4'), array(['1.80'], dtype='<U4'),
       array(['1.19'], dtype='<U4')], dtype=object)