In [52]:
# This script collects 2 days' worth of data from the Little Falls, Seneca Creek and Point of Rocks USGS gages

import requests 
from bs4 import BeautifulSoup as bsoup
import datetime
import pandas as pd
import re

# Get dates for url.
# The clock is read once so the start and end dates are always exactly two
# days apart, even if the cell happens to run across midnight.
_run_date = datetime.datetime.now().date()
today = str(_run_date)
eairlier_day = str(_run_date - datetime.timedelta(days=2))


# Gage sites (site numbers sent in the `sites` query argument)
littlefalls = '01646500'
senecacreek = '01645000'
pointofrocks = '01644148'
# list to hold the gage site numbers, in the order they are queried
gage_list = [littlefalls, senecacreek, pointofrocks]

# Parameters (measurement devices on the gages), sent as `parameterCd`
gage_hght = '00065'
discharge = '00060'
turbidity = '63680'
dissolved_o2 = '00300'
nitrate = '99133'
ph = '00400'
# (sic) presumably "conductance"; the name is kept so existing references still work
cunductabce = '00095'
# NOTE(review): USGS code 00010 is normally water temperature in deg C, not deg F -- confirm
tempF = '00010'
# list to hold the parameter codes, in the order they are queried
parameter_list = [gage_hght, discharge, turbidity, dissolved_o2, nitrate, ph, cunductabce, tempF]

# df to hold data
df = pd.DataFrame()
# temporary df
temp_df = pd.DataFrame()



# Retrieve the USGS response (RDB text) for one gage/parameter, clean it, and return it as a df
def get_data_to_df(gage,parameter,eairlier_day,today):
    """Fetch one gage/parameter series from the USGS instantaneous-values service.

    Parameters
    ----------
    gage : str
        Site number placed in the ``sites`` query argument.
    parameter : str
        Parameter code placed in the ``parameterCd`` query argument.
    eairlier_day, today : str
        Start and end dates as text; a fixed ``T12:00:01.288-05:00``
        time-of-day and UTC offset is appended to each.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` and ``"<gage>-<parameter>"``. An empty DataFrame is
        returned when the HTTP status is not 200 or when the response
        contains the text ``No sites found``.
    """
    url = (
        'https://waterservices.usgs.gov/nwis/iv/?sites=' + gage
        + '&parameterCd=' + parameter
        + '&startDT=' + eairlier_day + 'T' + '12:00:01.288-05:00'
        + '&endDT=' + today + 'T12:00:01.288-05:00'
        + '&siteStatus=all&format=rdb'
    )
    req = requests.get(url)

    # Every failure path returns this empty frame. Previously a non-200 status
    # left the result unassigned and the final return raised UnboundLocalError.
    df_func = pd.DataFrame()

    if req.status_code != 200: # webpage request failed
        print('nope')
        return df_func

    # str(bytes) keeps tabs/newlines as the literal escapes clean_HTML_data expects.
    datahtml = str(req.content)
    if re.search('No sites found', datahtml): # if no gage or parameter found...
        print('No parameter found')
        return df_func

    rows = clean_HTML_data(datahtml) # split the response text into row field-lists
    df_func = pd.DataFrame(rows, columns=['USGS','ID','Date','Tz', gage+'-'+parameter,'P']) # name columns
    # Keep only the timestamp and the measured value.
    return df_func.drop(columns=['USGS','ID','Tz','P'])


# function to clean up the raw response text (USGS RDB, tab-separated) into rows
def clean_HTML_data(html_data):
    r"""Split the stringified response body into a list of row field-lists.

    ``html_data`` is the ``str()`` of a bytes payload, so tabs and newlines
    appear as the two-character escapes ``\t`` and ``\n`` and the text ends
    with the closing quote of the bytes repr.

    The data rows start right after the first ``10s`` token (the end of the
    column-format line) and the escaped newline that follows it.

    Parameters
    ----------
    html_data : str
        Stringified response body.

    Returns
    -------
    list[list[str]]
        One list of field strings per data row.

    Raises
    ------
    ValueError
        If the ``10s`` marker is not present in ``html_data``.
    """
    marker = re.search('10s', html_data) # find location of start of data
    if marker is None:
        raise ValueError("could not find the '10s' format marker that precedes the data rows")

    # Skip the marker plus the 2-character escaped newline that follows it.
    # No further trimming: an extra [2:] here used to cut "US" off the first
    # row's leading "USGS" field.
    body = html_data[marker.end() + 2:]

    # Split on the escaped tab directly so a comma inside a field cannot
    # produce an extra column.
    rows = [line.split('\\t') for line in body.split('\\n')]

    # The piece after the final escaped newline is only the closing quote.
    return rows[:-1]


# Build one wide table: a single Date column plus one value column for every
# gage/parameter series that came back non-empty.
#
# The first non-empty series becomes the base table. This replaces a separate
# "seed" request that fetched the first series twice and raised IndexError
# (df.columns[1]) whenever that first series was unavailable.
df = pd.DataFrame()
have_base = False

#Loop to cycle through every listed gage and parameter measurement
for gage in gage_list:
    for parameter in parameter_list:
        print('Trying to merge '+gage+'-'+parameter)
        temp_df = get_data_to_df(gage,parameter,eairlier_day,today)
        if temp_df.empty:
            # Nothing returned for this gage/parameter pair; skip it.
            continue
        if not have_base:
            df = temp_df
            have_base = True
        else:
            # Inner join on the timestamp: only timestamps present in every
            # merged series are kept.
            df = df.merge(temp_df, on='Date', how='inner')






Trying to merge 01646500-00065
Trying to merge 01646500-00060
Trying to merge 01646500-63680
Trying to merge 01646500-00300
Trying to merge 01646500-99133
Trying to merge 01646500-00400
Trying to merge 01646500-00095
Trying to merge 01646500-00010
Trying to merge 01645000-00065
Trying to merge 01645000-00060
Trying to merge 01645000-63680
No parameter found
Trying to merge 01645000-00300
No parameter found
Trying to merge 01645000-99133
No parameter found
Trying to merge 01645000-00400
No parameter found
Trying to merge 01645000-00095
No parameter found
Trying to merge 01645000-00010
No parameter found
Trying to merge 01644148-00065
Trying to merge 01644148-00060
No parameter found
Trying to merge 01644148-63680
No parameter found
Trying to merge 01644148-00300
No parameter found
Trying to merge 01644148-99133
No parameter found
Trying to merge 01644148-00400
No parameter found
Trying to merge 01644148-00095
No parameter found
Trying to merge 01644148-00010
No parameter found


In [19]:
for gage in gage_list: 
    for parameter in parameter_list:
        

01646500
00065
01646500
00060
01646500
63680
01646500
00300
01646500
99133
01646500
00400
01646500
00095
01646500
00010F
01645000
00065
01645000
00060
01645000
63680
01645000
00300
01645000
99133
01645000
00400
01645000
00095
01645000
00010F
01644148
00065
01644148
00060
01644148
63680
01644148
00300
01644148
99133
01644148
00400
01644148
00095
01644148
00010F


In [51]:
# Display the merged table (a Date column plus one column per gage-parameter series).
df

Unnamed: 0,Date,01646500-00065,01646500-00060,01646500-63680,01646500-00300,01646500-99133,01646500-00400,01646500-00095,01646500-00010,01645000-00065,01645000-00060,01644148-00065
0,2023-02-14 12:15,3.61,6760,8.1,12.5,1.26,8.3,314,6.0,2.44,161,4.52
1,2023-02-14 12:30,3.61,6760,7.8,12.5,1.26,8.3,314,6.1,2.44,161,4.52
2,2023-02-14 12:45,3.61,6760,8.9,12.5,1.25,8.3,314,6.1,2.45,166,4.52
3,2023-02-14 13:00,3.61,6760,7.6,12.6,1.25,8.3,314,6.3,2.45,166,4.52
4,2023-02-14 13:15,3.61,6760,7.6,12.7,1.25,8.3,314,6.4,2.45,166,4.52
...,...,...,...,...,...,...,...,...,...,...,...,...
187,2023-02-16 11:00,3.82,8550,3.8,11.6,1.36,8.6,320,8.5,2.27,106,5.20
188,2023-02-16 11:15,3.82,8550,3.9,11.6,1.37,8.5,321,8.5,2.27,106,5.20
189,2023-02-16 11:30,3.82,8550,4.1,11.6,1.37,8.5,321,8.6,2.28,108,5.20
190,2023-02-16 11:45,3.82,8550,3.8,11.7,1.37,8.5,321,8.6,2.29,111,5.20


In [107]:
# Scratch test: fetch one week of a single series and parse it by hand.
# The site and parameter are named here so the value column is labelled from
# the same codes the URL requests. The label used to come from `gage` and
# `parameter`, which only exist if an earlier loop left them behind.
test_gage = '01646500'
test_parameter = '00300'
url = (
    'https://waterservices.usgs.gov/nwis/iv/?sites=' + test_gage
    + '&parameterCd=' + test_parameter
    + '&startDT=2023-01-20T10:19:11.967-05:00'
    + '&endDT=2023-01-27T10:19:11.967-05:00'
    + '&siteStatus=all&format=rdb'
)
#29 lines? find the 29th \n location and use that to trim
dftest = pd.DataFrame()
templist = []
req = requests.get(url)
# NOTE(review): 1427 is a hard-coded byte offset for the header of this one
# response; it will be wrong if the header length changes -- confirm.
datahtml = str(req.content[1427:]) # remove first characters
datahtml = datahtml[2:] # drop the leading b' that str(bytes) adds
datahtml = datahtml.replace('\\t' , ',') # escaped tabs -> commas
datahtml = datahtml.split('\\n') # one string per escaped-newline-terminated row
datahtml = [sub.split(',') for sub in datahtml]
datahtml = datahtml[:-1] # last piece is only the closing quote of the bytes repr
temp_dftest = pd.DataFrame(datahtml, columns=['USGS','ID','Date','Tz', test_gage+'_'+test_parameter,'P'])
# templist = temp_dftest.iloc[:,4]
# templist

In [100]:
# Append templist to dftest as additional column(s) (axis=1).
# NOTE(review): pd.concat expects Series/DataFrame objects; the cells that
# initialise templist set it to an empty list, so this presumably relies on
# templist having been reassigned to a column of temp_dftest first -- confirm.
dftest = pd.concat([dftest, templist], axis=1, copy=False)

In [13]:
# Display the hand-parsed test frame. The recorded NameError shows this cell
# was run in a kernel where temp_dftest had not been defined yet.
temp_dftest

NameError: name 'temp_dftest' is not defined

In [146]:
# Scratch test: find where the data rows begin by searching the stringified
# response for the '10s' token, then show everything after it.
url = (
    'https://waterservices.usgs.gov/nwis/iv/?sites=01646500&parameterCd=00300'
    '&startDT=2023-01-20T10:19:11.967-05:00'
    '&endDT=2023-01-27T10:19:11.967-05:00'
    '&siteStatus=all&format=rdb'
)
#29 lines? find the 29th \n location and use that to trim
templist = []
dftest = pd.DataFrame()
req = requests.get(url)
htmlstr = str(req.content)

# (start, end) character positions of the first '10s' in the response text.
location_of_data_start_in_html = re.search('10s', htmlstr).span()
# Skip two more characters (the escaped newline after the token).
data_start = location_of_data_start_in_html[1] + 2
htmlstr[data_start:]




'USGS\\t01646500\\t2023-01-20 10:30\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 10:45\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 11:00\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 11:15\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 11:30\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 11:45\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 12:00\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 12:15\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 12:30\\tEST\\t12.7\\tP\\nUSGS\\t01646500\\t2023-01-20 12:45\\tEST\\t12.8\\tP\\nUSGS\\t01646500\\t2023-01-20 13:00\\tEST\\t12.8\\tP\\nUSGS\\t01646500\\t2023-01-20 13:15\\tEST\\t12.9\\tP\\nUSGS\\t01646500\\t2023-01-20 13:30\\tEST\\t12.9\\tP\\nUSGS\\t01646500\\t2023-01-20 13:45\\tEST\\t12.9\\tP\\nUSGS\\t01646500\\t2023-01-20 14:00\\tEST\\t13.0\\tP\\nUSGS\\t01646500\\t2023-01-20 14:15\\tEST\\t13.0\\tP\\nUSGS\\t01646500\\t2023-01-20 14:30\\tEST\\t13.2\\tP\\nUSGS\\t01646500\\t2023-01-20 14:45\\tEST\\t13.3\\tP\\nUSGS\\t01646500\\t2023-01-2

In [126]:
# Display the full stringified response body (header lines included).
htmlstr

