In [1]:
#import modules and data, drop derived fields
import pandas as pd
import numpy as np
vdot = pd.read_csv(r"cleanrwis.csv")
droplist = ['Unnamed: 0', 'concrete_yn_ground', 'concrete_yn_surf',
       'concrete_yn_air', 'concrete_over_yn_ground', 'concrete_over_yn_surf',
       'concrete_over_yn_air', 'epoxy_yn_ground', 'epoxy_over_yn_surf',
       'epoxy_over_yn_air', 'shotcrete_yn_ground', 'shotcrete_yn_surf',
       'shotcrete_yn_air', 'silicone_yn_ground', 'silicone_yn_surf',
       'silicone_yn_air', 'coatings_yn_ground', 'coatings_over_yn_surf',
       'coatings_over_yn_air', 'polymer_yn_ground', 'polymer_over_yn_surf',
       'polymer_over_yn_air', 'avg_groundtemp', 'avg_surftemp', 'surfstate_ss1_score',
       'surfstate_ss2_score', 'surfstate_ss3_score', 'surfstate_ss4_score',
       'surfstate_master']
vdot = vdot.drop(droplist, axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
#filter out stations with less than 95% data coverage for airtemp and rainonoff
goodstations = []
okstations = []
for num in vdot.groupby('stationid'):
    if ((vdot[vdot['stationid'] == num[0]].airtemp.isna().sum()/len(vdot[vdot['stationid'] == num[0]]))) <= .05 and (vdot[vdot['stationid'] == num[0]].rainonoff.isna().sum()/len(vdot[vdot['stationid'] == num[0]])) <= .05:
        goodstations.append(num[0])
    if ((vdot[vdot['stationid'] == num[0]].airtemp.isna().sum()/len(vdot[vdot['stationid'] == num[0]]))) <= .2 and ((vdot[vdot['stationid'] == num[0]].airtemp.isna().sum()/len(vdot[vdot['stationid'] == num[0]]))) > .05 and (vdot[vdot['stationid'] == num[0]].rainonoff.isna().sum()/len(vdot[vdot['stationid'] == num[0]])) <= .2 and (vdot[vdot['stationid'] == num[0]].rainonoff.isna().sum()/len(vdot[vdot['stationid'] == num[0]])) > .05:
        okstations.append(num[0])
        




In [3]:
#only included the 37 stations with good data coverage
vdot = vdot[vdot['stationid'].isin(goodstations)]

In [4]:
#determine average hourly precipitation rate for vdot observations where rainonoff = on - for use with NOAA data
vdot[vdot['rainonoff'] == 1].rainperhr.mean()

nan

In [5]:
#convert time column to datetime and set to dataframe index (datetime index)
vdot.time = pd.to_datetime(vdot.time)
vdot = vdot.set_index('time')

In [6]:
#convert rainonoff field to integers

maskoff = vdot['rainonoff'].str.match('off', na=False)
maskon = vdot['rainonoff'].str.match('on', na=False)

vdot.loc[maskoff, 'rainonoff'] = 0
vdot.loc[maskon, 'rainonoff'] = 1

vdot['rainonoff'] = pd.to_numeric(vdot['rainonoff'])

In [7]:
#create flagging function to tag observations as good or bad based on rules:
    #if any of the following: 
        #observation has rain detected, temperature below 40 or above 95
            #day is flagged as bad (0)
def flagger(df):
    if (df['rainonoff'] == 1) or (df['airtemp'] < 40) or (df['airtemp'] > 95):
        return 0
    else:
        return 1
    
vdot['goodobs'] = vdot.apply(flagger, axis=1)

In [8]:
vdot['stationid'] = vdot['stationid'].apply(str)

In [14]:
#for each station:
# 1. resample to 6 hour time periods with sum function, getting total good observations per 6 hour period
# 2. divide sum of positive responses per period by total number of observations per period
# 3. 6 hour periods which consist of more than 50% "good" observations are "good"
# 4. iterate through rows and check for 6 hour buffer: if "good" 6 hr period is preceded by another, it gets a 1 - else 0
stationlist = []
for num in vdot.groupby('stationid'):
    vdotre = vdot[vdot['stationid'] == num[0]].resample('6H').sum()
    vdotrecount = vdot[vdot['stationid'] == num[0]].resample('6H').size()
    vdotre['sixhrprop'] = vdotre['goodobs'] / vdotrecount
    vdotre['sixhourgood'] = [1 if x > 0.5 else 0 for x in vdotre['sixhrprop']]
    sixhoursbuffgood = []
    for i in range(0, len(vdotre)):
        if vdotre.iloc[i].sixhourgood == 1 and vdotre.iloc[i-1].sixhourgood == 1:
            sixhoursbuffgood.append(1)
        else:
            sixhoursbuffgood.append(0)    
    vdotre['sixhoursbuffgood'] = sixhoursbuffgood
    vdotre['RWIS_CODE'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['RWIS_CODE'][0]
    vdotre['RWIS_KEY'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['RWIS_KEY'][0]
    vdotre['RWIS_NAME'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['RWIS_NAME'][0]
    vdotre['LATITUDE'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['LATITUDE'][0]
    vdotre['LONGITUDE'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['LONGITUDE'][0]
    vdotre['REGION_NAME'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['REGION_NAME'][0]
    vdotre['DISTRICT_NAME'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['DISTRICT_NAME'][0]
    vdotre['COUNTY_NAME'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['COUNTY_NAME'][0]
    vdotre['alt_ft'] = vdot[vdot['stationid'] == num[0]].reset_index().drop('time', axis=1)['alt_ft'][0]
    vdotre = vdotre[['goodobs', 'sixhrprop', 'sixhourgood', 'sixhoursbuffgood', 'RWIS_CODE', 'RWIS_KEY', 'RWIS_NAME', 'LATITUDE', 'LONGITUDE', 'REGION_NAME', 'DISTRICT_NAME', 'COUNTY_NAME', 'alt_ft']]
    #vdotre['sixhoursbuffgood'].to_csv('station'+num[0]+'.csv')
    stationlist.append(vdotre)
    print("station", num[0], vdotre.sixhoursbuffgood.value_counts())
out = pd.concat(stationlist)
out.to_csv('vdot_out.csv')



    
    

station 0 1    4357
0    2947
Name: sixhoursbuffgood, dtype: int64
station 1 1    5098
0    2198
Name: sixhoursbuffgood, dtype: int64
station 10 1    5263
0    2041
Name: sixhoursbuffgood, dtype: int64
station 11 1    5066
0    2238
Name: sixhoursbuffgood, dtype: int64
station 16 1    4551
0    2753
Name: sixhoursbuffgood, dtype: int64
station 17 0    4476
1    2828
Name: sixhoursbuffgood, dtype: int64
station 18 1    5298
0    2006
Name: sixhoursbuffgood, dtype: int64
station 2 1    4552
0    2752
Name: sixhoursbuffgood, dtype: int64
station 23 1    5296
0    2008
Name: sixhoursbuffgood, dtype: int64
station 24 0    5075
1    1861
Name: sixhoursbuffgood, dtype: int64
station 25 0    4666
1    2638
Name: sixhoursbuffgood, dtype: int64
station 26 0    3865
1    3439
Name: sixhoursbuffgood, dtype: int64
station 28 0    3941
1    3363
Name: sixhoursbuffgood, dtype: int64
station 30 0    4947
1    2357
Name: sixhoursbuffgood, dtype: int64
station 34 1    5032
0    2272
Name: sixhoursbuffgo