In [None]:
# USAF station metadata that accompanies the solar radiation dataset
df_sites = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/stations_info_9110.csv')
df_sites.head()  # not displayed: only the last expression of a cell is rendered

# keep only the class 1 stations (see NSRDB manual p.7-8 for the class definitions)
df_class1 = df_sites.loc[df_sites['CLASS'] == 1]
sites_class1 = list(df_class1['USAF'])  # USAF IDs of the class 1 quality stations

In [None]:
%%time

# Build hourly growing-season temperature, relative humidity and precipitation
# tables (rows = hours, columns = USAF site IDs) from the ISH fixed-width files.
# Relies on names defined outside this cell: np, pd, glob, sites_class1 and CC.

# timing related settings
years = np.arange(1991, 2011) # timeframe in which we have weather data (1991-2010)
# This cell now parses timestamps with pd.to_datetime; the lambda is kept in case
# another cell still refers to it (TODO confirm and remove if unused).
dateparse = lambda dates: [datetime.datetime.strptime(d, "%Y%m%d%H") for d in dates]
season_start, season_end = '03-01-', '11-30-' # setting a pretty broad range for growing season

# setting up pd.read_fwf arguments
colnames = ['time', 'temp', 'temp_quality', 'dew_temp', 'dtemp_quality', 'precip', 
            'precip_time', 'precip_depth', 'precip_quality', 'precip_perhr', 'rh']
colspecs = [(15,25), (87,92), (92,93), (93,98), (98,99), (105,8193)]
# Only the first len(colspecs) names are read from the file; the remaining names
# are derived columns created below. pd.read_fwf requires names and colspecs to
# have the same length.
raw_colnames = colnames[:len(colspecs)]

ish_dir = '/home/disk/eos8/ach315/data/ISH/' # root directory of the yearly ISH folders
bad_quality = ['3', '7'] # quality codes treated as erroneous (data manual p.26)

# per-year tables are collected here and concatenated once after the loop
# (growing a DataFrame with pd.concat inside the loop copies it every iteration)
temp_by_year = []
rh_by_year = []
precip_by_year = []

# reading in all weather data and storing as dataframe
for year in years:
    print(year) # output to track code progress
    times = pd.date_range(season_start + str(year), season_end + str(year) + ' 23:00:00', freq='1H')

    # one column per site is collected for this year; the empty frame pins the
    # hourly index and keeps pd.concat valid even when no site has data
    temp_cols = [pd.DataFrame(index=times)]
    rh_cols = [pd.DataFrame(index=times)]
    precip_cols = [pd.DataFrame(index=times)]

    for site in sites_class1:
        # selecting for file associated with specified site
        file = glob.glob(ish_dir + str(year) + '/' + str(site) + '-*')

        if len(file) == 0: # when specified site does not exist for current year
            continue # move on to the next site
        elif len(file) == 1:
            name = file[0]
        else: # USAF site has more than one WBAN ID, resulting in more than one file
            print('choosing from multiple files: ', file)
            # prefer the file whose WBAN is listed as 99999; if there is none,
            # fall back to the first file in sorted order instead of raising IndexError
            preferred = glob.glob(ish_dir + str(year) + '/' + str(site) + '-99999-*')
            name = preferred[0] if preferred else sorted(file)[0]

        # reading in raw weather data as fixed-width data format
        # quality codes are forced to str: if they were type-inferred, an all-digit
        # column would become numeric and never compare equal to '3' or '7'
        df = pd.read_fwf(name, names=raw_colnames, colspecs=colspecs, header=None,
                         encoding='latin_1',
                         dtype={'time': str, 'temp': int, 'temp_quality': str,
                                'dtemp_quality': str, 'precip': str})
        df['time'] = pd.to_datetime(df['time'], format='%Y%m%d%H')
        df = df.set_index('time')

        # remove duplicated hours, keeping only the first measurement per hour
        df = df[~df.index.duplicated(keep='first')]

        # add in missing time values (corrects for leap years) and keeps only growing season
        df = df.reindex(times, fill_value=np.nan)

        # float dtype so that NaN can be written into these columns below
        df['temp'] = df['temp'].astype(float)
        df['dew_temp'] = df['dew_temp'].astype(float)

        # finding precip data: text that follows the first 'ADDAA1' marker
        # (NaN when the marker is absent or the hour is missing)
        aa1 = df['precip'].str.split('ADDAA1').str.get(1)
        # characters 0-1: period in hours, 2-5: depth, 7: quality code;
        # anything that is not a number (e.g. a truncated group) becomes NaN
        df['precip_time'] = pd.to_numeric(aa1.str.slice(0, 2), errors='coerce')
        df['precip_depth'] = pd.to_numeric(aa1.str.slice(2, 6), errors='coerce')
        df['precip_quality'] = aa1.str.slice(7, 8)

        # filtering out weather data based on quality code (data manual p.26)
        # removing data with code 3 (Erroneous) or 7 (Erroneous, data originate from an NCEI data source)
        # - temp
        df.loc[df['temp_quality'].isin(bad_quality), 'temp'] = np.nan
        # - dew temp
        df.loc[df['dtemp_quality'].isin(bad_quality), 'dew_temp'] = np.nan
        # - precip: blank the extracted depth (not the raw text column) so that
        #   flagged values do not reach precip_perhr
        df.loc[df['precip_quality'].isin(bad_quality), 'precip_depth'] = np.nan

        # replacing missing-value sentinels with NaNs
        df['temp'] = df['temp'].replace({9999: np.nan})
        df['dew_temp'] = df['dew_temp'].replace({9999: np.nan})
        df['precip_time'] = df['precip_time'].replace({99: np.nan})
        df['precip_depth'] = df['precip_depth'].replace({9999: np.nan})

        # calculating hourly precip depth
        df['precip_perhr'] = df['precip_depth'] / df['precip_time']
        # a precip_time of 0 produces an infinite precip_perhr
        df['precip_perhr'] = df['precip_perhr'].replace({np.inf: np.nan})

        # converting units (raw values are divided by 10)
        df['temp'] = df['temp'] / 10
        df['dew_temp'] = df['dew_temp'] / 10
        df['precip_perhr'] = df['precip_perhr'] / 10

        # calculating RH through Clausius Clapeyron
        df['rh'] = CC(df['temp'], df['dew_temp']) * 100
        # report site-years that contain RH above 100 (any such value makes the
        # sum of those values exceed 100, which is what was tested before)
        if (df['rh'] > 100).any():
            print(site, year)

        # one named column per site for each variable
        temp_cols.append(df['temp'].rename(site))
        rh_cols.append(df['rh'].rename(site))
        precip_cols.append(df['precip_perhr'].rename(site))

    # combining all sites of this year side by side on the shared hourly index
    df_temp_sites = pd.concat(temp_cols, axis=1)
    df_rh_sites = pd.concat(rh_cols, axis=1)
    df_precip_sites = pd.concat(precip_cols, axis=1)

    temp_by_year.append(df_temp_sites)
    rh_by_year.append(df_rh_sites)
    precip_by_year.append(df_precip_sites)

# combining all site-years data together; columns are sorted explicitly so the
# column order does not depend on which sites were present in which year
df_temp_all = pd.concat(temp_by_year, sort=True).sort_index(axis=1)
df_rh_all = pd.concat(rh_by_year, sort=True).sort_index(axis=1)
df_precip_all = pd.concat(precip_by_year, sort=True).sort_index(axis=1)

#df_temp_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/temp_9110_class1.csv')
#df_precip_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/precip_9110_class1.csv')
#df_rh_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/rh_9110_class1.csv')