# Weather file - raw data process
- Main task: reading in raw weather data from two sources and combine all output into .csv format <br/>
- Data source: 
    1. NOAA Integrated Surface Data (ISH): https://www.ncdc.noaa.gov/isd/data-access
    2. National Solar Radiation Database (NSRD): https://rredc.nrel.gov/solar/old_data/nsrdb/
- Main output: 
    1. met data: df_temp_all, df_rh_all, df_precip_all
    2. solarad data: df_solrad_all

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import time 
import datetime

## 1. Hourly Meteorology Data

### Step 1.1: create function that calculates RH with temperature and dew point temperatur
This list is called: weafile_list

In [6]:
# Creating Clausius-Clapeyron function

def CC(temp, temp_dew):
    """
    function that calculates relative humidity with temperature and dew point temperature
    temperautre input units: ˚C
    """
    # constant parameters
    Tref = 273.15  # reference temperature
    Es_Tref = 6.11 # saturation vapor pressure at reference temperature
    Lv = 2.5e+06   # latent heat of vaporation (J/kg)
    Rv = 461       # gas constant for moist air (J/kg)
    
    # transformed temperature inputs
    Tair = temp + Tref
    Tdew = temp_dew + Tref
    
    # Clausius-Clapeyron relation
    es = Es_Tref*np.exp((Lv/Rv)*(1/Tref - 1/Tair))
    e = Es_Tref*np.exp((Lv/Rv)*(1/Tref - 1/Tdew))
    rh = round(e/es,4)
    
    return(rh)

### Step 1.2: read in individual site-year the weather files and parse out data needed
Parce out relevant info and store weather data into individual pd.DataArrays. <br/>
Relevant weather data include:
- date/time: need to convert into datetime 64 format
- temperature
- dew temperature
- RH
- precipitaiton


In [7]:
count_start = datetime.datetime.now()

# timing related settings
years = np.arange(1961, 1991) # timeframe in which we have weather data
dateparse = lambda dates: [pd.datetime.strptime(d, "%Y%m%d%H") for d in dates] # dateparsing method to be used in pd.read_fwf
season_start, season_end = '03-01-', '11-30-' # setting a pretty borad range for growing season

# setting up np.read_fwf arguments
colnames = ['time', 'temp', 'dew_temp', 'precip', 'precip_time', 'precip_depth', 'precip_condition', 'precip_quality', 'rh']
colspecs = [(15,25), (87,92), (93,98), (105,8193)]

# empty dataframes to store data from all site-years
df_temp_all = pd.DataFrame()
df_rh_all = pd.DataFrame()
df_precip_all = pd.DataFrame()

# reading in all weather data and storing as dataframe
for year in years:
    print(year) # output to track code progress
    times = pd.date_range(season_start + str(year), season_end + str(year), freq='1H')
    fnames = glob.glob('/home/disk/eos8/ach315/data/ISH/' + str(year) + '/*')
    
    # creating dataframes to store all site data for an individual year
    df_temp_sites = pd.DataFrame(index=times)
    df_rh_sites = pd.DataFrame(index=times)
    df_precip_sites = pd.DataFrame(index=times)
    
    for name in fnames:
        # WBAN site name 
        site_id = name.split('/')[-1].split('-')[-2]
        
        # read in individual files
        df = pd.read_fwf(name, names=colnames, colspecs=colspecs, header=None, index_col='time',
                         encoding='latin_1', dtype={'temp':int, 'precip':str}, parse_dates=True, date_parser=dateparse)
    
        # remove duplicated hours, keeping only the first measurement per hour
        df = df[df.index.duplicated(keep='first') == False]
        
        # add in missing time values (corrects for leap years) and keeps only growing season
        df = df.reindex(times, fill_value=np.nan)
        
        # finding precip data
        try:
            df.precip_time = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(0,2).astype(float)
            df.precip_depth = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(2, 6).astype(float)
            df.precip_condition = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(6,7).astype(float)
            df.precip_quality = df[df['precip'].str.find('AA1')!=-1]['precip'].str.split('AA1').str.get(1).str.slice(7,8).astype(float)
        except: 
            print(year, name)
                
        # replacing missing values (9999) with NANs 
        df.temp = df.temp.replace({9999: np.nan})
        
        # converting units 
        df.temp = df.temp/10
        df.dew_temp = df.dew_temp/10
        df.precip_depth = df.precip_depth/10
        
        # calculating RH through Clausius Clapeyron
        df.rh = CC(df.temp, df.dew_temp)*100

        # Combining weather data into individual dataframes
        df_temp = pd.DataFrame({site_id: df.temp}, index= times)
        df_rh = pd.DataFrame({site_id: df.rh}, index=times)
        df_precip = pd.DataFrame({site_id: df.precip_depth}, index=times)
        
        df_temp_sites = pd.concat([df_temp_sites, df_temp], axis= 1, sort=True)
        df_rh_sites = pd.concat([df_rh_sites, df_rh], axis=1, sort=True)
        df_precip_sites = pd.concat([df_precip_sites, df_precip], axis=1, sort=True)

    # combining all site-years data together
    df_temp_all = pd.concat([df_temp_all, df_temp_sites], sort=True)
    df_rh_all = pd.concat([df_rh_all, df_rh_sites], sort=True)
    df_precip_all = pd.concat([df_precip_all, df_precip_sites], sort=True)

count_end = datetime.datetime.now()
diff = count_end - count_start
print('run time:', diff)

1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1973 /home/disk/eos8/ach315/data/ISH/1973/725330-14827-1973
1973 /home/disk/eos8/ach315/data/ISH/1973/723530-13967-1973
1974
1974 /home/disk/eos8/ach315/data/ISH/1974/723721-23184-1974
1975
1975 /home/disk/eos8/ach315/data/ISH/1975/723676-23048-1975
1976
1976 /home/disk/eos8/ach315/data/ISH/1976/722446-93987-1976
1976 /home/disk/eos8/ach315/data/ISH/1976/724280-14821-1976
1976 /home/disk/eos8/ach315/data/ISH/1976/912120-41415-1976
1977
1977 /home/disk/eos8/ach315/data/ISH/1977/724280-14821-1977
1977 /home/disk/eos8/ach315/data/ISH/1977/726835-24230-1977
1978
1978 /home/disk/eos8/ach315/data/ISH/1978/723815-23161-1978
1979
1979 /home/disk/eos8/ach315/data/ISH/1979/723401-13963-1979
1980
1980 /home/disk/eos8/ach315/data/ISH/1980/723815-23161-1980
1981
1981 /home/disk/eos8/ach315/data/ISH/1981/723815-23161-1981
1982
1983
1983 /home/disk/eos8/ach315/data/ISH/1983/726430-14920-1983
1984
1985
1986
1987
1988
1989
1990
run time: 

### Step 1.3: Output the processed weather data into individual .csv files

In [9]:
df_temp_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/temp_all.csv')
df_precip_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/precip_all.csv')
df_rh_all.to_csv('/home/disk/eos8/ach315/upscale/weadata/rh_all.csv')

## 2. Hourly Solar Radiation Data

NSRD (National Solar Radiation Database): https://rredc.nrel.gov/solar/old_data/nsrdb/

### Step 2.1: read in solar radiation data

In [None]:
# JH 200504: depricated code, trying to update this below:
#file_list = pd.read_csv('/home/disk/eos8/ach315/data/ISH_NSRD/file_list_1961to1990.txt', header= None, squeeze= True)
file_list = glob.glob('/home/disk/eos8/ach315/data/ISH_NSRD/1961to1990/*')
file_list = file_list[7096:]
sites = pd.read_csv("/home/disk/eos8/ach315/data/ISH_NSRD/station_list_1961to1990.txt", header= None, squeeze= True, dtype= str)

for i in file_list:    
    solrad_file = pd.read_table("/home/disk/eos8/ach315/data/ISH_NSRD/1961to1990/" + 
                                str(i), squeeze= True, header= None)
    WBAN_id = i[0:5]
    solrad_all = list()
    timestamp_all = list()   
        
    for j in np.arange(1, len(solrad_file)):
        solrad = solrad_file[j][23:27]
        if solrad == 9999:
            solrad_all.append("NaN")
        else:
            solrad_all.append(solrad)
        
        year = "19" + solrad_file[j][1:3]
        month = solrad_file[j][4:6]
        day = solrad_file[j][7:9]
        hour = solrad_file[j][10:12]
        timestamp = dt.datetime(int(year), int(month), int(day), int(hour)-1) # hour must be in 0-23
        timestamp = pd.to_datetime(timestamp)        
        timestamp_all.append(timestamp)
                    
    df_solrad_all.loc[timestamp_all, WBAN_id] = solrad_all

## 200504 - working on modifying this now
200505: currently this code seems to run alright but is far too slow, need to figure out how to speed up <br/>
I think the problem might be that everytime you append to the big dataframe it takes a lot of time <br/>
I'm thinking to follow the strucutre I had with the met data, to read in solrad files in batches of years, <br/>
and concate data from each year after the previous one

In [None]:
%%time

# reading in raw solar radiation data
file_list = glob.glob('/home/disk/eos8/ach315/data/ISH_NSRD/1961to1990/*')

# setting up empty dataframe to populate
#times = pd.date_range('01-01-1961', '12-31-1990 23:00:00', freq='1H')
#sites = pd.read_csv('/home/disk/eos8/ach315/data/ISH_NSRD/station_list_1961to1990.txt', 
#                    header=None, dtype=str, squeeze=True)
#df_solrad_all = pd.DataFrame(index=times, columns=sites)
df_solrad_all = pd.DataFrame()

# read in individual data file
for i in file_list:
    print(i.split('/')[-1]) # tracking code progress
    year = int('19' + str(i.split('/')[-1].split('_')[-1].split('.')[0]))
    WBAN_id = str(i.split('/')[-1].split('_')[0])
    df = pd.read_fwf(i, skiprows=[0], header=None,
                     names = ['year', 'month', 'day', 'hour', 'solrad'], 
                     colspecs = [(1,3), (4,6), (7,9), (10,12), (23,27)])
    time = df.apply(lambda row: datetime.datetime(year, row['month'], row['day'], row['hour']-1), axis=1)
    solrad = df['solrad'] # Global Horizontal Radiation (Wh/m2)
    df_solrad = pd.DataFrame(solrad)
    df_solrad.columns = [WBAN_id]
    df_solrad.index = time
    df_solrad_all = df_solrad_all.append(df_solrad)

## 200506 - working on modifying this now
Going to try and update file reading strucutre to be similar to reading in met data <br/>
Read in batches by years.

In [None]:
%%time

# reading in raw solar radiation data
file_list = glob.glob('/home/disk/eos8/ach315/data/ISH_NSRD/1961to1990/*')

# setting up empty dataframe to populate
#times = pd.date_range('01-01-1961', '12-31-1990 23:00:00', freq='1H')
#sites = pd.read_csv('/home/disk/eos8/ach315/data/ISH_NSRD/station_list_1961to1990.txt', 
#                    header=None, dtype=str, squeeze=True)
#df_solrad_all = pd.DataFrame(index=times, columns=sites)
df_solrad_all = pd.DataFrame()

# read in individual data file
for i in file_list:
    print(i.split('/')[-1]) # tracking code progress
    year = int('19' + str(i.split('/')[-1].split('_')[-1].split('.')[0]))
    WBAN_id = str(i.split('/')[-1].split('_')[0])
    df = pd.read_fwf(i, skiprows=[0], header=None,
                     names = ['year', 'month', 'day', 'hour', 'solrad'], 
                     colspecs = [(1,3), (4,6), (7,9), (10,12), (23,27)])
    time = df.apply(lambda row: datetime.datetime(year, row['month'], row['day'], row['hour']-1), axis=1)
    solrad = df['solrad'] # Global Horizontal Radiation (Wh/m2)
    df_solrad = pd.DataFrame(solrad)
    df_solrad.columns = [WBAN_id]
    df_solrad.index = time
    df_solrad_all = df_solrad_all.append(df_solrad)

In [126]:
test = pd.merge(df_solrad_all, df_solrad, how='left', on=WBAN_id, left_index=True)

In [124]:
test

Unnamed: 0,03103,03812,03813,03820,03822,03856,03860,03870,03927,03928,...,94746,94814,94822,94823,94830,94846,94847,94849,94860,94910
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,
NaT,,,,,,,,,,,...,,,,,,,,,,


### testing testing 123

In [2]:
times = pd.date_range('01-01-1961', '12-31-1990 23:00:00', freq='1H')
sites = pd.read_csv('/home/disk/eos8/ach315/data/ISH_NSRD/station_list_1961to1990.txt', 
                    header=None, dtype=str, squeeze=True)
df_solrad_all = pd.DataFrame(index=times, columns=sites)

In [3]:
df_solrad_all.head()

Unnamed: 0,03103,03812,03813,03820,03822,03856,03860,03870,03927,03928,...,94746,94814,94822,94823,94830,94846,94847,94849,94860,94910
1961-01-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
1961-01-01 01:00:00,,,,,,,,,,,...,,,,,,,,,,
1961-01-01 02:00:00,,,,,,,,,,,...,,,,,,,,,,
1961-01-01 03:00:00,,,,,,,,,,,...,,,,,,,,,,
1961-01-01 04:00:00,,,,,,,,,,,...,,,,,,,,,,


In [4]:
file_list = glob.glob('/home/disk/eos8/ach315/data/ISH_NSRD/1961to1990/*')
i = file_list[0]
df = pd.read_fwf(i, skiprows=[0], header=None,
                 names = ['year', 'month', 'day', 'hour', 'solrad'], 
                 colspecs = [(1,3), (4,6), (7,9), (10,12), (23,27)])
year = int('19' + str(i.split('/')[-1].split('_')[-1].split('.')[0]))
WBAN_id = str(i.split('/')[-1].split('_')[0])
time = df.apply(lambda row: datetime.datetime(year, row['month'], row['day'], row['hour']-1), axis=1)
solrad = df['solrad'] # Global Horizontal Radiation (Wh/m2)
df_solrad = pd.DataFrame(solrad)
df_solrad.columns=[WBAN_id]
df_solrad.index=time
df_solrad.head()

Unnamed: 0,03103
1962-01-01 00:00:00,0
1962-01-01 01:00:00,0
1962-01-01 02:00:00,0
1962-01-01 03:00:00,0
1962-01-01 04:00:00,0


In [98]:
test = pd.concat([df_solrad_all, df_solrad], axis=1, join_axes=[df_solrad_all.index])

TypeError: concat() got an unexpected keyword argument 'join_axes'

In [None]:
# JH 190216
# Not sure why but all the last rows are 9999, and they were not converted into NaN
# I double checkted that this only happens in the very last row, but wans't able to fix the code yet.
# for not, here's a hot fix
df_solrad_all.iloc[-1, :] = "NaN"

#### Step 2.2: Output the processed solar radiation data into .csv file

In [None]:
df_solrad_all.to_csv("solrad_all.csv")

In [65]:
test = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/solrad_all.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,03103,03812,03813,03820,03822,03856,03860,03870,03927,...,94746,94814,94822,94823,94830,94846,94847,94849,94860,94910
0,1961-01-01 00:00:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1961-01-01 01:00:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1961-01-01 02:00:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1961-01-01 03:00:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1961-01-01 04:00:00,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
