# Weather file - gap filling and formatting
- Main tasks: 
    1. gap fill weather file
    2. combine met and solrad info into single files for each site-year
    3. address timezone issue
    4. format weather file into MAIZSIM-readable format
- Data source: 
    1. weadata/**temp_all.csv**
    2. weadata/**rh_all.csv**
    3. weadata/**precip_all.csv**
    4. weadata/**solrad_all.csv**
- Main output: 
    - weadata/data/control/**site_year.txt** - weather file for all site-years

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import xarray as xr
import datetime
import time 
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.io.shapereader as shpreader

from palettable.colorbrewer.sequential import OrRd_6
from palettable.colorbrewer.sequential import YlGn_9
from palettable.colorbrewer.sequential import YlGnBu_8
from palettable.colorbrewer.sequential import RdPu_5

#from funcs import find_zone, utc_to_local, CC_VPD

### Step 0: Reading in temperature, precip, RH & solar radiation data:
- Main input:
    - /weadata/**temp_all.csv**
    - /weadata/**rh_all.csv**
    - /weadata/**precip_all.csv**
    - /weadata/**solrad_all.csv**
- Main output: 
    - **df_temp, df_rh, df_precip, df_solrad**

#### 0.1 Read in weather data 1961-1990

In [297]:
# read in individual weather data for 1961-1990
# (first CSV column becomes the row index, every other column is one station)
df_temp_6190 = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/temp_6190.csv', index_col=0)
df_rh_6190 = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/rh_6190.csv', index_col=0)
df_precip_6190 = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/precip_6190.csv', index_col=0)
df_solrad_6190 = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/solrad_6190.csv', index_col=0)

# re-index solar radiation data to only include growing season
# (rows are aligned to the temperature table; columns are left untouched)
df_solrad_6190 = df_solrad_6190.reindex(df_temp_6190.index)

# check that all met elements aligned - dataframe shape should match
print(df_temp_6190.shape)
print(df_rh_6190.shape)
print(df_precip_6190.shape)
print(df_solrad_6190.shape)

# convert station ID header from WBAN to USAF (in order to make continuous with 1991-2010)
df_stations = pd.read_csv('/home/disk/eos8/ach315/data/ISH_NSRD/stations_wban_usaf.csv', header=None, dtype='str')
df_stations.columns = ['WBAN', 'USAF']
wban_to_usaf = dict(zip(df_stations['WBAN'], df_stations['USAF']))


def _relabel_wban_to_usaf(df):
    """Return `df` with each WBAN column label replaced by its USAF ID, columns sorted.

    Every label is looked up individually, so the result does not depend on the
    row order of the station table or on the column order of `df`.
    Raises KeyError when a column has no entry in the station table.
    """
    unknown = [wban for wban in df.columns if wban not in wban_to_usaf]
    if unknown:
        raise KeyError('no USAF ID found for WBAN station(s): {}'.format(unknown))
    relabelled = df.copy()
    relabelled.columns = [wban_to_usaf[wban] for wban in df.columns]
    return relabelled.sort_index(axis=1)


# kept for reference: the WBAN headers and their USAF equivalents, in matching order
sites_wban = list(df_temp_6190.columns)
sites_usaf = [wban_to_usaf[wban] for wban in sites_wban if wban in wban_to_usaf]

# assign new USAF headers
df_temp_6190 = _relabel_wban_to_usaf(df_temp_6190)
df_rh_6190 = _relabel_wban_to_usaf(df_rh_6190)
df_precip_6190 = _relabel_wban_to_usaf(df_precip_6190)
df_solrad_6190 = _relabel_wban_to_usaf(df_solrad_6190)

(198000, 237)
(198000, 237)
(198000, 237)
(198000, 237)


#### 0.2 Read in weather data 1991-2010

In [294]:
## load the 1991-2010 (class 1) weather tables; the first CSV column becomes the row index
df_temp_9110, df_rh_9110, df_precip_9110, df_solrad_9110 = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/home/disk/eos8/ach315/upscale/weadata/temp_9110_class1.csv',
        '/home/disk/eos8/ach315/upscale/weadata/rh_9110_class1.csv',
        '/home/disk/eos8/ach315/upscale/weadata/precip_9110_class1.csv',
        '/home/disk/eos8/ach315/upscale/weadata/solrad_9110_class1.csv',
    )
]

# restrict solar radiation to the growing-season rows present in the temperature table
# (only rows are aligned here; the column sets are not reconciled)
df_solrad_9110 = df_solrad_9110.reindex(index=df_temp_9110.index)

# sanity check: the four tables are expected to have matching shapes
for met_table in (df_temp_9110, df_rh_9110, df_precip_9110, df_solrad_9110):
    print(met_table.shape)

(132000, 241)
(132000, 241)
(132000, 241)
(132000, 242)


#### 0.3 Stitch together weather data

In [298]:
# stack the 1961-1990 and 1991-2010 tables row-wise; stations present in only one
# period are kept (outer join) and padded with NaN, then columns are sorted by station ID
df_temp = pd.concat([df_temp_6190, df_temp_9110], axis=0, join='outer').sort_index(axis=1)
df_rh = pd.concat([df_rh_6190, df_rh_9110], axis=0, join='outer').sort_index(axis=1)
df_precip = pd.concat([df_precip_6190, df_precip_9110], axis=0, join='outer').sort_index(axis=1)
df_solrad = pd.concat([df_solrad_6190, df_solrad_9110], axis=0, join='outer').sort_index(axis=1)

# all four combined tables should report the same shape
for combined in (df_temp, df_rh, df_precip, df_solrad):
    print(combined.shape)

# one-off export of the combined tables (left disabled)
#df_temp.to_csv('/home/disk/eos8/ach315/upscale/weadata/temp_all.csv')
#df_rh.to_csv('/home/disk/eos8/ach315/upscale/weadata/rh_all.csv')
#df_precip.to_csv('/home/disk/eos8/ach315/upscale/weadata/precip_all.csv')
#df_solrad.to_csv('/home/disk/eos8/ach315/upscale/weadata/solrad_all.csv')

(330000, 274)
(330000, 274)
(330000, 274)
(330000, 274)


### Step 1. Figure out valid site-years that can be gap-filled
Selecting for site-years based on **crit_hrs** - consecutive missing hours of datapoints within raw data
- Main input: **df_temp, df_precip, df_solrad**
- Main output: **finalist**

In [300]:
%%time

# input variables for loop
datasets = list([df_temp, df_precip, df_solrad]) # weather datasets to process
                                                 # df_rh is based off df_temp, so no need to evaluate 

finalist = list([[], [], []]) # final lists to store processed output
                              # order: [0]-temp, [1]-precip, [2]-solrad

years = np.arange(1961, 2011) # years
growseason_start = '-03-01 00:00:00'
growseason_end = '-11-30 23:00:00' 

crit_hrs = 2 # critical hrs of missing data

# loop through temp, precip & solrad dataset to pick out usable site-years
for i in np.arange(len(datasets)):
    dataset = datasets[i]
    siteyears_all = list()
    sites = dataset.columns
    
    for j in years:
        start_time = str(j) + growseason_start
        end_time = str(j) + growseason_end
        siteyears = list()
        
        for k in sites:
            df = dataset.loc[start_time:end_time, k] 
            df = pd.DataFrame(df)
            df['group'] = df.notnull().astype(int) # df.notnull() returns TRUE or FALSE, 
                                                   # .astype(int) turns TRUE into 1, and FALSE into 0
            df['group'] = df.group.cumsum() # calculating cumulative sum 
            df = df[df.iloc[:,0].isnull()] # selecting out individual timesteps that have missing data
            nans_list = df.groupby('group')['group'].count() # counts the number of consecutive NANs 
            if nans_list[nans_list > crit_hrs].shape[0] == 0:
                use_siteyear = str(j) + '_' + str(k)
                siteyears.append(use_siteyear) # only record site-years that have fewer consecutive NANs than the critical value set

            # The logic of this section of code:
            # If weadata is absent (df.notnull == FALSE) you get a return of 0, thus,
            # df.group.cumsum() would not change the cumulative sum when encountering NANs since you're only adding 0.
            # By doing so, you end up with repeated cumsum() values when you have multiple NANs following it.
            # cumsum() values are documented in the 'group' column.
            # groupby('group') allows you to then group the cumsum() values into groups and document their counts. 
            # If a specific cumsum() values has counts greater than 1, that means there were NAN values that followed it.
            # The code then evaluates whether there were consecutive NAN values that exceeded the designated critical values.
            # If so, that site-years is excluded. 
        
        siteyears_all.extend(siteyears)
    
    finalist[i] = siteyears_all

CPU times: user 3min 32s, sys: 89 ms, total: 3min 32s
Wall time: 3min 32s


### Step 2. Compare usable site-years for temp, precip & solrad and find the common site-years
- Main input: **finalist**
- Main output: **siteyears**

In [167]:
# usable site-years per variable, from a Step 1 run with crit_hrs = 0
siteyears_temp, siteyears_precip, siteyears_solrad = finalist[0], finalist[1], finalist[2]
for label, usable in (('temp:', siteyears_temp),
                      ('precip:', siteyears_precip),
                      ('solrad:', siteyears_solrad)):
    print(label, len(usable))

# site-years that are usable for all three variables, in sorted order
siteyears = sorted(set(siteyears_temp).intersection(siteyears_precip, siteyears_solrad))
siteyears_crithr0 = siteyears

print('overlapping siteyears:', len(siteyears))

temp: 2331
precip: 1854
solrad: 11937
overlapping siteyears: 1673


In [165]:
# usable site-years per variable, from a Step 1 run with crit_hrs = 1
siteyears_temp, siteyears_precip, siteyears_solrad = finalist[0], finalist[1], finalist[2]
for label, usable in (('temp:', siteyears_temp),
                      ('precip:', siteyears_precip),
                      ('solrad:', siteyears_solrad)):
    print(label, len(usable))

# site-years that are usable for all three variables, in sorted order
siteyears = sorted(set(siteyears_temp).intersection(siteyears_precip, siteyears_solrad))
siteyears_crithr1 = siteyears

print('overlapping siteyears:', len(siteyears))

temp: 4298
precip: 3154
solrad: 11937
overlapping siteyears: 3021


In [292]:
# usable site-years per variable, from a Step 1 run with crit_hrs = 2
siteyears_temp, siteyears_precip, siteyears_solrad = finalist[0], finalist[1], finalist[2]
for label, usable in (('temp:', siteyears_temp),
                      ('precip:', siteyears_precip),
                      ('solrad:', siteyears_solrad)):
    print(label, len(usable))

# site-years that are usable for all three variables, in sorted order
siteyears = sorted(set(siteyears_temp).intersection(siteyears_precip, siteyears_solrad))
siteyears_crithr2 = siteyears

print('overlapping siteyears:', len(siteyears))

temp: 6096
precip: 4397
solrad: 11937
overlapping siteyears: 4225


In [171]:
# usable site-years per variable, from a Step 1 run with crit_hrs = 3
siteyears_temp, siteyears_precip, siteyears_solrad = finalist[0], finalist[1], finalist[2]
for label, usable in (('temp:', siteyears_temp),
                      ('precip:', siteyears_precip),
                      ('solrad:', siteyears_solrad)):
    print(label, len(usable))

# site-years that are usable for all three variables, in sorted order
siteyears = sorted(set(siteyears_temp).intersection(siteyears_precip, siteyears_solrad))
siteyears_crithr3 = siteyears

print('overlapping siteyears:', len(siteyears))

temp: 6254
precip: 4564
solrad: 11937
overlapping siteyears: 4398


### Step 3. Store basic info of valid site-years
- Main input: **siteyears_crithr2**
    - crithr2 seems to be the best interval given the balance between gaining siteyears vs. limiting gap filling
- Main output: 
    - weadata/**siteyears_crithr2.csv** - site-year info for data filtered with crithr = 2
    - weadata/**site_nyears_crithr2.csv** - info on how many years of wea data each site has

In [8]:
siteyears = siteyears_crithr2

# what are the valid site-years?
# Each entry has the form '<year>_<site>'. Splitting on the first underscore
# (instead of slicing fixed character positions) keeps the whole site ID
# whatever its length.
years = list()
sites = list()

for siteyear in siteyears:
    year, site = siteyear.split('_', 1)
    years.append(year)
    sites.append(site)

df_siteyears = pd.DataFrame({'site': sites, 'year': years},
                            columns=['site', 'year'])
df_siteyears = df_siteyears.sort_values(['site', 'year'])
final_sites = sorted(set(df_siteyears.site))  # unique site IDs, in a reproducible order

# how many years of data do each site have?
# one grouped count replaces a full scan of df_siteyears for every site;
# groupby sorts by site, and reset_index gives a fresh 0..n-1 index
df_site_nyears = df_siteyears.groupby('site').size().reset_index(name='years')
site_nyears = list(df_site_nyears['years'])  # same order as final_sites

# writing out info as .csv
#df_siteyears.to_csv('../weadata/siteyears_crithr2.csv')
#df_site_nyears.to_csv("../weadata/site_nyears_crithr2.csv")

### Step 4: Filter sites based on planting area & irrigation

#### 4.1 Select sites with valid weather data
- Main output: 
    - **df_sites_info**
    - **site_summary.csv**

In [7]:
# per-site year counts and station metadata (IDs are read as strings)
df_site_nyears = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/site_nyears_crithr2.csv',
                             index_col=0, dtype={'site': str})
df_stations_9110 = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/stations_info_9110.csv',
                               dtype={'USAF': str}, usecols=[0,1,3,4,8,9,10])

# keep only stations that appear in the site-year table, and give the columns short names
has_siteyears = df_stations_9110.USAF.isin(df_site_nyears.site)
df_sites_info = df_stations_9110[has_siteyears]
df_sites_info.columns = ['site', 'class', 'station', 'state', 'tzone', 'lat', 'lon']

# attach the number of valid years to each station
df_sites_info = df_sites_info.merge(df_site_nyears, on='site')

# drop stations from Alaska, Guam, Hawaii & Puerto Rico
excluded_states = ['AK', 'GU', 'HI', 'PR']
df_sites_info = df_sites_info[~df_sites_info.state.isin(excluded_states)]

# final station list, re-numbered from 0
df_sites_info = df_sites_info.reset_index(drop=True)

#### 4.2 Include planting area & irrigation info
- Main output: 
    - df_obs: obs_areairri.csv
    - df_summary: site_summary.csv

- Census data of overall planting area & irrigation info

In [3]:
# Census tables are whitespace-delimited text without a header row.
# The separator is written as a raw string: '\s' in a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
state = pd.read_csv('/home/disk/eos3/aswann/Shared/Data/irrigated_area/stateID.txt', header=None, sep=r'\s+')
county = pd.read_csv('/home/disk/eos3/aswann/Shared/Data/irrigated_area/countyID.txt', header=None, sep=r'\s+')
irri = pd.read_csv('/home/disk/eos3/aswann/Shared/Data/irrigated_area/irr_area_acres.txt', header=None, sep=r'\s+')
area = pd.read_csv('/home/disk/eos3/aswann/Shared/Data/irrigated_area/crop_area_acres.txt', header=None, sep=r'\s+')

# the ID files are used through their first row only
state = state.iloc[0,:]
county = county.iloc[0,:]

# raw data includes data from 4 censuses that show data of 1997, 2002, 2007 & 2012
# we average data from all 4 censuses
irri = irri.mean(axis=1)
area = area.mean(axis=1)

# percentage of cropped area that is irrigated
# NOTE(review): entries whose averaged crop area is 0 yield inf/NaN here - confirm this is acceptable
df_census = pd.DataFrame({'state': state, 'county': county, 'perct_irri': irri/area*100})

- NASS data of maize planting area & yield for individual years 1961-2005

In [4]:
# All NASS inputs are whitespace-delimited text without a header row. The separator
# is a raw string because '\s' in a normal string literal is an invalid escape sequence.

# Read in state & county id
state_id = pd.read_csv("/home/disk/eos8/ach315/data/NASS_cropdata/stateID.txt", sep=r"\s+", header=None)
state_id = state_id.transpose()
state_id.columns = ["state_id"]
county_id = pd.read_csv("/home/disk/eos8/ach315/data/NASS_cropdata/countyID.txt", sep=r"\s+", header=None)
county_id = county_id.transpose()
county_id.columns = ["county_id"]

# Read in lat & lon information
lat_county = pd.read_csv("/home/disk/eos8/ach315/data/NASS_cropdata/lat_county.txt", sep=r"\s+", header=None)
lat_county = lat_county.transpose()
lat_county.columns = ["lat"]
lon_county = pd.read_csv("/home/disk/eos8/ach315/data/NASS_cropdata/lon_county.txt", sep=r"\s+", header=None)
lon_county = lon_county.transpose()
lon_county.columns = ["lon"]

# Read in maize yield; columns are labelled 1910..2014
cornyield = pd.read_csv("/home/disk/eos8/ach315/data/NASS_cropdata/corn_yield.txt", sep=r"\s+", header=None)
cornyield.columns = np.arange(1910, 2015)

# Reading in maize area; same column labels, then reshaped to one long 'area' column
cornarea = pd.read_csv("/home/disk/eos8/ach315/data/NASS_cropdata/corn_area.txt", sep=r"\s+", header=None)
cornarea.columns = np.arange(1910, 2015)
cornarea = cornarea.melt(var_name='year', value_name='area')
cornarea = cornarea.drop(['year'], axis=1)

# concat all info and melt dataframe
df = pd.concat([state_id, county_id, lat_county, lon_county, cornyield], axis=1)
df = pd.melt(df, id_vars=['state_id', 'county_id', 'lat', 'lon'], value_name='yield', var_name="year")
# yield and area were melted from identically labelled tables, so their rows
# line up position by position
df = pd.concat([df, cornarea], axis=1)

# subsetting data for year 1961-2005
# a single boolean mask keeps the original row order and avoids re-copying the
# growing result once per year
years = np.arange(1961,2006)
df_nass = df[df['year'].isin(years)].reset_index(drop=True)
df_nass['year'] = df_nass['year'].astype(int)
df_nass.head()
#df_nass.to_csv('/home/disk/eos8/ach315/upscale/weadata/obs_nass.csv')

Unnamed: 0,state_id,county_id,lat,lon,year,yield,area
0,1.0,1.0,32.462991,-86.709691,1961,2.008561,6758.2607
1,1.0,3.0,30.787262,-87.712913,1961,2.761771,9024.5038
2,1.0,5.0,31.870087,-85.383129,1961,1.945793,11452.621
3,1.0,7.0,33.072877,-87.112698,1961,2.259631,2092.228
4,1.0,9.0,33.98835,-86.613622,1961,2.322398,10064.547


- Select maximum planting area for each county within the 1961-2005 time period
- Merge planting area info with census data of percentage irrigated

In [5]:
# per county: the maximum of lat, lon and planting area over all years in df_nass
county_keys = ['state_id', 'county_id']
df_nass_group = df_nass.groupby(county_keys)[['lat', 'lon', 'area']].max().reset_index()

# attach the census irrigation percentage; counties without census data are kept (left join)
df_obs = pd.merge(df_nass_group, df_census, how='left',
                  left_on=county_keys, right_on=['state', 'county'])
df_obs.head()
#df_obs.to_csv('/home/disk/eos8/ach315/upscale/weadata/obs_areairri.csv')

Unnamed: 0,state_id,county_id,lat,lon,area,state,county,perct_irri
0,1.0,1.0,32.462991,-86.709691,6758.2607,1.0,1.0,2.112183
1,1.0,3.0,30.787262,-87.712913,19748.69,1.0,3.0,8.51041
2,1.0,5.0,31.870087,-85.383129,11452.621,1.0,5.0,4.945534
3,1.0,7.0,33.072877,-87.112698,2092.228,1.0,7.0,0.624115
4,1.0,9.0,33.98835,-86.613622,10064.547,1.0,9.0,0.940536


- Finds the 5 nearest NASS observations sites next to each simulation site with weather station data
- Averages the planting area & irrigation percentage from all the nearest 5 locations
- Assigns averaged value to each simulation site

In [8]:
sites = df_sites_info.site
areas = []
perct_irris = []

# county coordinates as plain arrays, extracted once instead of once per site
obs_lat = df_obs.lat.values
obs_lon = df_obs.lon.values

# Walk the station rows directly: no per-site lookup (and no float() on a
# one-row Series, which is deprecated and fails for a duplicated site ID).
for lat, lon in zip(df_sites_info.lat.astype(float), df_sites_info.lon.astype(float)):
    # NOTE(review): distance is the straight-line difference in degrees of lat/lon,
    # not a great-circle distance - confirm this is adequate for ranking neighbours
    distance = pd.Series(np.sqrt((lat - obs_lat)**2 + (lon - obs_lon)**2))

    # select the five nearest locations and average for
    # cropping area & irrigation percentage
    row = list(distance.nsmallest(5).index)   # positions within df_obs
    nearest = df_obs.iloc[row]
    areas.append(nearest.area.mean())
    perct_irris.append(nearest.perct_irri.mean())

# add planting area & irrigation info for filtering purposes
# (df_filter shares df_sites_info's index so the column-wise concat lines up row
#  for row even when that index is not 0..n-1)
df_filter = pd.DataFrame({'area': areas, 'perct_irri': perct_irris}, index=df_sites_info.index)
df_summary = pd.concat([df_sites_info, df_filter], axis=1)
df_summary.head()
#df_summary.to_csv('/home/disk/eos8/ach315/upscale/weadata/site_summary.csv', index=False)

Unnamed: 0,site,class,station,state,tzone,lat,lon,years,area,perct_irri
0,722010,1,KEY WEST INTL ARPT,FL,-5,24.55,-81.75,25,101.17157,60.802507
1,722020,1,MIAMI INTL AP,FL,-5,25.817,-80.3,29,374.3348,61.901377
2,722030,1,WEST PALM BEACH INTL ARPT,FL,-5,26.683,-80.1,19,647.49803,78.435473
3,722050,1,ORLANDO INTL ARPT,FL,-5,28.433,-81.333,19,607.029392,55.098924
4,722056,1,DAYTONA BEACH INTL AP,FL,-5,29.183,-81.067,19,558.467041,50.088347


#### 4.3 Filter out sites with low planting and/or high irrigation
- Main output: **siteyears_filtered**

After filtering, the number of usable site-years is lower than before (see the counts printed below).

In [79]:
# keep sites whose averaged planting area exceeds 1000 and whose averaged
# irrigation percentage is below 50
enough_area = df_summary.area > 1000
low_irrigation = df_summary.perct_irri < 50
df_filtered = df_summary[enough_area & low_irrigation]

# how many site-years left?
df_siteyears = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/siteyears_crithr2.csv',
                           dtype='str', usecols=[1,2])
kept = df_siteyears.site.isin(df_filtered.site)
siteyears_filtered = df_siteyears[kept]
print('prior:', df_siteyears.shape[0])
print('filtered:', siteyears_filtered.shape[0])

prior: 4225
filtered: 2603


### Step 5: Compile and gap-fill usable site-years data into individual weather data files
- Main tasks:
    - figure out time zone for individual site and convert wea data from UTC into local time
    - gap-fill wea data by linearly interpolating with data from hour before and after
- Main input:
    - /weadata/**stations_info.csv** - city, state, lat, lon info for each site
    - /weadata/**siteyears_crithr2.csv** - site-year info for data filtered with crithr = 2
    - df_temp, df_rh, df_precip, df_solrad
- Main output:
    - /weadata/data/control/**site_year.txt** - MAIZSIM weather file for every site-year
- Functions:
    - find_zone(site)
    - utc_to_local(times, zone)

#### 5.1 Read in weather file & site-year info

In [9]:
# combined 1961-2010 weather tables (first CSV column becomes the row index)
df_temp, df_rh, df_precip, df_solrad = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/home/disk/eos8/ach315/upscale/weadata/temp_all.csv',
        '/home/disk/eos8/ach315/upscale/weadata/rh_all.csv',
        '/home/disk/eos8/ach315/upscale/weadata/precip_all.csv',
        '/home/disk/eos8/ach315/upscale/weadata/solrad_all.csv',
    )
]

# site-year table (everything read as strings) and per-site summary
df_siteyears = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/siteyears_crithr2.csv',
                           dtype='str', usecols=[1,2])
# NOTE(review): index_col=0 assumes site_summary.csv was saved WITH an index column;
# if it was saved with index=False, 'site' becomes the index and the filter below fails - confirm
df_summary = pd.read_csv('/home/disk/eos8/ach315/upscale/weadata/site_summary.csv',
                         dtype={'site':str}, index_col=0)

# same area / irrigation thresholds as in Step 4.3
passes_filter = (df_summary.area > 1000) & (df_summary.perct_irri < 50)
df_filtered = df_summary[passes_filter]
siteyears = df_siteyears[df_siteyears.site.isin(df_filtered.site)]

#### 5.2 Create weather file for individual site-years

In [13]:
%%time

# set up growing season period
season_start, season_end = '03-02', '11-30'

# create individual site-year weather data
for i in np.arange(siteyears.shape[0]):
    # selecting site-year combinations
    site = siteyears.iloc[i,0]
    year = siteyears.iloc[i,1]
#    print(site, year)
    
    # constructing dataframe that will hold all weather data
    col = ['jday','date','hour','solrad','temp','precip','rh', 'co2']
    df_wea = pd.DataFrame(columns=col)

    # setting up for time-relating entries
    times = pd.date_range(season_start + '-' + str(year), 
                          season_end + '-' + str(year)+ ' 23:00:00', freq='1H') # utc time
    zone = find_zone(site, df_summary)
    local_datetime = utc_to_local(times, zone)

    # selecting weather data
    utc_start, utc_end = str(times[0]), str(times[-1])
    df_wea.temp = list(df_temp[utc_start:utc_end][site])
    df_wea.rh = list(np.round((df_rh[utc_start:utc_end][site]), 2))
    df_wea.precip = list(df_precip[utc_start:utc_end][site])
    df_wea.co2 = 400    

    # selecting solar radiation 
    t1 = pd.to_datetime(utc_start).to_pydatetime()
    t2 = pd.to_datetime(utc_end).to_pydatetime()
    tdiff = t2-t1
    local_start = str(local_datetime[0])[:19] 
    local_end = str(pd.to_datetime(local_start).to_pydatetime() + tdiff)[:19]
    df_wea.solrad = list(df_solrad[local_start:local_end][site]) ###*** issue here
    
    # adding time-relating info to data frame
    local = pd.date_range(local_start, local_end, freq='H')
    df_wea.jday = local.dayofyear
    df_wea.date = local.strftime("'%m/%d/%Y'")
    df_wea.hour = local.hour    
    
    # gap-filling weather data
    if df_wea.isna().sum().sum() > 0:
        # creating a log file that documents the number of missing data for each site-year
        f = open('/home/disk/eos8/ach315/upscale/weadata/data/log.txt', 'a+')
        f.write(siteyears.iloc[i,:][0]) # site
        f.write(', %s' %siteyears.iloc[i,:][1]) # year
        f.write(', %s' %df_wea.isna().sum().temp) # temp
        f.write(', %s' %df_wea.isna().sum().rh) # rh
        f.write(', %s' %df_wea.isna().sum().precip) # precip
        f.write(', %s\r\n' %df_wea.isna().sum().solrad) # solrad
        f.close()
        
        # gap-filling data by linearly interpolating with data from hour before and after
        df_wea = df_wea.interpolate() 
            
    # saving individual site-year weather file into .csv 
    df_wea.to_csv('/home/disk/eos8/ach315/upscale/weadata/data/control/' + site + '_' + year + '.txt', sep='\t', index=False)

CPU times: user 1h 1min 9s, sys: 29.5 s, total: 1h 1min 38s
Wall time: 1h 1min 6s


### Step 6: Final step of gap-filling if needed

Since `DataFrame.interpolate()` cannot gap-fill missing data located at the very beginning of the data (NaN in the first row), the code below checks whether any site-year files still contain missing values; if so, the missing data in the first row need to be assigned a default number.

In [14]:
def count_missing_values(path):
    """Return the total number of missing values in one site-year weather file.

    The files written in Step 5 are tab-separated and have no index column, so
    the file is parsed with a tab separator and every column is checked.
    """
    df_wea = pd.read_csv(path, sep='\t')
    return int(df_wea.isna().sum().sum())


# every site-year weather file written in Step 5
fnames = glob.glob('/home/disk/eos8/ach315/upscale/weadata/data/control/*')

for name in fnames:
    n_missing = count_missing_values(name)
    if n_missing > 0:
        print(name.split('/')[-1], n_missing)

# NOTE(review): an earlier version of this check reported that no files required
# additional gap-filling, but it searched a misspelled directory ('contrl') and
# parsed the files as comma-separated, so it could not detect any NaN - re-run to confirm.

### Step 7: Read in a final compiled weather file to check output

In [17]:
# spot-check one generated site-year file (tab-separated text)
test = pd.read_csv(
    '/home/disk/eos8/ach315/upscale/weadata/data/control/725430_1993.txt',
    sep='\t',
)
test.head()

Unnamed: 0,jday,date,hour,solrad,temp,precip,rh,co2
0,60,'03/01/1993',18,0.0,2.8,0.0,81.75,400
1,60,'03/01/1993',19,0.0,1.7,0.0,81.62,400
2,60,'03/01/1993',20,0.0,0.6,0.0,91.65,400
3,60,'03/01/1993',21,0.0,0.0,0.0,100.0,400
4,60,'03/01/1993',22,0.0,0.0,0.0,100.0,400


### Step 8: Compile and summarize weather data for individual site-year
- Convert dataframe-structured weather data into single long-form list
- Select only weather data from filtered site-years
- Summarize data only for growing season 4/1-10/31

In [29]:
# one summary value per filtered site-year, in the row order of `siteyears`
temp_all, rh_all, precip_all, solrad_all = [], [], [], []

for site, year in zip(siteyears.iloc[:, 0], siteyears.iloc[:, 1]):
    # row labels from April 1 to October 31 of this year (year is a string)
    # NOTE(review): if the row index holds plain timestamp strings rather than
    # datetimes, the end label 'YYYY-10-31' sorts before 'YYYY-10-31 00:00:00'
    # and October 31 itself would be left out - confirm
    season = slice(year + '-04-01', year + '-10-31')

    temp_all.append(df_temp.loc[season, site].mean())      # mean temperature
    rh_all.append(df_rh.loc[season, site].mean())          # mean relative humidity
    precip_all.append(df_precip.loc[season, site].sum())   # total precipitation
    solrad_all.append(df_solrad.loc[season, site].mean())  # mean solar radiation

# VPD from the seasonal mean temperature and the seasonal mean RH (passed as a fraction)
vpd_all = [CC_VPD(mean_temp, mean_rh/100) for mean_temp, mean_rh in zip(temp_all, rh_all)]

# collect everything next to the site-year identifiers, then add the site metadata
df_siteyears_weamean = siteyears.assign(temp=temp_all,
                                        rh=rh_all,
                                        precip=precip_all,
                                        solrad=solrad_all,
                                        vpd=vpd_all)
df_siteyears_weamean = df_siteyears_weamean.merge(df_filtered, on='site')

df_siteyears_weamean.head()
#df_siteyears_weamean.to_csv('/home/disk/eos8/ach315/upscale/weadata/wea_summary.csv')

Unnamed: 0,site,year,temp,rh,precip,solrad,vpd,class,station,state,tzone,lat,lon,years,area,perct_irri
0,722070,1961,22.527191,74.713304,881.8,234.060642,0.701216,1,SAVANNAH INTL AP,GA,-5,32.117,-81.2,22,2445.923794,6.585904
1,722070,1962,23.289261,75.118104,973.0,233.714984,0.723302,1,SAVANNAH INTL AP,GA,-5,32.117,-81.2,22,2445.923794,6.585904
2,722070,1965,23.004871,75.05625,226.6,229.640454,0.712474,1,SAVANNAH INTL AP,GA,-5,32.117,-81.2,22,2445.923794,6.585904
3,722070,1970,23.873122,72.770417,275.7,228.436228,0.820531,1,SAVANNAH INTL AP,GA,-5,32.117,-81.2,22,2445.923794,6.585904
4,722070,1971,23.547535,72.725135,564.4,227.59644,0.805592,1,SAVANNAH INTL AP,GA,-5,32.117,-81.2,22,2445.923794,6.585904
