# Creating an xarray Dataset of ground observations
- Addressing issues Nic and I have encountered while trying to create xarray.Datasets of ground observations

In [1]:
## Import statements
import numpy as np
import xarray as xr
import pandas as pd
from netCDF4 import Dataset
from netCDF4 import num2date, date2num
from datetime import datetime, timedelta
import pytz

# OS interaction
import sys
import os

In [5]:
## Directory Lists
dirSIO = '/Users/karllapo/gdrive/SnowHydrology/proj/CloudClimatology/data/GroundObs/YOS.SIO.Obs'
dirOut = '/Users/karllapo/gdrive/SnowHydrology/proj/CloudClimatology/data/GroundObs'


In [6]:
###########################################
## Ground Obs: SIO
# Empty lists and dictionary literals
stations = []
lat = []
lon = []
elev = []
network = []
grobs = {}
count = 0

# time zone variables
tz_pst = pytz.timezone('US/Pacific')

#########################
##### READ SIO DATA #####
#########################
os.chdir(dirSIO)
content = os.listdir(os.getcwd())
num_files = len([name for name in os.listdir('.') if os.path.isfile(name)])
na_value = ['   NaN']

# Read supporting station information
os.chdir(dirOut)
stdat = pd.read_csv('All_StationSummary.v2.csv',sep= ',', \
                index_col=0,na_values=[-9999,'NaN']) # Read the supporting information
stdat = stdat.groupby('Network').get_group('CDWR')

for files in content:
    # Only read QC formatted files
    if files[-10:] == 'Rad.QC.txt':
        count = count + 1
        if count > 2:
            break
        
        # What file are we reading?
        sitename = files.split('.')[0]
        stations.append(sitename)

        # Read SW data, asign to PST, and get SW that passes QC
        grobs_yos = pd.read_csv(files,sep= '\t', parse_dates=True, index_col=0, na_values=na_value)
        grobs_yos.index = grobs_yos.index#.tz_localize(pytz.utc).tz_convert(tz_pst)
        grobs_yos['SWdwn_QC'] = grobs_yos['SWdwn_Wm^-2'].where(grobs_yos['QCFlag'] == 0)

        # New data frame w/ daily means
        grobs_yos_daily = grobs_yos['SWdwn_Wm^-2'].resample('D', how='mean').to_frame(name='SWdwn_D')
        grobs_yos_daily['SWdwn_D_QC'] = grobs_yos['SWdwn_QC'].resample('D', how='mean')
        grobs_yos_daily['SWdwn_D_proc'] = grobs_yos['SWdwn_proc'].resample('D', how='mean')

        # List containing DataFrames with daily, processed only
        grobs[sitename] = pd.DataFrame(grobs_yos_daily['SWdwn_D_proc'])
        grobs[sitename].columns = ['SWdwn']
        grobs[sitename].index = grobs_yos_daily.index

        # Fill in elevation/lat/lon/network
        elev.append(stdat.loc[sitename]['elevation (m)'])
        lat.append(stdat.loc[sitename]['lat'])
        lon.append(stdat.loc[sitename]['lon'])
        network.append('CDWR')

        print(("Processed Site: "+sitename))

Processed Site: bee
Processed Site: dan


the new syntax is .resample(...).mean()
the new syntax is .resample(...).mean()
the new syntax is .resample(...).mean()


In [7]:
# convert each site from a pandas Dataframe to an xarray Dataset
grobsXR = {}
for site in grobs:
    grobsXR[site] = xr.Dataset.from_dataframe(grobs[site])
    
# align each Dataset - this approach won't work for list comprehension (align does not expect lists.
[grobsXR['dan'],grobsXR['bee']] = xr.align(grobsXR['dan'],grobsXR['bee'],join='outer')
grobsXR = xr.concat([grobsXR['dan'],grobsXR['bee']],dim='station')

print(grobsXR)

<xarray.Dataset>
Dimensions:   (datetime: 2972, station: 2)
Coordinates:
  * datetime  (datetime) datetime64[ns] 2004-08-27 2004-08-28 2004-08-29 ...
  * station   (station) int64 0 1
Data variables:
    SWdwn     (station, datetime) float64 808.2 264.9 239.8 254.9 284.9 ...


In [8]:
########################
##### Combine data using pandas Dataframes -- this approach is robust for ragged station data
# Concatenate 
grobs_daily = pd.concat(grobs,axis=0,keys=stations)
grobs_daily = pd.DataFrame(grobs_daily)

# Convert to xray
ds = xr.Dataset.from_dataframe(grobs_daily)
ds = ds.rename({'level_0':'station','datetime':'time'})

# Fill in descriptive variables
ds.coords['lat'] = ('station',lat)
ds.coords['lon'] = ('station',lon)
ds.coords['elev'] = ('station',elev)
ds.coords['network'] = ('station',network)

print(ds)

<xarray.Dataset>
Dimensions:  (station: 2, time: 2972)
Coordinates:
  * station  (station) object 'bee' 'dan'
  * time     (time) datetime64[ns] 2004-08-27 2004-08-28 2004-08-29 ...
    lat      (station) float64 37.53 37.9
    lon      (station) float64 118.3 119.3
    elev     (station) float64 2.768e+03 2.987e+03
    network  (station) |S4 'CDWR' 'CDWR'
Data variables:
    SWdwn    (station, time) float64 nan nan nan nan nan nan nan nan nan nan ...
