# Here we will go through some requested problems.

## Data grouping in Pandas. (e.g. how to simply calculate and plot weekly averages of time series data.)

In [1]:
import pandas as pd  # Convention suggest to import as pd
from pathlib import Path  # Importing a sub-module of the pathlib library

In [2]:
# Raw MET logger file, located relative to this notebook.
filename = Path('..') / 'data' / 'sgpmetE13.b1' / 'sgpmetE13.00.20191105.150801.raw.dat'

# Physical lines 0, 2 and 3 of the file are skipped, so the first line that
# remains (line 1) supplies the column names.  The first column is parsed as
# datetimes and becomes the index, which resample() relies on later.
df = pd.read_csv(
    filename,
    sep=',',
    skiprows=[0, 2, 3],
    header=0,
    index_col=[0],
    parse_dates=[0],
)
df

Unnamed: 0_level_0,RECORD,batt_volt,PTemp,Pressure_kPa,Temp_C_Avg,Temp_C_Std,RH_Avg,RH_Std,Vap_Pressure_kPa_Avg,Vap_Pressure_kPa_Std,...,PWDa_Code_1hour,PWDa_H2O_Int_Avg_1min,PWDa_Total_H2O_mm,PWDa_Total_Snow_mm,PCP_Rate,WS_Slope,WS_Offset,TBRG_SN,RainCoefA,RainCoefB
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-05 13:00:00,303333,11.73,6.048,98.78,2.24,0.029,87.6,0.318,0.629,0.002,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:01:00,303334,11.72,6.048,98.78,2.299,0.017,87.5,0.144,0.631,0.001,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:02:00,303335,12.02,6.048,98.78,2.309,0.016,87.0,0.193,0.628,0.001,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:03:00,303336,11.7,6.048,98.78,2.331,0.014,87.1,0.157,0.629,0.001,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:04:00,303337,11.71,6.048,98.78,2.334,0.014,86.9,0.11,0.628,0.001,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:05:00,303338,11.73,6.048,98.78,2.346,0.012,86.5,0.236,0.625,0.002,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:06:00,303339,11.72,6.048,98.78,2.325,0.017,86.4,0.135,0.624,0.001,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:07:00,303340,11.71,6.048,98.78,2.248,0.028,86.2,0.213,0.619,0.002,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:08:00,303341,11.7,6.048,98.78,2.196,0.014,86.5,0.171,0.619,0.001,...,0,0,62.31,14,0,0.098,0,118,0,1.024
2019-11-05 13:09:00,303342,11.73,6.048,98.78,2.146,0.029,86.7,0.203,0.618,0.002,...,0,0,62.31,14,0,0.098,0,118,0,1.024


We can use the resample() method with a duration in string format, followed by the mean() method to indicate how to perform the averaging. If this raises an error, it may be because the index of the Dataframe is just a number and not the timestamp read from the file. We indicated that the index of the Dataframe is TIMESTAMP with the index_col=[0] keyword. If you do not do this when reading the data, set it after the data is read in with df = df.set_index('TIMESTAMP').

In [3]:
# Pull one column out as a Pandas Series and average it in 10-minute blocks.
temperature = df['Temp_C_Avg'].resample('10min').mean()
temperature

TIMESTAMP
2019-11-05 13:00:00    2.2774
2019-11-05 13:10:00    2.3260
2019-11-05 13:20:00    3.0298
2019-11-05 13:30:00    3.3490
2019-11-05 13:40:00    3.5721
2019-11-05 13:50:00    3.9559
Freq: 10T, Name: Temp_C_Avg, dtype: float64

We can also perform the resampling on the entire Dataframe with one call.

In [4]:
# Average every column in 10-minute bins with a single call.
# NOTE: this rebinds `df`, so the one-minute rows are replaced by the averages.
ten_minute_bins = df.resample('10min')
df = ten_minute_bins.mean()
df

Unnamed: 0_level_0,RECORD,batt_volt,PTemp,Pressure_kPa,Temp_C_Avg,Temp_C_Std,RH_Avg,RH_Std,Vap_Pressure_kPa_Avg,Vap_Pressure_kPa_Std,...,PWDa_Code_1hour,PWDa_H2O_Int_Avg_1min,PWDa_Total_H2O_mm,PWDa_Total_Snow_mm,PCP_Rate,WS_Slope,WS_Offset,TBRG_SN,RainCoefA,RainCoefB
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-05 13:00:00,303337.5,11.747,6.048,98.78,2.2774,0.019,86.84,0.188,0.625,0.0014,...,0.0,0.0,62.31,14.0,0.0,0.098,0.0,118.0,0.0,1.024
2019-11-05 13:10:00,303347.5,11.747,6.0801,98.78,2.326,0.0255,86.91,0.1578,0.6275,0.0013,...,0.0,0.0,62.31,14.0,0.0,0.098,0.0,118.0,0.0,1.024
2019-11-05 13:20:00,303357.5,11.799,6.3276,98.792,3.0298,0.0249,84.73,0.1608,0.6431,0.0011,...,0.0,0.0,62.31,14.0,0.0,0.098,0.0,118.0,0.0,1.024
2019-11-05 13:30:00,303367.5,11.834,6.7329,98.805,3.349,0.0163,83.31,0.2114,0.6471,0.002,...,0.0,0.0,62.31,14.0,0.0,0.098,0.0,118.0,0.0,1.024
2019-11-05 13:40:00,303377.5,11.86,7.2724,98.835,3.5721,0.0158,83.75,0.2158,0.6606,0.002,...,0.0,0.0,62.31,14.0,0.0,0.098,0.0,118.0,0.0,1.024
2019-11-05 13:50:00,303387.5,11.867,7.8961,98.855,3.9559,0.0203,83.24,0.1789,0.6746,0.0015,...,0.0,0.0,62.31,14.0,0.0,0.098,0.0,118.0,0.0,1.024


## Aligning time stamps from separate Dataframes
There are many ways to accomplish this. This example will focus on using Pandas, and specifically the merge method, to combine the data from both sources into one Dataframe.

First we will read in data from the same day of two different datastreams. Working with netCDF data files is better with Xarray but we can easily read the data with Xarray and convert to Pandas. This means we can still use the Space we are most comfortable working with. Notice we do not need to set the time as the index as we did above.

In [5]:
import xarray as xr


def _keep_only(dataset, wanted):
    """Delete, in place, every data variable of `dataset` whose name is not in `wanted`."""
    # Snapshot the variable names first so deleting does not happen while iterating.
    for name in list(dataset.data_vars):
        if name not in wanted:
            del dataset[name]


# MET datastream file.
filename = Path('..', 'data', 'sgpmetE13.b1', 'sgpmetE13.b1.20191101.000000.cdf')
ds_met = xr.open_dataset(filename)

# Trim the Dataset down to a few variables to make viewing easier.
keep_variables = ['atmos_pressure', 'temp_mean', 'rh_mean', 'time']
_keep_only(ds_met, keep_variables)

# ECOR datastream file.
filename = Path('..', 'data', 'sgpecorsfE14.b1', 'sgpecorsfE14.b1.20191101.000000.nc')
ds_ecor = xr.open_dataset(filename)

# Trim the Dataset down to a few variables to make viewing easier.
keep_variables = ['air_temperature', 'air_pressure', 'relative_humidity', 'time']
_keep_only(ds_ecor, keep_variables)

Notice that the time steps do not match exactly. The MET data has one-minute time steps and the ECOR data has 30-minute time steps.

In [6]:
# Show the length of each time axis and its first two stamps.
for label, dataset in (('MET:', ds_met), ('ECOR:', ds_ecor)):
    times = dataset['time']
    print(label, times.shape, times.values[:2])

MET: (1440,) ['2019-11-01T00:00:00.000000000' '2019-11-01T00:01:00.000000000']
ECOR: (48,) ['2019-11-01T00:00:00.000000000' '2019-11-01T00:30:00.000000000']


Since we desire to work in the Pandas Space we can convert the Xarray Dataset to Pandas Dataframe.

In [7]:
# Convert both Xarray Datasets to Pandas Dataframes.
df_met, df_ecor = ds_met.to_dataframe(), ds_ecor.to_dataframe()

Now we can use the merge() method to merge the two Dataframes into one. We need to indicate what to match the rows on with on="time", and which Dataframe provides the time steps to match to with how="left". Notice that this means the time steps of the left Dataframe (df_ecor) are the ones we keep.

In [8]:
# Left merge: df_ecor is the left frame, so its time steps are the ones kept.
df_merged = df_ecor.merge(df_met, how='left', on='time')
print(df_merged.columns, df_merged.shape, sep='\n')

Index(['air_temperature', 'air_pressure', 'relative_humidity',
       'atmos_pressure', 'temp_mean', 'rh_mean'],
      dtype='object')
(48, 6)


And just to show that the values are actually subsampled, we can compute a difference. Notice that the difference hovers around the offset between Kelvin and degrees Celsius (273.15).

In [9]:
# ECOR temperature minus MET temperature, row by row on the merged time steps.
ecor_temp = df_merged['air_temperature']
met_temp = df_merged['temp_mean']
difference = ecor_temp.sub(met_temp)
difference.head(5)

time
2019-11-01 00:00:00    273.815002
2019-11-01 00:30:00    273.296997
2019-11-01 01:00:00    273.117981
2019-11-01 01:30:00    273.539001
2019-11-01 02:00:00    273.498016
dtype: float32

## Align along time but keep separate
But what if we don't want to put both data sets into the same object? We can use the align() method to modify the two Dataframes and align them on the same time grid. We need to provide axis=0 to ensure it aligns on the time axis only and does not try to align on variable names too.

In [10]:
# join='right' uses the index of the frame passed as the argument (df_ecor);
# axis=0 restricts the alignment to the row index.
aligned = df_met.align(df_ecor, join='right', axis=0)
met_align, ecor_align = aligned
for label, frame in (('MET:', met_align), ('ECOR:', ecor_align)):
    print(label, frame.shape)

MET: (48, 3)
ECOR: (48, 3)


In [11]:
# Same comparison as before, now taken from the two separately aligned frames.
ecor_temp = ecor_align['air_temperature']
met_temp = met_align['temp_mean']
difference = ecor_temp.sub(met_temp)
difference.head(5)

time
2019-11-01 00:00:00    273.815002
2019-11-01 00:30:00    273.296997
2019-11-01 01:00:00    273.117981
2019-11-01 01:30:00    273.539001
2019-11-01 02:00:00    273.498016
dtype: float32

## We can do the same actions in the Xarray Space using the Xarray Datasets

In [12]:
# ds_ecor is listed first; with join='left' the first object's index is the one used.
datasets = [ds_ecor, ds_met]
ds_merged = xr.merge(datasets, combine_attrs='override', join='left')
ds_merged

Or we can return two Xarray Datasets with the time steps converted to match.

In [13]:
# Align both Datasets on the first (left) object's index and keep them separate.
aligned = xr.align(ds_ecor, ds_met, join='left')
ds_ecor_align, ds_met_align = aligned
for label, dataset in (('ECOR:', ds_ecor_align), ('MET:', ds_met_align)):
    print(label, dataset['time'].shape)

ECOR: (48,)
MET: (48,)
