# ETL
*Extract-transform-load*

This notebook performs the data engineering steps required for Met-ML training and evaluation:

- load the FLUXNET CSVs
- fit transformers on the full dataset
- save the preprocessed data and transformers for use in the next steps of the project

In [1]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import xarray as xr

from dask.distributed import Client

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split

from joblib import load, dump

from fluxnet_etl import load_fluxnet, make_lookback

In [2]:
# Load the fluxnet dataset: by default read the joblib cache in ./etl_data,
# or rebuild that cache from the source data when `reload` is switched on.
reload = False
if not reload:
    # fast path: reuse the artifacts written by an earlier reload run
    # (joblib files are pickle-based, so only load caches you produced yourself)
    x_data_computed = load('./etl_data/x_data_computed.joblib')
    y_data_computed = load('./etl_data/y_data_computed.joblib')
    meta = load('./etl_data/meta.joblib')
else:
    # slow path: start a local dask cluster to speed up the load
    # NOTE(review): the client is never closed here — presumably it is left
    # running for the rest of the session; confirm that is intended
    client = Client(n_workers=25)
    x_data_computed, y_data_computed, meta = load_fluxnet(compute=True)

    # refresh the on-disk cache so the fast path can be used next time
    dump(x_data_computed, './etl_data/x_data_computed.joblib')
    dump(y_data_computed, './etl_data/y_data_computed.joblib')
    dump(meta, './etl_data/meta.joblib')

In [3]:
def fit_x_transformers(dfs):
    '''Fit the per-column feature preprocessing on a collection of dataframes.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames that are stacked row-wise before fitting. Each one must provide
        the columns 'P', 't_min', 't_max', 't', 'lat' and 'elev'.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The fitted transformer. Its output columns follow the order of the
        transformer list below; columns not listed there are dropped
        (ColumnTransformer's default ``remainder``).
    '''
    # stack every frame into one table with a fresh 0..n-1 index
    stacked = pd.concat(dfs, ignore_index=True)

    # (name, transformer, columns) triples, one per input feature
    steps = [
        # cube root of P; FunctionTransformer is stateless, so nothing is learned
        ("P", FunctionTransformer(np.cbrt, validate=False), ['P']),
        # zero-mean / unit-variance scaling
        ("t_min", StandardScaler(), ['t_min']),
        ("t_max", StandardScaler(), ['t_max']),
        # passed through unchanged: expected to already lie between 0 and 1
        ("t", 'passthrough', ['t']),
        ("lat", 'passthrough', ['lat']),
        # maybe this should be a MinMaxScaler
        ("elev", StandardScaler(), ['elev']),
    ]

    # ColumnTransformer.fit returns the fitted instance itself
    return ColumnTransformer(transformers=steps).fit(stacked)


def fit_y_transformers(dfs):
    '''Fit the target scaler on a collection of dataframes.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Target frames that are stacked row-wise before fitting.

    Returns
    -------
    sklearn.preprocessing.MinMaxScaler
        Scaler fitted on every column of the stacked frame; with its default
        settings it maps each column onto the range 0 to 1.
    '''
    # stack every frame into one table with a fresh 0..n-1 index
    stacked = pd.concat(dfs, ignore_index=True)

    # MinMaxScaler.fit returns the fitted instance itself
    return MinMaxScaler().fit(stacked)

In [4]:
def split(x_dfs, y_dfs, test_size=365):
    '''Split the fluxnet dataset into training and validation groups.

    Every (x, y) pair of frames is cut in order, without shuffling, so the
    final ``test_size`` rows (default 365, i.e. the last N days) become the
    validation part and everything before them becomes the training part.

    In the future, this should support splitting out complete stations.

    Parameters
    ----------
    x_dfs, y_dfs : list of pandas.DataFrame
        Feature and target frames, paired up by position.
    test_size : int, optional
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple of four lists
        ``(x_train, x_val, y_train, y_val)``, each with one entry per input pair.
    '''
    # one bucket per output, laid out in the order train_test_split returns
    # its pieces: x_train, x_val, y_train, y_val
    buckets = ([], [], [], [])

    for station_x, station_y in zip(x_dfs, y_dfs):
        pieces = train_test_split(station_x, station_y,
                                  test_size=test_size,
                                  shuffle=False)
        for bucket, piece in zip(buckets, pieces):
            bucket.append(piece)

    return buckets


# split the data into train/val groups
x_train, x_val, y_train, y_val = split(x_data_computed, y_data_computed)

# size of the lookback dimension handed to make_lookback; the same number of
# leading rows is dropped from every target frame further down
lookback = 90

# ---- x data ----
# fit the x-transformer and save it for later
# NOTE(review): the transformer is fit on the full, unsplit dataset, so the
# validation rows contribute to the scaling statistics — confirm intended
x_trans = fit_x_transformers(x_data_computed)
dump(x_trans, './etl_data/x_trans.joblib')

# create the 3D tensor for the LSTM including a lookback dimension
for name, df_list in (('x_train', x_train), ('x_val', x_val)):
    # feature labels come from the first frame's columns
    # NOTE(review): presumably these match the transformer's output column
    # order — verify against fit_x_transformers / make_lookback
    features = df_list[0].columns
    da = xr.concat(
        [
            make_lookback(x_trans.transform(df), features, lookback=lookback)
            for df in df_list
        ],
        dim='samples',
    )
    da.name = name
    print(name, da.shape)
    da.to_netcdf(f'./etl_data/{name}.nc')  # save for later


# ---- y data ----
# fit the y-transformer and save it for later
y_trans = fit_y_transformers(y_data_computed)
dump(y_trans, './etl_data/y_trans.joblib')

# transform the targets and save all samples (minus the lookback) for later
# NOTE(review): these arrays use the dim name 'sample' while the x arrays
# above are concatenated along 'samples' — confirm downstream readers expect it
for name, df_list in (('y_train', y_train), ('y_val', y_val)):
    da = xr.concat(
        [
            xr.DataArray(y_trans.transform(df[lookback:]),
                         dims=('sample', 'feature'))
            for df in df_list
        ],
        dim='sample',
    )
    da.name = name
    print(name, da.shape)
    da.to_netcdf(f'./etl_data/{name}.nc')  # save for later

x_train (464187, 90, 6)
x_val (55000, 90, 6)
y_train (464187, 1)
y_val (55000, 1)
