In [1]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd
import xarray as xr

from dask.distributed import Client

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split

from joblib import dump

from fluxnet_etl import load_fluxnet, make_lookback

In [5]:
# Start a Dask distributed client backed by 12 workers. No scheduler address
# is given, so the cluster runs on this machine (the captured output shows a
# 127.0.0.1 scheduler).
# NOTE(review): the worker count is hard-coded; adjust it to the host's cores.
client = Client(n_workers=12)
# Bare trailing expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:34653  Dashboard: proxy/8787/status,Cluster  Workers: 12  Cores: 12  Memory: 42.95 GB


In [6]:
# Load the inputs, targets and metadata through the project's ETL helper.
# The x and y results are later iterated in parallel (`zip`) and passed to
# `pd.concat`, so they are sequences of pandas objects.
# NOTE(review): presumably one DataFrame per FLUXNET site, and `compute=True`
# presumably materialises lazy Dask results in memory; confirm in fluxnet_etl.
# `meta` is not used by the cells visible in this notebook.
x_data_computed, y_data_computed, meta = load_fluxnet(compute=True)

In [8]:
def fit_x_transformers(dfs):
    """Fit the per-column preprocessing for the input features.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames that each contain the columns ``P``, ``t_min``, ``t_max``,
        ``t``, ``lat`` and ``elev``. They are stacked row-wise (with a fresh
        integer index) before fitting.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Fitted transformer whose output columns follow the order
        ``P, t_min, t_max, t, lat, elev``.
    """
    # One entry per feature; insertion order fixes the output column order.
    per_column = {
        "P": FunctionTransformer(np.cbrt, validate=False),  # cube root of P
        "t_min": StandardScaler(),
        "t_max": StandardScaler(),
        "t": 'passthrough',
        "lat": 'passthrough',
        "elev": StandardScaler(),
    }
    # Each transformer is named after, and applied to, its own single column.
    steps = [(col, transformer, [col]) for col, transformer in per_column.items()]

    stacked = pd.concat(dfs).reset_index(drop=True)
    # ``fit`` returns the fitted transformer itself.
    return ColumnTransformer(transformers=steps).fit(stacked)


def fit_y_transformers(dfs):
    """Fit a min-max scaler for the target variable(s).

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Target frames; they are stacked row-wise (with a fresh integer index)
        before fitting.

    Returns
    -------
    sklearn.preprocessing.MinMaxScaler
        Scaler with default settings, fitted on the stacked frames.
    """
    stacked_targets = pd.concat(dfs).reset_index(drop=True)
    # ``fit`` returns the fitted scaler itself.
    return MinMaxScaler().fit(stacked_targets)

In [15]:
def split(x_dfs, y_dfs, test_size=365):
    """Split every (x, y) pair into a leading train part and a trailing validation part.

    Each pair is split independently with ``train_test_split(..., shuffle=False)``,
    so row order is preserved and the validation part is the tail of each frame.

    Parameters
    ----------
    x_dfs, y_dfs : iterables
        Parallel collections of input and target frames; they are walked
        together with ``zip``.
    test_size : int or float, default 365
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple of four lists
        ``(x_train, x_val, y_train, y_val)``, each holding one entry per input pair.
    """
    x_train, x_val, y_train, y_val = [], [], [], []
    # train_test_split returns its pieces in the same order as these buckets.
    buckets = (x_train, x_val, y_train, y_val)

    for x_frame, y_frame in zip(x_dfs, y_dfs):
        pieces = train_test_split(x_frame, y_frame,
                                  test_size=test_size,
                                  shuffle=False)
        for bucket, piece in zip(buckets, pieces):
            bucket.append(piece)

    return x_train, x_val, y_train, y_val


# Hold out the tail of every frame (default: 365 rows) for validation.
x_train, x_val, y_train, y_val = split(x_data_computed, y_data_computed)

# Create the output directory up front so a fresh checkout can run this cell.
os.makedirs('./etl_data', exist_ok=True)

# save x data
lookback = 90  # window length passed to make_lookback (second axis of the saved x arrays)
# Fit the scalers on the training split only. Fitting on the full data set
# would leak validation statistics into the preprocessing.
x_trans = fit_x_transformers(x_train)
dump(x_trans, './etl_data/x_trans.joblib')
for name, df_list in zip(['x_train', 'x_val'], [x_train, x_val]):
    # NOTE(review): assumes the frame's column order matches the
    # ColumnTransformer's output order (P, t_min, t_max, t, lat, elev).
    features = df_list[0].columns
    da = xr.concat([make_lookback(x_trans.transform(df), features, lookback=lookback)
                    for df in df_list],
                   dim='samples')
    da.name = name
    print(name, da.shape)
    da.to_netcdf(f'./etl_data/{name}.nc')


# save y data
# Same rule as for x: the target scaler only sees the training split.
y_trans = fit_y_transformers(y_train)
dump(y_trans, './etl_data/y_trans.joblib')
for name, df_list in zip(['y_train', 'y_val'], [y_train, y_val]):
    # Skip the first `lookback` rows of each frame so the targets line up
    # with the windows saved above (the saved x and y sample counts match).
    # NOTE(review): the dimension is named 'sample' here but 'samples' for x;
    # left unchanged because downstream readers may rely on the saved names.
    da = xr.concat([xr.DataArray(y_trans.transform(df[lookback:]),
                                 dims=('sample', 'feature'))
                    for df in df_list], dim='sample')
    da.name = name
    print(name, da.shape)
    da.to_netcdf(f'./etl_data/{name}.nc')

x_train (464187, 90, 6)
x_val (55000, 90, 6)
y_train (464187, 1)
y_val (55000, 1)
