In [None]:
import glob
from pathlib import Path

import pandas as pd
import xarray as xr

from blooms_ml.utils import (
    sample_stations_sparse,
    get_from_dia,
    get_from_avg,
)

In [None]:
# Collect at most the first 20 `*dia*` and `*avg*` NetCDF files of the
# hindcast, each list in sorted path order so the two slices are deterministic.
files_dia = sorted(
    glob.iglob(f"{Path.home()}/fram_shmiak/ROHO800_hindcast_2007_2019_v2bu/roho800_v2bu_dia/*dia*.nc")
)[:20]
files_avg = sorted(
    glob.iglob(f"{Path.home()}/fram_shmiak/ROHO800_hindcast_2007_2019_v2bu/roho800_v2bu_avg/*avg*.nc")
)[:20]

In [None]:
# Sample station locations from the first dia file; the returned xis/etas are
# what the extraction cells pass to get_from_dia / get_from_avg.
stations, st_labels, xis, etas = sample_stations_sparse(
    xr.open_dataset(files_dia[0]),
)

In [None]:
# Open each file list as one multi-file dataset (dia first, then avg).
ds_dia, ds_avg = xr.open_mfdataset(files_dia), xr.open_mfdataset(files_avg)

In [None]:
# Extract the dia variables at the sampled xis/etas locations.
# NOTE(review): the `ddf` name and the .compute() call suggest a lazy (dask)
# dataframe - confirm against blooms_ml.utils.get_from_dia.
ddf_dia = get_from_dia(ds_dia, xis, etas)
# Materialise the result; this is the untouched copy that df_dia is built from.
df_dia_orig = ddf_dia.compute()

In [None]:
# Extract the avg variables at the same sampled xis/etas locations.
# NOTE(review): the `ddf` name and the .compute() call suggest a lazy (dask)
# dataframe - confirm against blooms_ml.utils.get_from_avg.
ddf = get_from_avg(ds_avg, xis, etas)
# Materialise the result; this is the untouched copy that df is built from.
df_orig = ddf.compute()

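Reset the index and rebuild the MultiIndex as (`station`, `ocean_time`, `s_rho`) on both frames. Columns are copied from one frame to the other below, and if the index levels are not in the same order pandas will silently misalign the values.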

In [None]:
# Give both frames the same (station, ocean_time, s_rho) MultiIndex: flatten
# whatever index they arrived with, discard the leftover 'index' column, and
# re-index on the three key columns in a fixed order.
df_dia = (
    df_dia_orig
    .reset_index()
    .drop(columns='index')
    .set_index(['station', 'ocean_time', 's_rho'])
)
df = (
    df_orig
    .reset_index()
    .drop(columns='index')
    .set_index(['station', 'ocean_time', 's_rho'])
)

In [None]:
# Copy the two dia-derived columns onto the avg frame. Series assignment aligns
# on the index labels (station, ocean_time, s_rho), not on row position, which
# is why both frames were given the same MultiIndex beforehand.
df['light_PAR0'] = df_dia['light_PAR0']
df['P1_netPI'] = df_dia['P1_netPI']

In [None]:
# Time series of light_PAR0 at station 0, level s_rho == -0.02, taken straight
# from the dia frame.
# The two conditions are combined into ONE mask on ONE flattened frame:
# chaining `[mask_a][mask_b]` with masks built from separate reset_index()
# calls makes pandas reindex the second mask (UserWarning) and rebuilds the
# flat frame three times.
# Note: `== -0.02` is an exact float comparison, so the literal has to match
# the stored s_rho value bit-for-bit.
df_station = (
    df_dia.reset_index()
    .loc[lambda flat: (flat['station'] == 0) & (flat['s_rho'] == -0.02)]
    .set_index('ocean_time')
)
df_station['light_PAR0'].plot(
    figsize=(14, 7),
    title='light_PAR0 from df_dia (station 0, s_rho = -0.02)',
)

In [None]:
# Same time series as plotted from df_dia, but read from `df` after the
# light_PAR0 column was copied onto it - a visual check that the copy aligned.
# The two conditions are combined into ONE mask on ONE flattened frame:
# chaining `[mask_a][mask_b]` with masks built from separate reset_index()
# calls makes pandas reindex the second mask (UserWarning) and rebuilds the
# flat frame three times.
# Note: `== -0.02` is an exact float comparison, so the literal has to match
# the stored s_rho value bit-for-bit.
df_station = (
    df.reset_index()
    .loc[lambda flat: (flat['station'] == 0) & (flat['s_rho'] == -0.02)]
    .set_index('ocean_time')
)
df_station['light_PAR0'].plot(
    figsize=(14, 7),
    title='light_PAR0 from df (station 0, s_rho = -0.02)',
)

In [None]:
def normalize_series(row: pd.Series):
    """Standardise a series to zero mean and unit standard deviation.

    The mean and the standard deviation (pandas default, i.e. sample std) are
    taken over the series itself. The z-scores are rounded to two decimals and
    returned as ``float32``; the index is preserved.

    A series with zero spread, or with fewer than two values, yields NaN
    because the division has no finite result.
    """
    centered = row - row.mean()
    z_scores = centered / row.std()
    return z_scores.round(2).astype('float32')

def append_rho_profiles(df_station, nlayers: int = 25):
    """Attach each time step's normalised density profile to every row of a station.

    For one station, the ``rho`` values are pivoted into one row per
    ``ocean_time`` and one column per ``s_rho`` level. Each row (profile) is
    standardised with ``normalize_series`` and the level columns are named
    ``'1'`` .. ``'N'`` in the order ``pivot`` returns them. The profile
    belonging to a row's ``ocean_time`` is then appended to that row.

    Profiles are matched to rows by their ``ocean_time`` value. The previous
    implementation repeated every profile ``nlayers`` times and glued it on by
    row position, which silently misaligned the data whenever the rows were not
    sorted by time or the number of levels differed from ``nlayers``.

    Args:
        df_station: Rows of a single station; must contain the columns
            ``ocean_time``, ``s_rho`` and ``rho`` with a unique
            (``ocean_time``, ``s_rho``) pair per row.
        nlayers: Retained for backward compatibility only. Alignment is now
            key-based, so the value does not affect the result.

    Returns:
        A copy of ``df_station`` with a fresh 0..n-1 index, the original
        columns, and the profile columns ``'1'`` .. ``'N'`` appended.

    Raises:
        ValueError: Propagated from ``DataFrame.pivot`` when an
            (``ocean_time``, ``s_rho``) pair occurs more than once.
    """
    df_station = df_station.reset_index(drop=True)
    # Wide layout: one row per time step, one column per vertical level.
    rho = df_station.pivot(index='ocean_time', columns='s_rho', values='rho')
    # Replace the float s_rho labels with '1'..'N' string column names.
    rho.columns = [str(i) for i in range(1, len(rho.columns) + 1)]
    # Standardise every profile (row) independently.
    rho = rho.apply(normalize_series, axis=1)
    # Key-based alignment: each row picks up the profile of its own ocean_time.
    return df_station.join(rho, on='ocean_time')

In [None]:
# Flatten the MultiIndex into columns, then attach the normalised density
# profile columns one station at a time.
# This cell rebinds `df`; rebuild `df` from the earlier cells before running
# it again rather than re-executing it in isolation.
df = (
    df.reset_index()
    .groupby('station')
    .apply(append_rho_profiles)
)

In [None]:
# Keep only the rows with s_rho above -0.3 (surface) and renumber them 0..n-1.
df = df.loc[df['s_rho'] > -0.3].reset_index(drop=True)
# Standardise the columns at positions 3..10, each column over all remaining
# rows. The slice is positional, so it depends on the current column order.
df.iloc[:, 3:11] = df.iloc[:, 3:11].apply(normalize_series)

In [None]:
# Display the final frame for a visual check before it is written to disk.
df

In [None]:
# Persist the normalised table as Parquet under ~/data_ROHO.
# NOTE(review): the target directory is not created here - confirm it exists
# before running this cell.
df.to_parquet(f"{Path.home()}/data_ROHO/all-stations-norm.parquet")