# Estimate baseline skill for LR & RF using the microphysics dataset
An improvement on the first scripts, using code from NN_Emu_Example.ipynb



In [1]:
# path to local fv3net emulation directory

import sys
sys.path.append("/home/jmnugent/fv3net/workflows/emulation/")

import intake
import time
import os
import yaml
import tempfile

import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import train_emulator as te

from matplotlib import colors
from cartopy import crs as ccrs
from fv3fit._shared.packer import ArrayPacker
from fv3viz import pcolormesh_cube, plot_cube_axes, mappable_var

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from loaders.mappers import open_phys_emu_training
from loaders.batches import batches_from_mapper


 The versions of TensorFlow you are currently using is 2.4.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
# Directories holding the training and validation subsets of the
# simple-phys-hybridedmf-w-microphysics-12day data.
# NOTE(review): these are absolute paths on one machine; edit them to run elsewhere.
train_data_path = "/mnt/disks/scratch/training/training-subsets/simple-phys-hybridedmf-w-microphysics-12day/"
test_data_path = "/mnt/disks/scratch/testing/validation-subsets/simple-phys-hybridedmf-w-microphysics-12day/"

# `te` is the train_emulator module imported in the first cell.
# Presumably each call returns a sequence of xarray datasets (later cells slice
# train_batches and call dataset methods on its items) -- TODO confirm.
train_batches = te.get_subsampled_batches(train_data_path)
test_batches = te.get_subsampled_batches(test_data_path)


In [3]:
# multi index is dropped before netcdf save, so we have to add back in
# to get full spatial field
def _get_multi_idx(ds):
    coords_to_unstack = ["tile", "x", "y"]
    coords = [ds.coords[key].values for key in coords_to_unstack]
    multi_idx = pd.MultiIndex.from_tuples(zip(*coords), names=coords_to_unstack)
    
    ds = ds.reset_coords(coords_to_unstack, drop=True)
    return ds.assign_coords({"sample": multi_idx})


### Subselect variables, stack, and concatenate

##### Pick variables to include in the concatenated dataset

In [4]:
# State variables available as model inputs.
input_vars = [
    "eastward_wind",
    "northward_wind",
    "vertical_wind",
    "air_temperature",
    "pressure_thickness_of_atmospheric_layer",
    "specific_humidity",
    "cloud_water_mixing_ratio",
]

# Tendencies due to the full FV3 physics.
all_phys_vars = [
    "tendency_of_air_temperature_due_to_fv3_physics",
    "tendency_of_specific_humidity_due_to_fv3_physics",
    "tendency_of_eastward_wind_due_to_fv3_physics",
    "tendency_of_northward_wind_due_to_fv3_physics",
    "tendency_of_cloud_water_mixing_ratio_due_to_fv3_physics",
    "tendency_of_pressure_thickness_of_atmospheric_layer_due_to_fv3_physics",
]

# Tendencies due to microphysics only.
micro_vars = [
    "tendency_of_specific_humidity_due_to_microphysics",
    "tendency_of_air_temperature_due_to_microphysics",
]

# Abbreviations for the names above. The order must match
# input_vars + all_phys_vars + micro_vars entry for entry.
short_names = [
    # input_vars
    "u", "v", "w", "T", "dP", "qv", "qc",
    # all_phys_vars
    "T_phys", "qv_phys", "u_phys", "v_phys", "qc_phys", "dP_phys",
    # micro_vars
    "qv_micro", "T_micro",
]


In [5]:
# Every variable named in this notebook: inputs, physics tendencies and
# microphysics tendencies (technically not ALL variables in the dataset).
all_vars = input_vars + all_phys_vars + micro_vars

# zip() stops at the shorter sequence without complaint, so a list edited
# without its counterpart would leave the mapping truncated or misaligned.
if len(short_names) != len(all_vars):
    raise ValueError(
        f"short_names has {len(short_names)} entries but all_vars has {len(all_vars)}"
    )

# map each short name to its full variable name
shortname_dict = dict(zip(short_names, all_vars))

# Pick variables by short name (less typing than the full names).
# Direct indexing raises KeyError on a misspelled short name, where .get()
# would silently have put None into the list.
vars_to_include = [
    shortname_dict[key]
    for key in ["T", "dP", "qv", "qc", "T_phys", "qv_phys",
                "qc_phys", "qv_micro", "T_micro"]
]


#### Process into one shortened/concatenated dataset
Also top 15 levels

In [15]:
# %%time
# NOTE(review): this cell is disabled -- every line is commented out, so it
# defines neither train_ds_list nor test_ds_list.
# If re-enabled it would, for each batch: keep only `vars_to_include`, take
# z indices 15 and above (`slice(15, None)`), and rebuild the (tile, x, y)
# sample MultiIndex with `_get_multi_idx`.
# TODO confirm that `slice(15, None)` is the intended "top 15 levels" subset;
# that depends on the length and ordering of z, which is not visible here.


# train_ds_list = [_get_multi_idx(b[vars_to_include].isel(z=slice(15, None)))
#                  for b in train_batches]
# test_ds_list = [_get_multi_idx(b[vars_to_include].isel(z=slice(15, None)))
#                  for b in test_batches]



CPU times: user 23.3 s, sys: 1.41 s, total: 24.7 s
Wall time: 4min 23s


In [143]:
# For each of the first five training batches, index `sample` by
# (time, tile, x, y) and unstack the result.
temp_list = [
    batch.set_index(sample=['time', 'tile', 'x', 'y']).unstack()
    for batch in train_batches[:5]
]
# Bare expression so the notebook displays the first converted batch.
temp_list[0]

In [142]:
# Join the unstacked batches in temp_list along the 'time' dimension.
temp_cat = xr.concat(temp_list, dim='time')
# Bare expression so the notebook displays the concatenated result.
temp_cat