If we want to compare our results to other methods, we need to interpolate the data back to a regular grid. To do so, we first need to create a .nc file that we can pass to cdo.

Only process one dataset folder at a time.

In [15]:
# pylint: disable=E1101,R,C
import numpy as np
import gzip
import netCDF4 as nc
import matplotlib.pyplot as plt
import pickle
import numpy as np
import os

from icosahedron import Icosahedron, rand_rotation_icosahedron, rand_rotation_matrix, plot_voronoi, plot_voronoi_charts

In [16]:
# Directory layout used by this notebook. All paths are relative.
DIRECTORY_DATASETS_INTERPOLATED = "Datasets/Interpolated/"
# Prefix of the original .nc files that are listed in `dnames` further below.
DIRECTORY_DATASETS_ORIGINAL = "Datasets/Original/"
DIRECTORY_IMAGES = "Images/"
DIRECTORY_SCRIPTS = "Scripts/"
# Parent directory of the dataset folder that is searched for model runs.
DIRECTORY_OUTPUTS = "Output/Compare_UNet_architectures/"

In [17]:
def check_dict_conditions(dic, conditions):
    """
    Check whether the dict "dic" fulfills all of the given "conditions".

    "conditions" is an iterable of (key, value) pairs. A pair is satisfied
    if the key is present in "dic" and the stored value equals the given
    value. Returns True if every pair is satisfied (also when there are no
    pairs at all), otherwise False.
    """
    return all(key in dic and dic[key] == value for key, value in conditions)

In [18]:
# Dataset parameters. Together they determine the name of the dataset folder below.
# To leave a parameter unspecified use: "*"
# NOTE(review): the folder is checked with os.path.exists, which does not expand
# wildcards, so a "*" setting will end in the OSError raised further down - confirm
# whether wildcard support is still intended here.
PREFIX = "HadCM3-ico"
RESOLUTION = 5
DO_SHUFFLE = False
INTERPOLATION = "cons1"
INTERPOLATE_CORNERS = True
# Sorted, so that the joined variable names in the folder name have a fixed order.
ALL_VARIABLES = np.sort(["temp_1","precip","dO18"]) # alternatives: "*" (all possible combinations) or np.sort(["temp_1","precip","dO18","p"])
DSET_NR = "1"

# Helper dicts: translate the boolean settings into the strings used in the folder name.
shuffle_dict = {True:"shuffle", False:"no-shuffle", "*": "*"}
corners_dict = {True: "interp-corners", False: "zero-fill-corners", "*": "*"}

# Folder name: the six parameters joined by "_", followed by the variable names joined by "-".
DATASET_FOLDER = "{}_{}_{}_{}_{}_{}_".format(PREFIX, RESOLUTION, shuffle_dict[DO_SHUFFLE], INTERPOLATION, 
                                         corners_dict[INTERPOLATE_CORNERS], DSET_NR)
DATASET_FOLDER = DATASET_FOLDER + "-".join(ALL_VARIABLES)
DATASET_FOLDER = os.path.join(DIRECTORY_OUTPUTS, DATASET_FOLDER)

# Fail early if there is no output folder for this parameter combination.
if not os.path.exists(DATASET_FOLDER):
    raise OSError("There exists no folder for the given specifications")

    
DATASET_DESCRIPTION_FILE = os.path.join(DATASET_FOLDER, "dataset-description.gz")

# The dataset description is a gzip-compressed pickle. Later cells read the keys
# "target_variables", "files_used", "RESOLUTION", "indices_train" and "indices_test" from it.
# NOTE(review): pickle.load executes arbitrary code - only load files you created yourself.
with gzip.open(DATASET_DESCRIPTION_FILE, 'rb') as f:
    dataset_description = pickle.load(f)

    
    
# Filled in parallel: entry k of each list belongs to the same model run.
model_descriptions = []
testset_predictions = []
model_descriptions_paths = []

# Conditions that a run has to match in order to be loaded, given as a list of
# (key, value) pairs, e.g. [("NUM_EPOCHS", 3)]. An empty list accepts every run.
conditions = []


subdirs = [d for d in os.listdir(DATASET_FOLDER) if os.path.isdir(os.path.join(DATASET_FOLDER, d))]

for subdir in subdirs:
    run_path = os.path.join(DATASET_FOLDER, subdir)
    files = [f for f in os.listdir(run_path) if os.path.isfile(os.path.join(run_path, f))]

    # A valid model run contains both the training description and the predictions.
    if "model_training_description.gz" not in files or "predictions.gz" not in files:
        print("Encountered Invalid directory")
        continue

    with gzip.open(os.path.join(run_path, "model_training_description.gz"), 'rb') as f:
        tmp_description = pickle.load(f)

    # Skip runs whose description does not satisfy our conditions.
    if not check_dict_conditions(tmp_description, conditions):
        continue

    # Store the description and its path, then print the description.
    model_descriptions.append(tmp_description)
    model_descriptions_paths.append(run_path)
    print("Path: ", subdir)
    for key, value in tmp_description.items():
        print(key,": ", value)
    print("\n")

    with gzip.open(os.path.join(run_path, "predictions.gz"), 'rb') as g:
        testset_predictions.append(pickle.load(g))

Path:  -0xb4eecf0bc51a89e
MODELTYPE :  UNet
S_MODE_PREDICTORS :  ('Pixelwise', 'Pixelwise')
S_MODE_TARGETS :  ('Pixelwise',)
RUN_NR :  5
NUM_EPOCHS :  early_stopping
BATCH_SIZE :  8
LEARNING_RATE :  0.005
DEPTH :  3
IN_CHANNELS :  2
CHANNELS_FIRST_CONV :  32
OUT_CHANNELS :  1
FMAPS :  (32, 32, 64, 64)
ACTIVATION :  <class 'torch.nn.modules.activation.ReLU'>
NORMALIZATION :  <class 'torch.nn.modules.batchnorm.BatchNorm3d'>
loss :  MSELoss
optimizer :  Adam
#params :  1884369


Path:  0x73a3d33d45d81cf6
MODELTYPE :  UNet
S_MODE_PREDICTORS :  ('Pixelwise', 'Pixelwise')
S_MODE_TARGETS :  ('Pixelwise',)
RUN_NR :  3
NUM_EPOCHS :  early_stopping
BATCH_SIZE :  8
LEARNING_RATE :  0.005
DEPTH :  3
IN_CHANNELS :  2
CHANNELS_FIRST_CONV :  32
OUT_CHANNELS :  1
FMAPS :  (32, 32, 64, 64)
ACTIVATION :  <class 'torch.nn.modules.activation.ReLU'>
NORMALIZATION :  <class 'torch.nn.modules.batchnorm.BatchNorm3d'>
loss :  MSELoss
optimizer :  Adam
#params :  1884369


Path:  -0x59c939c9644cd2d1
MODELTYPE :

# Step 1: Get required ground truth
(Requires grid_description only)

In [19]:
print("Target variable:", dataset_description["target_variables"])

# Paths of the ground-truth files that are needed for the target variable.
required_files = []

# Only the first target variable is inspected, and only dO18 is supported.
if dataset_description["target_variables"][0] != "dO18":
    raise NotImplementedError("Currently only d18O is a valid target variable.")

# Of all files used to build the dataset, keep those with "isotopes" in their name.
required_files.extend(
    used_file for used_file in dataset_description["files_used"] if "isotopes" in used_file
)

Target variable: ['dO18']


In [20]:
def get_dataset_dict(path_ds_6_nb, path_ds_5_nb=None):
    """
    Open the netCDF files for the six-neighbor and (optionally) five-neighbor points.

    path_ds_6_nb: path of the file with all points that have six neighbors.
        Its path must contain "nbs_6".
    path_ds_5_nb: path of the file with all points that have five neighbors,
        or None if only the six-neighbor file is used.

    Returns a dict with the opened netCDF4.Dataset under the key "6_nb" and,
    if a second path was given, under the key "5_nb". The files are opened in
    append mode ("a") and are not closed by this function.
    """
    import netCDF4
    assert "nbs_6" in path_ds_6_nb

    # Collect the paths first; the six-neighbor file is always opened first.
    paths_by_key = {"6_nb": path_ds_6_nb}
    if path_ds_5_nb is not None:
        paths_by_key["5_nb"] = path_ds_5_nb

    return {key: netCDF4.Dataset(path, "a") for key, path in paths_by_key.items()}


def get_indices_charts_shape(res):
    """
    Split the pixel indices of an icosahedral grid by their number of neighbors.

    An Icosahedron with r=res is created. A pixel whose voronoi region has more
    than five entries is put into the six-neighbor group, all other pixels into
    the five-neighbor group.

    Returns the tuple (indices_five_nb, indices_six_nb, charts_shape), where the
    first two are numpy arrays of pixel indices and the last one is the shape of
    the charts returned by get_charts_cut(), which is needed later on.
    """
    ico = Icosahedron(r=res)
    regions, _ = ico.get_voronoi_regions_vertices()
    charts = ico.get_charts_cut()

    indices_six_nb = np.array([i for i, region in enumerate(regions) if len(region) > 5])
    indices_five_nb = np.array([i for i, region in enumerate(regions) if not len(region) > 5])
    return indices_five_nb, indices_six_nb, charts.shape


def combine_datasets(dataset_dict, indices_five_nb, indices_six_nb):
    """
    Merge the separately stored six-neighbor and five-neighbor data into one array.

    dataset_dict: dict with the key "6_nb" and optionally the key "5_nb", each
        holding an array. The arrays are indexed as [:, indices] below.
    indices_five_nb, indices_six_nb: positions in the combined array at which
        the five-neighbor and six-neighbor data are placed.

    Returns an array of zeros-initialised floats whose last axis is 10 entries
    longer than the last axis of the six-neighbor data. If there is no "5_nb"
    entry, the five-neighbor positions are filled with zeros.
    """
    assert "6_nb" in dataset_dict
    six_nb_data = dataset_dict["6_nb"]

    # NOTE(review): the 10 extra entries are hard-coded; presumably this is the
    # number of five-neighbor points - confirm against indices_five_nb.
    combined_shape = six_nb_data.shape[:-1] + (six_nb_data.shape[-1] + 10,)
    combined_data = np.zeros(combined_shape)

    combined_data[:, indices_six_nb] = six_nb_data
    combined_data[:, indices_five_nb] = dataset_dict.get("5_nb", 0)
    return combined_data

In [21]:
# Open the ground-truth files. get_dataset_dict asserts that its first argument contains
# "nbs_6", so required_files[0] has to be the six-neighbor file; required_files[1] is
# passed as the five-neighbor file.
isotopes = get_dataset_dict(required_files[0], required_files[1])

In [22]:
# Names of the datasets that must all be present at a timestep: a timestep is only
# kept if it is missing in none of these datasets.
dnames = [DIRECTORY_DATASETS_ORIGINAL+"xnapa_isotopes.nc",\
          DIRECTORY_DATASETS_ORIGINAL+"xnapa_precip.nc",\
          DIRECTORY_DATASETS_ORIGINAL+"xnapa_slp.nc",\
          DIRECTORY_DATASETS_ORIGINAL+"xnapa_temp.nc"]

def get_shared_timesteps(dataset_names):
    """
    Return the timesteps that are present in every one of the given datasets.

    Not all datasets share the same timesteps. The biggest problems occur in the slp dataset. We want to exclude all
    time steps where one of the variables is missing.

    dataset_names: non-empty sequence of paths to netCDF files that each contain a time variable "t".

    Returns the values of "t" that occur in all datasets. For two or more datasets this is the
    result of np.intersect1d (sorted, unique values); for a single dataset its time axis is
    returned unchanged. An empty sequence raises a TypeError (from functools.reduce).
    """
    from functools import reduce

    time_axes = []
    for dataset_name in dataset_names:
        # The files are only read, so open them read-only, and close each one again
        # as soon as its time axis has been copied into memory.
        with nc.Dataset(dataset_name, "r") as dataset:
            time_axes.append(dataset.variables["t"][:].data)

    # Keep only the elements that are shared by all variables.
    common_dates = reduce(np.intersect1d, time_axes)

    return common_dates

In [23]:
# Build the ground-truth dO18 array on the icosahedral grid:
#   1. read dO18 from every opened isotope file,
#   2. keep only the timesteps shared by all datasets in `dnames` and within the year range,
#   3. combine five- and six-neighbor points and reshape to (time, -1, cs[-2]).
d18O = {}


# Time axis and time bounds of the six-neighbor file.
# NOTE(review): t and t_bnds are NOT restricted to `indices` below, while d18O is.
# If the filter removes any timestep, indexing t with indices that refer to the
# filtered d18O will be misaligned - confirm that this is intended.
t = isotopes["6_nb"].variables["t"][:].data
t_bnds = isotopes["6_nb"].variables["t_bnds"][:]


# The selection of timesteps depends only on the six-neighbor time axis, so it is
# computed once instead of once per dataset (which re-opened all files in `dnames`).
c_dates = get_shared_timesteps(dnames)
# get the corresponding indices:
indices = np.flatnonzero(np.isin(t, c_dates))
# t // 360 is compared against the range 654 <= . < 1654.
# NOTE(review): presumably t counts days of a 360-day calendar, so this is a year range - confirm.
years = t[indices] // 360
index_mask = np.logical_and(years >= 654, years < 1654)
indices = indices[index_mask]

for name, dset in isotopes.items():
    # squeeze removes the singleton axes before the timesteps are selected on axis 0
    d18O[name] = np.squeeze(dset.variables["dO18"][:])[indices,...]

indices_five_nb, indices_six_nb, cs = get_indices_charts_shape(dataset_description["RESOLUTION"])
d18O = combine_datasets(d18O, indices_five_nb, indices_six_nb)
d18O = d18O.reshape(d18O.shape[0],-1,cs[-2])

In [24]:
# Split ground truth, time axis and time bounds into training and test part, using the
# index sets stored in the dataset description. Indexing happens along the first axis.
# NOTE(review): d18O was restricted to the shared timesteps in the previous cell, whereas
# t and t_bnds still hold the full time axis of the six-neighbor file - confirm that
# "indices_train"/"indices_test" address the same timesteps in both.
d18O_ico_train = d18O[dataset_description["indices_train"],...]
t_train = t[dataset_description["indices_train"],...]
t_bnds_train = t_bnds[dataset_description["indices_train"],...]

d18O_ico_test = d18O[dataset_description["indices_test"],...]
t_test = t[dataset_description["indices_test"],...]
t_bnds_test = t_bnds[dataset_description["indices_test"],...]

## 1 a) Undo the standardization:

In [25]:
def _destandardization_stats(train_data, mode):
    """
    Return the pair (std, mean) that undoes the standardization `mode`.

    train_data: training ground truth; axis 0 is the sample axis and the last two axes
        are treated as the spatial axes. (The original code hard-coded the spatial axes
        as (2, 3), which is out of bounds for the 3-dimensional array that the reshape
        in the ground-truth cell produces; using the last two axes gives the same
        result for 4-dimensional input and also works for 3-dimensional input.)
    mode: one of "Global", "Pixelwise", "Global_mean_pixelwise_std",
        "Pixelwise_mean_global_std".

    Both returned arrays keep their reduced axes with length one, so they broadcast
    against the predictions. Entries of std that are exactly zero are replaced by one.
    Raises NotImplementedError for any other mode.
    """
    spatial_axes = tuple(range(train_data.ndim - 2, train_data.ndim))

    # NOTE(review): for the two mixed modes the statistics look swapped relative to the
    # mode names ("Global_mean_pixelwise_std" uses a pixelwise mean with a global std and
    # vice versa). They are kept exactly as before because they have to mirror the
    # standardization applied during training - verify against the training code.
    if mode in ("Global", "Global_mean_pixelwise_std"):
        std = np.mean(np.std(train_data, axis=(0,), keepdims=True), axis=spatial_axes, keepdims=True)
    elif mode in ("Pixelwise", "Pixelwise_mean_global_std"):
        std = np.std(train_data, axis=(0), keepdims=True)
    else:
        raise NotImplementedError("{} is not a valid keyword for standardization".format(mode))
    std[std==0] = 1

    if mode in ("Global", "Pixelwise_mean_global_std"):
        mean = np.mean(train_data, axis=(0,) + spatial_axes, keepdims=True)
    else:
        mean = np.mean(train_data, axis=(0), keepdims=True)
    return std, mean


# One array per model run, with the target variables stacked along axis 1.
rescaled_predictions = []

# The statistics depend only on the training data and the mode, so compute them once per mode.
_stats_by_mode = {}

for i, description in enumerate(model_descriptions):
    predictions = testset_predictions[i]["predictions"]
    per_target = []
    for j, mode in enumerate(description["S_MODE_TARGETS"]):
        # Channel j of the predictions belongs to target variable j.
        target_prediction = predictions[:,j,...]
        if mode == "None":
            # No standardization was applied. Use only channel j, like all other modes do
            # (previously all channels were appended here, which broke the stacking below).
            per_target.append(target_prediction)
            continue
        if mode not in _stats_by_mode:
            _stats_by_mode[mode] = _destandardization_stats(d18O_ico_train, mode)
        std, mean = _stats_by_mode[mode]
        per_target.append((target_prediction * std) + mean)
    rescaled_predictions.append(np.stack(per_target, axis=1))

## 2 a) Split the dataset into pixels with five and six neighbors

In [26]:
# Result arrays of shape (number of model runs, number of test samples, number of pixels in the group).
rescaled_predictions_five_nb = np.zeros((len(rescaled_predictions), d18O_ico_test.shape[0], len(indices_five_nb)))
rescaled_predictions_six_nb = np.zeros((len(rescaled_predictions), d18O_ico_test.shape[0], len(indices_six_nb)))

for i, model_prediction in enumerate(rescaled_predictions):
    # Flatten everything but the sample axis, so that the pixel indices can be applied.
    # The flattened array also replaces the entry in rescaled_predictions.
    flat_prediction = model_prediction.reshape(d18O_ico_test.shape[0],-1)
    rescaled_predictions[i] = flat_prediction
    rescaled_predictions_five_nb[i] = flat_prediction[:,indices_five_nb]
    rescaled_predictions_six_nb[i] = flat_prediction[:,indices_six_nb]

## 2 b) Save these as two separate .nc files.

In [27]:
# Display the ground-truth files; the next cell uses each of them as template for an output file.
required_files

['Datasets/Interpolated/xnapa_isotopes_r_5_nbs_6_cons1.nc',
 'Datasets/Interpolated/xnapa_isotopes_r_5_nbs_5_cons1.nc']

In [28]:
# For every model run, write the rescaled test-set predictions into two new netCDF files
# (one for the six-neighbor points, one for the five-neighbor points). Each output file
# takes its global attributes, selected dimensions and grid variables from the matching
# ground-truth file in required_files.

# Variables that are copied unchanged from the ground-truth file.
tocopy = ['lon', 'lon_bnds', 'lat', 'lat_bnds']
# Dimensions that are re-created in the output file.
dimscopy =['t', 'bnds', 'ncells','vertices']

# Paths of the written files, in the order of model_descriptions_paths.
paths_5_nbs = []
paths_6_nbs = []


for i, model_description_path in enumerate(model_descriptions_paths):
    path_stem = os.path.splitext(model_description_path)[0]
    for filename in required_files:
        # Decide on output path and data first, so that an unexpected file is rejected
        # before anything is written (previously a stale path from the preceding
        # iteration would have been overwritten).
        if "nbs_6" in filename:
            netcdf4_path = path_stem+"_6_nbs.nc"
            predictions_to_write = rescaled_predictions_six_nb[i]
            paths_6_nbs.append(netcdf4_path)
        elif "nbs_5" in filename:
            netcdf4_path = path_stem+"_5_nbs.nc"
            predictions_to_write = rescaled_predictions_five_nb[i]
            paths_5_nbs.append(netcdf4_path)
        else:
            raise ValueError("Expected 'nbs_6' or 'nbs_5' in the filename, got: {}".format(filename))

        # Open the template once and make sure that both files are closed again,
        # also if an error occurs while writing.
        with nc.Dataset(filename) as src, nc.Dataset(netcdf4_path, "w") as dst:
            src_target = src.variables["dO18"]
            original_dimensions = src_target.dimensions
            # Keep the first and the third dimension of the original variable, dropping the second.
            necessary_dimensions = (original_dimensions[0], original_dimensions[2])

            # copy global attributes all at once via dictionary
            dst.setncatts(src.__dict__)
            # copy dimensions; unlimited dimensions stay unlimited
            for name, dimension in src.dimensions.items():
                if name in dimscopy:
                    dst.createDimension(name, (len(dimension) if not dimension.isunlimited() else None))
            # copy the grid variables including their attributes
            for name, variable in src.variables.items():
                if name in tocopy:
                    dst.createVariable(name, variable.datatype, variable.dimensions)
                    dst[name][:] = src[name][:]
                    # copy variable attributes all at once via dictionary
                    dst[name].setncatts(src[name].__dict__)

            # The target variable keeps datatype and attributes of the original variable.
            dst.createVariable("dO18", src_target.datatype, necessary_dimensions)
            dst.variables["dO18"].setncatts(src_target.__dict__)
            # The time information is taken from the test set, not from the template.
            dst.createVariable("t", "float64", ("t",))
            dst.createVariable("t_bnds", "float64", ("t","bnds"))
            dst.variables["t"][:] = t_test
            dst.variables["t_bnds"][:] = t_bnds_test

            dst.variables["dO18"][:] = predictions_to_write