Running the classical baseline for different numbers of principal components.

Was done as an ablation experiment in the Appendix of the thesis. Results are evaluated in 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib

import netCDF4
import gzip
import pickle

import os

import math
import numpy as np
import cartopy.crs as ccrs
from cartopy.util import add_cyclic_point
import scipy

from sklearn.model_selection import train_test_split

from icosahedron import Icosahedron, rand_rotation_icosahedron, rand_rotation_matrix, plot_voronoi, plot_voronoi_charts

# Get the dataset for given specifications we are interested in.

In [57]:
# Bookkeeping dictionary describing the model and how it is trained.
model_training_description = {"MODELTYPE": "Classical_flat_PCA"}

# Specification of the dataset that should be picked up automatically.
PREFIX = "HadCM3-flat"
DO_SHUFFLE = False
DSET_NR = 1

# Variable names in sorted order (the "p" variable is left out here).
ALL_VARIABLES = np.sort(["temp_1", "precip", "dO18"])

# Map boolean options to the tags used in folder names.
shuffle_dict = {True: "shuffle", False: "no-shuffle"}
corners_dict = {True: "interp-corners", False: "zero-fill-corners"}

DIRECTORY_DATASETS_INTERPOLATED = "Datasets/Interpolated/"  # directory where the interpolated dataset is stored
DIRECTORY_IMAGES = "Images/"  # directory where plots produced by the notebook are stored
DIRECTORY_OUTPUTS = "Output/Compare_n_pc"  # directory where the output is stored

# Folder name pattern: <prefix>_<shuffle tag>_<dataset nr>_<var1>-<var2>-...
DATASET_FOLDER = os.path.join(
    DIRECTORY_OUTPUTS,
    "_".join([PREFIX, shuffle_dict[DO_SHUFFLE], str(DSET_NR), "-".join(ALL_VARIABLES)]),
)

if not os.path.exists(DATASET_FOLDER):
    raise OSError("There exists no folder for the given specifications")

# Path of the gzipped dataset inside that folder.
DATASET = os.path.join(DATASET_FOLDER, "dataset.gz")

In [58]:
# Choose the standardization mode for every predictor and target variable,
# and the type of regression that is fitted on the principal components.
model_training_description.update({
    "S_MODE_PREDICTORS": ("Pixelwise", "Pixelwise"),
    "S_MODE_TARGETS": ("Pixelwise",),
    "REGTYPE": "lasso",
})

In [59]:
def load_data(path, s_mode_predictors, s_mode_targets):
    """
    Load data from file specified by path, prepare loaders with batchsize batch_size.
    Standardize the data with the given standardization mode
    """

    with gzip.open(path, 'rb') as f:
        dataset = pickle.load(f)

    train_predictors = dataset["train"]["predictors"].astype(np.float32)
    train_targets = dataset["train"]["targets"].astype(np.float32)
    test_predictors = dataset["test"]["predictors"].astype(np.float32)
    test_targets = dataset["test"]["targets"].astype(np.float32)
    
    n_predictors = train_predictors.shape[1]
    n_targets = train_targets.shape[1]
    w = train_predictors.shape[-1]
    h = train_predictors.shape[-2]
    
    # assert that standardize mode has one element for each variable.
    assert len(s_mode_predictors) == n_predictors
    assert len(s_mode_targets) == n_targets
    assert all([mode in ["None", "Pixelwise", "Global_mean_pixelwise_std", "Pixelwise_mean_global_std", "Global"] for mode in s_mode_predictors]) 
    assert all([mode in ["None", "Pixelwise", "Global_mean_pixelwise_std", "Pixelwise_mean_global_std", "Global"] for mode in s_mode_targets]) 


    # predictors: 
    for i, mode in enumerate(s_mode_predictors):    
        if mode == "Global": # Global normalization: Use same standard deviation for each pixel
            mean = np.mean(train_predictors[:,i,...],axis=(0,1,2), keepdims=True)
            std = np.mean(torch.std(train_predictors[:,i,...], axis=(0), keepdims=True), axis=(1,2), keepdims=True)
            std[std==0] = 1 # avoid dividing by zero
            
        elif mode == "Global_mean_local_std": # Subtract the global mean, but divide by local standard deviation
            mean = np.mean(train_predictors[:,i,...],axis=(0,1,2), keepdims=True)
            std = np.std(train_predictors[:,i,...],axis=(0), keepdims=True)
            std[std==0] = 1 # avoid dividing by zero
            
        elif mode == "Pixelwise_mean_global_std": # Subtract the global mean, but divide by local standard deviation
            mean = np.mean(train_predictors[:,i,...],axis=(0), keepdims=True)
            std = np.mean(torch.std(train_predictors[:,i,...], axis=(0), keepdims=True), axis=(1,2), keepdims=True)
            std[std==0] = 1 # avoid dividing by zero
            
        elif mode == "Pixelwise":  # Subtract pixelwise mean and ivide each pixel by its own standard deviation
            mean = np.mean(train_predictors[:,i,...], axis=(0), keepdims=True)
            std = np.std(train_predictors[:,i,...],axis=(0), keepdims=True)                             
            std[std==0] = 1 # avoid dividing by zero

        train_predictors[:,i,...] = (train_predictors[:,i,...] - mean)/std
        test_predictors[:,i,...] = (test_predictors[:,i,...] - mean)/std

    # targets: 
    for i, mode in enumerate(s_mode_targets):
        if mode == "Global": # Global normalization: Use same standard deviation for each pixel
            mean = np.mean(train_targets[:,i,...], axis=(0,1,2), keepdims=True)
            std = np.mean(torch.std(train_targets[:,i,...], axis=(0), keepdims=True), axis=(1,2), keepdims=True)
            std[std==0] = 1 # avoid dividing by zero
            
        elif mode == "Global_mean_local_std": # Subtract the global mean, but divide by local standard deviation
            mean = np.mean(train_targets[:,i,...],axis=(0,1,2), keepdims=True)
            std = np.std(train_targets[:,i,...],axis=(0), keepdims=True)
            std[std==0] = 1 # avoid dividing by zero
            
        elif mode == "Pixelwise_mean_global_std": # Subtract the local mean, but divide by global standard deviation
            mean = np.mean(train_targets[:,i,...],axis=(0), keepdims=True)
            std = np.mean(torch.std(train_targets[:,i,...], axis=(0), keepdims=True), axis=(1,2), keepdims=True)
            std[std==0] = 1 # avoid dividing by zero
            
        elif mode == "Pixelwise":  # Subtract pixelwise mean and ivide each pixel by its own standard deviation
            mean = np.mean(train_targets[:,i,...], axis=(0), keepdims=True)
            std = np.std(train_targets[:,i,...],axis=(0), keepdims=True)                            
            std[std==0] = 1 # avoid dividing by zero   
        
        train_targets[:,i,...] = (train_targets[:,i,...] - mean)/std
        test_targets[:,i,...] = (test_targets[:,i,...] - mean)/std 

    return train_predictors, train_targets, test_predictors, test_targets

In [60]:
def train_global_model(X_train, Y_train):
    """
    Fit a scikit-learn LinearRegression (default settings) on the given
    training data and return the fitted model.
    """
    # The docstring has to be the first statement of the function; placed
    # after the import it would not be registered as documentation.
    from sklearn.linear_model import LinearRegression

    regressor = LinearRegression().fit(X_train, Y_train)
    return regressor

def train_lasso(X_train, Y_train):
    """Fit a MultiTaskLassoCV (default settings) and return the fitted estimator."""
    from sklearn.linear_model import MultiTaskLassoCV

    estimator = MultiTaskLassoCV()
    return estimator.fit(X_train, Y_train)

def train_onedim_lasso(X_train, Y_train):
    """Fit a LassoCV (default settings) for a one-dimensional output and return it."""
    from sklearn.linear_model import LassoCV

    estimator = LassoCV()
    return estimator.fit(X_train, Y_train)
    
def predict_with_model(model, X_test, Y_test):
    """Return the model's predictions for X_test together with the unchanged Y_test."""
    predictions = model.predict(X_test)
    return predictions, Y_test

In [61]:
# write a wrapper for all the rescaling pca etc

def train_pca(X_tr, Y_tr, regtype=model_training_description["REGTYPE"], n_pc_in=20, n_pc_target=20):
    """ 
    Train PCA and regression model on the training data. In opposition to the version in the Jonathan_PCA_methods notebook, 
    we don't rescale here separately, rescaling is already done in the dataloader.
    Assume inputdata of shape (n_timesteps, n_variables, n_lat, n_lon).

    Every sample is flattened to a vector. One PCA is fitted on the predictors
    (n_pc_in components) and one on the targets (n_pc_target components). The
    regression model is then trained to map predictor components to target
    components.

    regtype selects the regression model: 'lasso' uses train_lasso, 'linreg'
    uses train_global_model. The default is the "REGTYPE" entry that
    model_training_description holds when this function is defined.

    Returns the tuple (pca, pca_targets, model).

    Raises ValueError if regtype is none of the supported names.
    """
    from sklearn.decomposition import PCA

    # Reject unsupported regression types before any expensive fitting. The
    # previous version only printed a message and then failed on the unbound
    # variable `model`.
    trainers = {'lasso': train_lasso, 'linreg': train_global_model}
    if regtype not in trainers:
        raise ValueError(
            "This regression model is currently not implemented: {!r}".format(regtype)
        )

    # flatten everything but the sample axis
    X_train = X_tr.reshape(X_tr.shape[0], -1)
    Y_train = Y_tr.reshape(Y_tr.shape[0], -1)

    # PCA of the predictors
    pca = PCA(n_components=n_pc_in)
    principalComponents = pca.fit_transform(X_train)

    # PCA of the targets
    pca_targets = PCA(n_components=n_pc_target)
    principalComponents_targets = pca_targets.fit_transform(Y_train)

    # regression from predictor components to target components
    model = trainers[regtype](principalComponents, principalComponents_targets)
    return pca, pca_targets, model

def test_pca_scaling(X_te, Y_te, pca, pca_targets, model):
    """
    Predict the targets for X_te with a fitted PCA regression pipeline.

    Assume inputdata of shape (n_timesteps, n_variables, n_lat, n_lon)

    The samples of X_te are flattened and projected with pca.transform, the
    model predicts the target components, and pca_targets.inverse_transform
    maps them back to the flattened target space.

    Y_te is only used for its shape: the prediction is returned reshaped to
    Y_te.shape.
    """
    # flatten everything but the sample axis
    X_test = X_te.reshape(X_te.shape[0], -1)

    X_test_rescaled = pca.transform(X_test)
    predict_test = model.predict(X_test_rescaled)
    predict_test = pca_targets.inverse_transform(predict_test)

    return predict_test.reshape(*Y_te.shape)

In [62]:
# Ten logarithmically spaced values between 3 and 300. The number of target
# components is this grid truncated to integers; the number of input
# components is 1.5 times the grid, truncated to integers.
_pc_grid = np.logspace(np.log10(3), np.log10(300), 10)
n_pc_out = _pc_grid.astype("int")
n_pc_in = (1.5 * _pc_grid).astype("int")

In [63]:
import hashlib

# The dataset and its standardization do not depend on the loop variables,
# so it is loaded once instead of once per run.
train_predictors, train_targets, test_predictors, test_targets = load_data(
    DATASET,
    s_mode_predictors=model_training_description["S_MODE_PREDICTORS"],
    s_mode_targets=model_training_description["S_MODE_TARGETS"],
)

model_name = "classical_pca_"

# One configuration per pair (number of input components, number of target
# components); every configuration is run three times.
for i, (n_in, n_target) in enumerate(zip(n_pc_in, n_pc_out)):
    print(i)
    model_training_description["n_pc_in"] = n_in
    model_training_description["n_pc_target"] = n_target
    for j in range(3):
        model_training_description["RUN_NR"] = j

        # Name the run directory after a digest of the training description.
        # The built-in hash() is salted per interpreter process for strings,
        # so it would give a different name in every session and an existing
        # run could never be detected. A hashlib digest is reproducible.
        description_key = repr(sorted(
            (key, str(value)) for key, value in model_training_description.items()
        ))
        hash_value = hashlib.sha256(description_key.encode("utf-8")).hexdigest()[:16]
        run_directory = os.path.join(DATASET_FOLDER, model_name + hash_value)

        # Check before training so that no computation is wasted on a
        # configuration that has already been stored.
        if os.path.exists(run_directory):
            raise FileExistsError("Hash collision. Probably a model run with this configuration already exists.")

        pca, pca_targets, pca_model = train_pca(
            train_predictors,
            train_targets,
            regtype=model_training_description["REGTYPE"],
            n_pc_in=model_training_description["n_pc_in"],
            n_pc_target=model_training_description["n_pc_target"],
        )

        predict_targets = test_pca_scaling(
            test_predictors,
            test_targets,
            pca, pca_targets, pca_model,
        )

        # Store the description and the predictions of this run.
        os.makedirs(run_directory)
        print("writing model and training description")
        with gzip.open(os.path.join(run_directory, "model_training_description.gz"), 'wb') as f:
            pickle.dump(model_training_description, f)
        print("writing model predictions")
        with gzip.open(os.path.join(run_directory, "predictions.gz"), 'wb') as f:
            pickle.dump({"predictions": predict_targets.reshape(test_targets.shape)}, f)
        print("done")

0
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
1
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
2
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
3
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
4
writing model and training description
writing model predictions
done
writing model and training description
writing model predictions
done
writing mo