In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from pathlib import Path
from typing import Literal

from datautilities import meshmethods as mm
from datautilities import MeshIO as io

In [2]:
# Identifiers of the low- and high-resolution data sets handed to the loaders
lr_res, hr_res = "LR8_raw_6cpu", "HR_6cpu"

# Years used for fitting and the year used for evaluation
train_years = list(range(1959, 1964))
test_years = 1964

# Additional input variable; "Bat" selects the mesh-based branch in the
# training cell
add_var = "Bat"

# One data directory per region, built from the region number
regions = [1, 2, 3]
region_pattern = "Region{}LR4"

root_dirs = [Path("../Data", region_pattern.format(region)) for region in regions]

In [3]:
class PreProcessing:
    """Holds fitted preprocessing state (z-normalization statistics and PCA).

    The ``*_fit`` methods store their parameters on the instance so that the
    matching transform methods can later be applied to other data with the
    same feature layout.
    """

    def __init__(self):
        # Per-feature statistics, populated by znormalize_fit
        self.mean = None
        self.std = None

        # Fitted sklearn PCA instance, populated by pca_fit
        self.pca = None

    @staticmethod
    def delete_nans(X: np.ndarray):
        """Remove every feature column of X that consists solely of NaNs.

        X has to be of shape (nsamples, nfeatures)
        """
        nan_idx = np.nonzero(np.isnan(X).all(axis=0))[0]
        return np.delete(X, nan_idx, axis=1)

    def znormalize(self, X: np.ndarray):
        """Standardize X with the statistics stored by znormalize_fit.

        Features whose stored standard deviation is exactly zero (constant
        columns, e.g. a polynomial bias column) are only centered instead of
        being divided by zero, so they map to 0 rather than NaN/inf.

        Raises
        ------
        ValueError
            If znormalize_fit has not been called before.
        """
        if self.mean is None or self.std is None:
            raise ValueError(
                ("Either 'self.mean or self.std is 'None'. Precompute them "
                 "by calling znormalize_fit first.")
            )
        # Guard against division by zero for constant features; the stored
        # self.std is left untouched so it still reports the true statistics
        safe_std = np.where(self.std == 0, 1.0, self.std)
        return (X - self.mean) / safe_std

    def znormalize_fit(self, X: np.ndarray):
        """Precompute mean and std of data set for later normalization.

        NaN entries are ignored when computing the per-feature statistics.
        """
        self.mean = np.nanmean(X, axis=0)
        self.std = np.nanstd(X, axis=0)

    def pca_transform(self, X: np.ndarray):
        """Wrapper around sklearn's 'PCA.transform()' method.

        Raises
        ------
        ValueError
            If pca_fit has not been called before.
        """
        if self.pca is None:
            raise ValueError(
                "No PCA instance found. Call pca_fit first to transform data."
            )
        return self.pca.transform(X)

    def pca_fit(self, X: np.ndarray, **kwargs):
        """Fit PCA instance. Kwargs are passed to pca initialization."""
        pca = PCA(**kwargs)
        self.pca = pca.fit(X)

    @staticmethod
    def polynomial_transform(X: np.ndarray, degree: int = 2):
        """Expand X with sklearn's PolynomialFeatures of the given degree."""
        return PolynomialFeatures(degree=degree).fit_transform(X)

def preprocessing_pipeline(
    pp: "PreProcessing",
    X: np.ndarray,
    y: np.ndarray,
    nreg: int,
    mode: Literal["train", "test"] = "train",
    delete_nans: bool = True,
    polynomials: int = 2,
    znormalize: bool = True,
    pca_transform: bool = True,
    **kwargs
) -> "tuple[PreProcessing, np.ndarray, np.ndarray]":
    """Pipeline that processes input and target data. The preprocessing
    class stores the data that is necessary to perform later transforms.

    The steps are applied in this order: NaN handling, polynomial expansion,
    z-normalization, PCA. In "train" mode the normalization statistics are
    fitted on the (expanded) inputs, and the PCA is fitted on exactly the data
    it is subsequently applied to, i.e. after the optional normalization.

    Parameters
    ----------
    pp : PreProcessing
        PreProcessing instance that is used to store or retrieve parameters
        to correctly preprocess the data
    X : np.ndarray
        Input data to be preprocessed. Assumed shape: (nsamples, nfeatures)
    y : np.ndarray
        Target data to be preprocessed. Assumed shape: (nsamples, nfeatures)
    nreg : int
        Number of the region. Used to build the path of the 'drop_index.npy'
        file that lists the target columns to remove.
    mode : Literal['train', 'test'], optional
        If data should be fitted or just transformed, by default "train"
    delete_nans : bool
        If True, all-NaN input columns are removed with pp.delete_nans and
        the target columns listed in the region's 'drop_index.npy' file are
        removed. Otherwise nan_to_num is used to treat nan data in both
        arrays. By default True
    polynomials : int
        If 0, no polynomials are added. If > 0 then this is used as the
        degree of the polynomials. By default 2
    znormalize : bool
        If input should be znormalized, by default True
    pca_transform : bool
        If input should be pca-transformed, by default True. The PCA is only
        fitted (in "train" mode) when this is True.
    **kwargs :
        Keyword arguments are passed to pca_fit

    Returns
    -------
    tuple[PreProcessing, np.ndarray, np.ndarray]
        Returns 'PreProcessing' instance with the stored parameters and the
        transformed inputs and targets
    """
    if delete_nans:
        X = pp.delete_nans(X)

        # Workaround for differing nan data in different variables: the
        # target columns to drop are read from a precomputed index file
        # instead of being derived from y itself
        drop_index = np.load(f"../Data/Region{nreg}LR4/HR_6cpu/drop_index.npy")
        y = np.delete(y, drop_index, axis=1)
    else:
        X = np.nan_to_num(X)
        y = np.nan_to_num(y)

    if polynomials:
        X = pp.polynomial_transform(X, degree=polynomials)

    is_training = mode == "train"

    # Normalization statistics are always refreshed in training mode so that
    # pp.mean / pp.std are available afterwards
    if is_training:
        pp.znormalize_fit(X)
    if znormalize:
        X = pp.znormalize(X)

    if pca_transform:
        # Fit after normalization so that the PCA is fitted on the same
        # representation it is applied to; skipped entirely when no PCA
        # output is requested
        if is_training:
            pp.pca_fit(X, **kwargs)
        X = pp.pca_transform(X)

    return pp, X, y


In [4]:
# Train one Ridge model per region on the main low-resolution input plus one
# additional input variable, then collect the absolute test errors.
diffs = []
y_tests = []

# Pipeline options are the same for every region, so they are built once.
# 'whiten' is forwarded to the PCA fit by preprocessing_pipeline.
kwargs = dict(
    delete_nans=True,
    polynomials=2,
    pca_transform=False,
    znormalize=False,
    whiten=False,
)

kwargs_add = dict(
    delete_nans=True,
    polynomials=1,
    pca_transform=False,
    znormalize=False,
    whiten=False,
)

for idx, root_dir in enumerate(root_dirs):
    # Each input gets its own state holder: both pipelines run in "train"
    # mode, so sharing one instance would let the second fit overwrite the
    # statistics stored by the first
    pp = PreProcessing()
    pp_add = PreProcessing()

    X = mm.load_data(res=lr_res, years=train_years, root_dir=root_dir)
    if add_var == "Bat":
        # The mesh values are loaded once per region and reused for the
        # test inputs below; assignment broadcasts them over all samples
        mesh_values = io.load_mesh(root_dir / "LR8_mesh.pickle").values
        X_add = np.empty_like(X)
        X_add[:] = mesh_values
    else:
        X_add = mm.load_data(res=lr_res, years=train_years, root_dir=root_dir, var=add_var)
        X_add = X_add.reshape((X_add.shape[0], -1))
    y = mm.load_data(res=hr_res, years=train_years, root_dir=root_dir)

    # The targets are only taken from the second call; the first call's
    # processed targets are discarded
    pp, X, _ = preprocessing_pipeline(pp, X, y, idx+1, **kwargs)
    pp_add, X_add, y = preprocessing_pipeline(pp_add, X_add, y, idx+1, **kwargs_add)

    reg = Ridge(0.005, fit_intercept=False)
    reg = reg.fit(np.concatenate([X, X_add], axis=1), y)

    X_test = mm.load_data(res=lr_res, years=test_years, root_dir=root_dir)
    if add_var == "Bat":
        X_add_test = np.empty_like(X_test)
        X_add_test[:] = mesh_values
    else:
        # NOTE(review): 2919 presumably matches the number of samples in
        # X_test - confirm, or derive it from X_test.shape[0]
        X_add_test = mm.load_data(var=add_var, res=lr_res, years=test_years,
                                root_dir=root_dir)[:2919]
        X_add_test = X_add_test.reshape((X_add_test.shape[0], -1))
    y_test = mm.load_data(res=hr_res, years=test_years, root_dir=root_dir)

    pp, X_test, _ = preprocessing_pipeline(
            pp, X_test, y_test, idx+1, mode="test", **kwargs
        )
    pp_add, X_add_test, y_test = preprocessing_pipeline(
            pp_add, X_add_test, y_test, idx+1, mode="test", **kwargs_add
        )

    # Element-wise absolute error of the prediction for this region
    diffs.append(np.abs(reg.predict(
        np.concatenate([X_test, X_add_test], axis=1)) - y_test)
    )
    y_tests.append(y_test)

  data[idx] = np.load(fpath).squeeze()
  ret = a @ b
  data[idx] = np.load(fpath).squeeze()
  data[idx] = np.load(fpath).squeeze()
  ret = a @ b
  data[idx] = np.load(fpath).squeeze()
  data[idx] = np.load(fpath).squeeze()
  ret = a @ b
  data[idx] = np.load(fpath).squeeze()


In [4]:
# Use only Hs as an input
diffs, y_tests = [], []
for idx, root_dir in enumerate(root_dirs):
    pp = PreProcessing()

    # Options shared by the training and the test pass of the pipeline;
    # 'whiten' is forwarded to the PCA fit
    kwargs = {
        "delete_nans": True,
        "polynomials": 2,
        "pca_transform": False,
        "znormalize": False,
        "whiten": False,
    }

    # Training data: low-resolution inputs and high-resolution targets
    X = mm.load_data(res=lr_res, years=train_years, root_dir=root_dir)
    y = mm.load_data(res=hr_res, years=train_years, root_dir=root_dir)
    pp, X, y = preprocessing_pipeline(pp, X, y, idx + 1, mode="train", **kwargs)

    # Ridge regression without intercept, fitted on the processed inputs
    reg = Ridge(alpha=0.005, fit_intercept=False).fit(X, y)

    # Test data is transformed with the parameters stored during training
    X_test = mm.load_data(res=lr_res, years=test_years, root_dir=root_dir)
    y_test = mm.load_data(res=hr_res, years=test_years, root_dir=root_dir)
    pp, X_test, y_test = preprocessing_pipeline(
        pp, X_test, y_test, idx + 1, mode="test", **kwargs
    )

    # Element-wise absolute error of the prediction for this region
    abs_error = np.abs(reg.predict(X_test) - y_test)
    diffs.append(abs_error)
    y_tests.append(y_test)

  data[idx] = np.load(fpath).squeeze()
  ret = a @ b
  data[idx] = np.load(fpath).squeeze()
  data[idx] = np.load(fpath).squeeze()
  ret = a @ b
  data[idx] = np.load(fpath).squeeze()
  data[idx] = np.load(fpath).squeeze()
  ret = a @ b
  data[idx] = np.load(fpath).squeeze()


In [5]:
# Save the results
out_path = Path("data") / "diffs" / f"diffs-hs-{add_var}.pkl"
# exist_ok=True makes this a no-op when the directory is already present
out_path.parent.mkdir(parents=True, exist_ok=True)
# The context manager closes the file, so the pickle is fully written to
# disk before a later cell reads it back
with open(out_path, "wb") as out_file:
    pickle.dump(diffs, out_file)

# Load the data for the different inputs and regions

In [9]:
keys = ["only", "Bat", "Tm10", "Dir"]

# Load the differences, compute mae, rmse, and max error and save in dataframe
results = {}
for key in keys:
    out_path = Path("data") / "diffs" / f"diffs-hs-{key}.pkl"
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook
    with open(out_path, "rb") as in_file:
        diffs = pickle.load(in_file)

    # One metrics entry per region; diffs is ordered like 'regions'
    # NOTE(review): the factor 100 presumably rescales the errors to another
    # unit (e.g. cm or percent) - confirm
    regional_results = {}
    for idx, diff in enumerate(diffs):
        regional_results[f"region{regions[idx]}"] = {
            "mae": np.mean(diff)*100,
            "rmse": np.sqrt(np.mean(diff**2))*100,
            "max_error": np.max(diff)*100
        }
    # Store once per input key after all regions have been summarised
    results[key] = regional_results



In [10]:
# Flatten the nested {input: {region: metrics}} mapping into
# {(input, region): metrics} so that the dataframe gets two-level columns
reform = {
    (input_key, region_key): metrics
    for input_key, per_region in results.items()
    for region_key, metrics in per_region.items()
}
df = pd.DataFrame(reform)

In [13]:
# Metrics of region 1 for every input configuration
df.xs(key="region1", axis=1, level=1)

Unnamed: 0,only,Bat,Tm10,Dir
mae,1.316864,1.316864,1.301068,1.185356
rmse,2.052122,2.052123,2.021442,1.877908
max_error,24.802965,24.802965,24.718948,24.475294


In [8]:
# Extract the mae over different regions: rows are the inputs, columns the
# regions (append .plot(marker="s") to visualise)
df.loc["mae", :].unstack(level=1)

Unnamed: 0,region1,region2,region3
Bat,0.013169,0.00412,0.001618
Dir,0.011854,0.00355,0.001569
Tm10,0.013011,0.004063,0.001606
only,0.013169,0.00412,0.001618
