In [3]:
import xarray as xr
import pickle

import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

from tqdm import tqdm

import dask
import dask.array as da

In [4]:
models = ['MIROC6', 'CESM2', 'CanESM5', 'MIROC-ES2L', 'MPI-ESM1-2-LR']

In [5]:
model = 'CanESM5'
var = 'tas'

def load_data(model='CanESM5', var='tas', dir_path='../data/'):
    # Construct the filename
    filename = '{}_{}.nc'.format(model, var)
    # Load the NetCDF file using xarray
    data = xr.open_dataset(dir_path + filename)
    return data

def fit_pca_exp_var(X, exp_var=0.99):
    pca = PCA()
    pca.fit(X)
    explained_variance_ratio_cumulative = np.cumsum(pca.explained_variance_ratio_)
    n_components = np.argmax(explained_variance_ratio_cumulative >= exp_var) + 1
    pca_model = PCA(n_components=n_components)
    pca_model.fit(X)

    return pca_model.components_, n_components

## Generate PCA for each memeber of each model

In [4]:
pca_weights = {}
for model in models:
    data = load_data(model)
    shape = data['tas'].shape
    data = data['tas'].values.reshape(shape[0], shape[1], shape[2]*shape[3])
    pca_weights[model] = None
    for data_member in tqdm(data) :
        if pca_weights[model] is None:
            pca_weights[model], _ = fit_pca_exp_var(data_member)
        else :
            pca_weights[model] = np.vstack((pca_weights[model], fit_pca_exp_var(data_member)[0]))
        

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [09:37<00:00, 14.45s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [11:54<00:00, 14.29s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [06:02<00:00, 14.51s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [06:55<00:00, 13.86s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [09:21<00:00, 18.71s/it]


In [7]:
# Filepath to save the pickle file
pickle_filepath = '../data/weights_pca_byModels.pkl'

# Save the dictionary to a pickle file
with open(pickle_filepath, 'wb') as f:
    pickle.dump(pca_weights, f)

## Loading all PCs

In [3]:
# Filepath to save the pickle file
pickle_filepath = '../data/weights_pca_byModels.pkl'

# Load the dictionary from the pickle file
with open(pickle_filepath, 'rb') as f:
    pca_weights = pickle.load(f)


In [4]:
pca_weights['CanESM5'].shape

(12227, 10368)

In [6]:
pca_tot = None
for model in models:
    if pca_tot is None:
        pca_tot = pca_weights[model]
    else:
        pca_tot = np.vstack((pca_tot, pca_weights[model]))

In [7]:
# Filepath to save the pickle file
pickle_filepath = '../data/weights_pca_tot.pkl'

# Save the dictionary to a pickle file
with open(pickle_filepath, 'wb') as f:
    pickle.dump(pca_tot, f)

## Averaging all PCs

In [4]:
# Filepath to save the pickle file
pickle_filepath = '../data/weights_pca_tot.pkl'

# Load the dictionary from the pickle file
with open(pickle_filepath, 'rb') as f:
    pca_weights_tot = pickle.load(f)


In [5]:
pca_weights_tot.shape

(91009, 10368)

In [10]:
weights_da = da.from_array(pca_weights_tot, chunks=(500, 500)).persist()
u, s, v = da.linalg.svd_compressed(weights_da, k=ncp)
u_svd = u.compute()

In [13]:
# Filepath to save the pickle file
pickle_filepath = '../data/u_svd_{}.pkl'.format(ncp)

# Save the dictionary to a pickle file
with open(pickle_filepath, 'wb') as f:
    pickle.dump(u_svd, f)

# Checking reconstruction error for each model

In [6]:
ncp = 1700
# Filepath to save the pickle file
pickle_filepath = '../data/u_svd_{}.pkl'.format(ncp)

# Load the dictionary from the pickle file
with open(pickle_filepath, 'rb') as f:
    U_svd = pickle.load(f)


In [7]:
U_svd.shape

(10368, 1700)

In [8]:
R2_score = {}
for model in tqdm(models):
    data = load_data(model)
    shape = data['tas'].values.shape
    data = data['tas'].values.reshape(shape[0] * shape[1], shape[2]*shape[3])
    score = r2_score(data, data @ U_svd @ U_svd.T)
    R2_score[model] = score
    print('{} : {:.4f}'.format(model, score))

 20%|███████████▌                                              | 1/5 [01:25<05:42, 85.68s/it]

MIROC6 : 0.8693


 40%|██████████████████████▊                                  | 2/5 [03:16<05:01, 100.64s/it]

CESM2 : 0.8841


 60%|██████████████████████████████████▊                       | 3/5 [04:12<02:40, 80.05s/it]

CanESM5 : 0.8647


 80%|██████████████████████████████████████████████▍           | 4/5 [05:20<01:15, 75.21s/it]

MIROC-ES2L : 0.8934


100%|██████████████████████████████████████████████████████████| 5/5 [07:02<00:00, 84.58s/it]

MPI-ESM1-2-LR : 0.8811





In [None]:
R2_score

In [8]:
pred = data @ U_svd @ U_svd.T

In [9]:
norm_squared_error = np.square(data - pred)/data.std(axis=0)

In [10]:
norm_squared_error.mean()

0.124717146

R scores with 500 components:
* 'MIROC6': 0.15463808841121343
* 'CESM2': 0.25730109286952235
* 'CanESM5': 0.16130914040262692
* 'MIROC-ES2L': 0.24384597125343987
* 'MPI-ESM1-2-LR': 0.2719510349765452

R scores with 1000 components:
* 'MIROC6': 0.6283942208620207
* 'CESM2': 0.6675412160174118
* 'CanESM5': 0.6126835256801059
* 'MIROC-ES2L': 0.6761468489553218
* 'MPI-ESM1-2-LR': 0.670252727064077

R scores with 1500 components:
* 'MIROC6': 0.8291237074387285,
* 'CESM2': 0.8465464364503251,
* 'CanESM5': 0.8220316854023593,
* 'MIROC-ES2L': 0.858213227174884,
* 'MPI-ESM1-2-LR': 0.8450816809323174