In [2]:
from matplotlib import pyplot as plt

import xarray as xr
import netCDF4 as nc
import numpy as np

import os

import datetime as dt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, Ridge, LinearRegression, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score
import random
from tqdm import tqdm
from collections import Counter

In [3]:
# Names of the climate models whose data files are used throughout the notebook.
models = [
    'CanESM5',
    'MIROC-ES2L',
    'MPI-ESM1-2-LR',
    'MIROC6',
    'CESM2',
]

In [4]:
def load_model_data(model, var, path='../data/'):
    """Read one variable of one model from disk into memory.

    Parameters
    ----------
    model : str
        Model name; used to build the file name ``{model}_{var}.nc``.
    var : str
        Name of the variable to read; also part of the file name.
    path : str, optional
        Directory containing the NetCDF files.

    Returns
    -------
    numpy.ndarray
        The values of ``ds[var]``, fully loaded.
    """
    file_path = os.path.join(path, '{}_{}.nc'.format(model, var))
    # The context manager closes the file even if reading the variable fails.
    with xr.open_dataset(file_path) as ds:
        # `.values` materialises the data, so it stays valid after the file is closed.
        data = ds[var].values
    return data

def get_data_shape_lat_lon(model='CanESM5', var='tas', path='../data/'):
    """Return the array shape of a variable together with the grid coordinates.

    Parameters
    ----------
    model : str, optional
        Model name; used to build the file name ``{model}_{var}.nc``.
    var : str, optional
        Name of the variable whose shape is requested.
    path : str, optional
        Directory containing the NetCDF files.

    Returns
    -------
    tuple
        ``(shape, lat, lon)`` where ``shape`` is the shape tuple of ``ds[var]``
        and ``lat`` / ``lon`` are the ``'lat'`` and ``'lon'`` entries of the
        dataset, loaded into memory.
    """
    file_path = os.path.join(path, '{}_{}.nc'.format(model, var))
    with xr.open_dataset(file_path) as ds:
        # The shape comes from metadata; no need to read the whole variable.
        shape = ds[var].shape
        # Load the coordinates while the file is still open so that the
        # returned objects never depend on a closed dataset.
        lat = ds['lat'].load()
        lon = ds['lon'].load()
    return shape, lat, lon

def load_data_models(models, var='tas', n_sample=10, path='../data/'):
    """Build a randomly sub-sampled (X, y) regression data set from several models.

    For each model the 4-D array returned by ``load_model_data`` is flattened
    to ``(shape[0]*shape[1], shape[2]*shape[3])``. ``n_sample`` distinct rows
    are drawn without replacement (using the global ``random`` state) as
    inputs ``X``. The target ``y`` of a row is the mean over axis 0 of the
    array, taken at the same axis-1 index as that row.
    NOTE(review): the axes are presumably (member, time, lat, lon) - confirm.

    Parameters
    ----------
    models : iterable of str
        Model names to load.
    var : str, optional
        Variable name passed to ``load_model_data``.
    n_sample : int, optional
        Number of rows drawn per model; ``random.sample`` raises ``ValueError``
        if it exceeds ``shape[0]*shape[1]``.
    path : str, optional
        Directory containing the NetCDF files.

    Returns
    -------
    tuple
        ``(X, y)``, each of shape ``(len(models)*n_sample, shape[2]*shape[3])``,
        or ``(None, None)`` when ``models`` is empty.
    """
    X_parts, y_parts = [], []
    for model in models:
        tas_array = load_model_data(model, var=var, path=path)
        shape = tas_array.shape
        n_rows = shape[0] * shape[1]
        n_cols = shape[2] * shape[3]

        idxs = random.sample(range(n_rows), n_sample)
        X_parts.append(tas_array.reshape(n_rows, n_cols)[idxs, :])

        # Flattened row k corresponds to axis-1 index k % shape[1], so the
        # matching mean field can be looked up directly instead of tiling the
        # mean into a full-size copy of the data set.
        mean_fields = tas_array.mean(axis=0).reshape(shape[1], n_cols)
        mean_rows = np.asarray(idxs, dtype=np.intp) % shape[1]
        y_parts.append(mean_fields[mean_rows, :])

        # Release the large array before loading the next model.
        del tas_array

    if not X_parts:
        return None, None
    # Stack once at the end rather than re-copying the accumulated arrays
    # at every iteration.
    return np.vstack(X_parts), np.vstack(y_parts)

In [5]:
# Array shape and lat/lon coordinates of the default reference file (CanESM5, 'tas').
shape, lat, lon = get_data_shape_lat_lon()
# Length of one flattened field: product of the last two stored dimensions.
d = shape[2]*shape[3]

In [6]:
# Number of random leave-one-model-out draws.
B = 25
# Model held out in each draw, sampled with replacement.
# NOTE: the global `random` state is not seeded here, so the draws differ between runs.
models_test = random.choices(models, k=B)
# For every draw, the training set is every model except the held-out one.
models_train = [
    [candidate for candidate in models if candidate != held_out]
    for held_out in models_test
]

In [7]:
# Count how many draws held out each model.
occurence_models_test = Counter()
occurence_models_test.update(models_test)

In [8]:
# Display how many of the B draws held out each model.
occurence_models_test

Counter({'MPI-ESM1-2-LR': 3,
         'MIROC-ES2L': 2,
         'CanESM5': 6,
         'MIROC6': 7,
         'CESM2': 7})

In [11]:
# Rows sampled per model when building a data set.
N = 1000
# Size of the regularisation grid searched by RidgeCV.
n_alpha = 10
# Candidate ridge penalties, log-spaced from 1e2 to 1e7.
alphas = np.logspace(start=2, stop=7, num=n_alpha)

In [None]:
# Single leave-one-model-out experiment using the first draw:
# train on models_train[0], evaluate on the held-out models_test[0].
X_train, Y_train = load_data_models(models=models_train[0], n_sample=N)
X_test, Y_test = load_data_models(models=[models_test[0]], n_sample=N)

# Ridge regression with the penalty chosen by 5-fold cross-validation.
m = RidgeCV(alphas=alphas, cv=5).fit(X_train, Y_train)
m


In [None]:
# Score of the fitted ridge model on the held-out model's samples.
m.score(X=X_test, y=Y_test)

In [9]:
# Number of cross-validation folds RidgeCV uses to pick alpha.
n_cv = 2
# weights[model] accumulates the average [coef | intercept] matrix over all
# draws in which `model` was held out; it stays None if the model was never drawn.
weights = {model: None for model in models}
for m_train, m_test in tqdm(zip(models_train, models_test), total=len(models_test)):
    # Only the training models are needed here; the held-out model is loaded
    # by the evaluation step, so it is not read from disk in this loop.
    X_train, Y_train = load_data_models(m_train, n_sample=N)

    m = RidgeCV(alphas=alphas, cv=n_cv)
    m.fit(X_train, Y_train)

    # Append the intercept as a last column and pre-divide by the number of
    # draws for this held-out model so that the running sum is an average.
    contribution = np.hstack((m.coef_, m.intercept_[:, None])) / occurence_models_test[m_test]
    if weights[m_test] is None:
        weights[m_test] = contribution
    else:
        weights[m_test] += contribution

# Save the averaged weights of every held-out model to a file.
file_path = f'../weights/Ridge_weights_stability_n{N}.pkl'
# Make sure the output directory exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'wb') as f:
    pickle.dump(weights, f)

  4%|██▏                                                     | 1/25 [01:41<40:29, 101.22s/it]


KeyboardInterrupt: 

In [None]:
# Mean R2 per evaluated model, and the R2 map averaged over evaluated models.
scores = []
scores_pattern = np.zeros(d)
file_path = f'../weights/Ridge_weights_stability_n{N}.pkl'
# NOTE: pickle must only be used on trusted files; this one is written by
# the stability loop of this notebook.
with open(file_path, 'rb') as f:
    weights = pickle.load(f)

for model in tqdm(['CanESM5', 'MPI-ESM1-2-LR', 'MIROC6', 'CESM2']):
    model_weights = weights.get(model)
    if model_weights is None:
        # The model was never held out, so no weights were accumulated for it.
        continue
    # NOTE(review): this uses the default n_sample=10 of load_data_models,
    # unlike the fits above which use n_sample=N - confirm this is intended.
    X_test, Y_test = load_data_models([model], var='tas')
    # The last column stores the intercept, the others the coefficients.
    # Local names avoid overwriting the global B (number of draws).
    coef, intercept = model_weights[:, :-1], model_weights[:, -1]
    Y_pred = X_test @ coef.T + intercept
    score_pattern = r2_score(Y_test, Y_pred, multioutput='raw_values')
    scores_pattern += score_pattern
    scores.append(score_pattern.mean())

# Average over the models that were actually evaluated, not over len(models).
if scores:
    scores_pattern /= len(scores)

In [None]:
# Distribution of the per-model mean R2 scores collected in `scores`.
fig_box, ax_box = plt.subplots()
ax_box.boxplot(scores)
ax_box.set_ylabel(r'Mean $R^2$ per evaluated model')
ax_box.set_title('Ridge stability: mean $R^2$ across evaluated models')
plt.show()

In [None]:
# Fold the flat per-column R2 vector back onto the (lat, lon) grid.
scores_pattern_maps = np.reshape(scores_pattern, (len(lat), len(lon)))

In [None]:
import cartopy.crs as ccrs
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from matplotlib.colors import TwoSlopeNorm

# Requires lat, lon and scores_pattern_maps from the cells above.

# Global map on a Robinson projection.
fig = plt.figure(figsize=(10, 6))
ax = plt.axes(projection=ccrs.Robinson())

# Diverging colour scale centred on R2 = 0 and limited to [-1, 1].
norm_r2 = TwoSlopeNorm(vmin=-1.0, vcenter=0, vmax=1.0)
contour_r2 = ax.pcolormesh(lon, lat, scores_pattern_maps, transform=ccrs.PlateCarree(), cmap='coolwarm', norm=norm_r2)
ax.coastlines()

gl = ax.gridlines(draw_labels=True)
gl.top_labels = gl.right_labels = False
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = {'size': 12}  # Longitude font size
gl.ylabel_style = {'size': 12}  # Latitude font size

ax.set_title(r'Explained variance of optimal fingerprint (stability)', fontsize=15)

# The colorbar label is set once, below (no `label=` kwarg that would be overwritten).
cb = plt.colorbar(contour_r2, ax=ax, orientation='horizontal')
cb.ax.tick_params(labelsize=12)
cb.set_label(r'$R^2$ ', fontsize=12)

directory = '../Results'
# Create the output directory if needed so savefig cannot fail on a fresh checkout.
os.makedirs(directory, exist_ok=True)
plt.savefig(os.path.join(directory, "maps_R2_ridge_stability.pdf"), format="pdf", bbox_inches="tight")


plt.show()