In [1]:
from matplotlib import pyplot as plt

import xarray as xr
import netCDF4 as nc
import numpy as np

import os

import datetime as dt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, Ridge, LinearRegression, Lasso, ElasticNet
from sklearn.metrics import r2_score
import random
from tqdm import tqdm
from collections import Counter

In [2]:
# Climate models used in the study; each name is the prefix of a
# '<model>_<var>.nc' file read by load_model_data.
models = [
    'CanESM5',
    'MIROC-ES2L',
    'MPI-ESM1-2-LR',
    'MIROC6',
    'CESM2',
]

In [3]:
def load_model_data(model, var, path='../data/'):
    """Read one variable of one climate model from its NetCDF file.

    The file is looked up at ``<path>/<model>_<var>.nc``.

    Parameters
    ----------
    model : str
        Model name, used as the file-name prefix.
    var : str
        Variable to extract; also used as the file-name suffix.
    path : str, optional
        Directory holding the ``.nc`` files.

    Returns
    -------
    The ``.values`` of ``ds[var]`` (a NumPy array for an xarray dataset).

    Raises
    ------
    FileNotFoundError
        If the file does not exist (raised by ``xr.open_dataset``).
    KeyError
        If the dataset has no variable named ``var``.
    """
    file_path = os.path.join(path, '{}_{}.nc'.format(model, var))
    # The context manager closes the dataset even when the variable lookup or
    # the read fails; a trailing ds.close() would be skipped in that case.
    with xr.open_dataset(file_path) as ds:
        # .values is read while the file is still open, so the returned array
        # remains usable after the dataset has been closed.
        return ds[var].values

def load_data_models(models, var='tas', n_sample=10, path='../data/'):
    """Draw random (input, target) rows from several climate models.

    For each model the array returned by ``load_model_data`` is flattened so
    that its first two axes index the rows and the remaining axes the columns.
    ``n_sample`` distinct rows are drawn with ``random.sample``; the target of
    a row is the mean over axis 0 taken at the same position along axis 1.

    Parameters
    ----------
    models : iterable of str
        Model names passed to ``load_model_data``.
    var : str, optional
        Variable name passed to ``load_model_data``.
    n_sample : int, optional
        Number of rows drawn per model (without replacement).
    path : str, optional
        Data directory passed to ``load_model_data``.

    Returns
    -------
    (X, y) : tuple
        Two arrays of shape ``(len(models) * n_sample, n_features)`` with the
        models stacked in input order, or ``(None, None)`` when ``models`` is
        empty.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``n_sample`` exceeds the number of rows.

    Notes
    -----
    Presumably the arrays are laid out as (member, time, lat, lon) -- TODO
    confirm against the data files.
    """
    X_parts, y_parts = [], []
    for model in models:
        tas_array = load_model_data(model, var=var, path=path)
        n_first, n_second = tas_array.shape[0], tas_array.shape[1]
        n_rows = n_first * n_second
        n_features = int(np.prod(tas_array.shape[2:]))

        # Explicit integer dtype so that n_sample == 0 still yields a valid index array.
        idxs = np.asarray(random.sample(range(n_rows), n_sample), dtype=np.intp)

        X_parts.append(tas_array.reshape(n_rows, n_features)[idxs])

        # Flat row r corresponds to position r % n_second along axis 1, so the
        # axis-0 mean can be indexed directly instead of being tiled into a
        # full-size copy first.
        mean_over_first = tas_array.mean(axis=0).reshape(n_second, n_features)
        y_parts.append(mean_over_first[idxs % n_second])

        # Release the full array before loading the next model.
        del tas_array, mean_over_first

    if not X_parts:
        return None, None
    # Stack once at the end rather than re-copying a growing array per model.
    return np.vstack(X_parts), np.vstack(y_parts)

In [None]:
# Leave-one-model-out bootstrap: draw B held-out test models (with
# replacement) and pair each draw with all remaining models as training set.
B = 20
# A dedicated, seeded generator makes the split identical after every kernel
# restart; the global `random` state used for row sampling is left untouched.
models_test = random.Random(0).choices(models, k=B)
models_train = [
    [model for model in models if model != model_test]
    for model_test in models_test
]

In [None]:
# Number of times each model was drawn as the held-out test model.
occurence_models_test = Counter()
occurence_models_test.update(models_test)

In [None]:
# Show how many times each model was drawn as the held-out test model.
occurence_models_test

In [5]:
# Sampling size and ridge-penalty grid.
N = 2500      # rows drawn per model via load_data_models(n_sample=N)
n_alpha = 10  # number of penalties on the grid
alphas = np.logspace(start=2, stop=6, num=n_alpha)  # log-spaced from 1e2 to 1e6

In [None]:
# Create the output directory up front so that a missing folder does not
# abort the run only after all fits for the first alpha have completed.
os.makedirs('../weights', exist_ok=True)

for alpha in tqdm(alphas):
    # weights[m] accumulates the mean [coef | intercept] matrix over every
    # draw in which model m was held out; it stays None if m was never drawn.
    weights = {model: None for model in models}
    for m_train, m_test in zip(models_train, models_test):
        print(m_test)
        X_train, Y_train = load_data_models(m_train, n_sample=N)
        # NOTE(review): the held-out sample is not used for fitting here; it is
        # kept because a later cell reads X_test.shape.
        X_test, Y_test = load_data_models([m_test], n_sample=N)
        ridge = Ridge(alpha)
        ridge.fit(X_train, Y_train)

        # Dividing every fit by the number of draws of m_test turns the
        # running sum into a mean; the intercept becomes the last column.
        n_draws = occurence_models_test[m_test]
        contribution = np.hstack((ridge.coef_ / n_draws, (ridge.intercept_ / n_draws)[:, None]))
        if weights[m_test] is None:
            weights[m_test] = contribution
        else:
            weights[m_test] += contribution

    # Save the weights for this alpha to a file
    file_path = f'../weights/Ridge_weights_alpha_{alpha}_n{N}.pkl'
    with open(file_path, 'wb') as f:
        pickle.dump(weights, f)

In [None]:
# NOTE(review): N selects which weight files are read (..._n{N}.pkl) and
# overwrites the N = 2500 set earlier in the notebook; confirm that weights
# were actually saved with this N.
N=500
scores = {}          # alpha -> list of mean R^2, one entry per scored model
scores_pattern = {}  # alpha -> per-output R^2 averaged over len(models)
for alpha in tqdm(alphas[3:]):
    scores[alpha] = []
    # Starts as a scalar zero so this cell does not depend on an X_test left
    # over from another cell; it becomes an array once a model is scored.
    scores_pattern[alpha] = 0.0
    file_path = f'../weights/Ridge_weights_alpha_{alpha}_n{N}.pkl'
    weights = None  # loaded at most once per alpha
    for model in models:
        try:
            if weights is None:
                # SECURITY: pickle.load can execute arbitrary code; only read
                # weight files written by this notebook.
                with open(file_path, 'rb') as f:
                    weights = pickle.load(f)
            if weights.get(model) is None:
                # The model was never drawn as held-out model during training.
                print(f"No weights for alpha={alpha} and model={model}; skipping.")
                continue
            # Uses load_data_models' default n_sample (10 rows).
            X_test, Y_test = load_data_models([model], var='tas')
            # The last column holds the intercept, the others the coefficients.
            # Named coef/intercept so the bootstrap count B is not overwritten.
            coef, intercept = weights[model][:,:-1], weights[model][:,-1]
            Y_pred = X_test @ coef.T + intercept
            score_pattern = r2_score(Y_test, Y_pred, multioutput='raw_values')
            scores_pattern[alpha] = scores_pattern[alpha] + score_pattern/len(models)
            scores[alpha].append(score_pattern.mean())
        except FileNotFoundError:
            print(f"File for alpha={alpha} and model={model} not found.")


  0%|                                                               | 0/7 [00:00<?, ?it/s]

In [None]:
# Box plot of the per-model mean R^2 for every alpha that was scored.
# Only alphas with at least one score are drawn: the scoring loop covers just
# part of the grid, so indexing `scores` with every alpha raises KeyError.
scored_alphas = [alpha for alpha in alphas if len(scores.get(alpha, ())) > 0]
data = [scores[alpha] for alpha in scored_alphas]

if data:
    # Boxplot
    plt.boxplot(data)

    # Set xticklabels with alpha values
    plt.xticks(range(1, len(scored_alphas) + 1), [f'{alpha:.0f}' for alpha in scored_alphas], rotation=45)

    # Set xlabel with alphas
    plt.xlabel(r'$\alpha$')

    # Set ylabel with r2 score
    plt.ylabel(r'$R^2$ Score')

    plt.show()
else:
    print('No R^2 scores available to plot.')


In [None]:
# Mean R^2 per alpha, kept in the same order as `alphas`. An alpha that was
# never scored (or has an empty score list) gets NaN instead of raising KeyError.
mean_scores = {
    alpha: (np.mean(scores[alpha]) if len(scores.get(alpha, ())) > 0 else np.nan)
    for alpha in alphas
}

In [None]:
# Alpha with the highest mean R^2. NaN means (alphas without scores) are
# ignored, and the winner is looked up among the keys of mean_scores so the
# result stays correct even if mean_scores covers only part of `alphas`.
# Raises ValueError if every mean is NaN.
alpha_opt = list(mean_scores)[int(np.nanargmax(list(mean_scores.values())))]