In [1]:
import sys
sys.path.append('/Users/kenzatazi/Documents/CDT/Code')

from load import era5, data_dir, value
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import GPy
import scipy as sp
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from mfdgp.mfdgp.utils.metrics import msll

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
import emukit
from emukit.model_wrappers.gpy_model_wrappers import GPyMultiOutputWrapper
from emukit.multi_fidelity.models import GPyLinearMultiFidelityModel
from emukit.multi_fidelity.convert_lists_to_array import convert_x_list_to_array, convert_xy_lists_to_arrays

In [4]:
# Load data
# Study period (years) shared by every download call in this notebook.
minyear = 2000
maxyear = 2005

gauge_df = value.all_gauge_data(minyear, maxyear, monthly=True)
station_names = gauge_df.drop_duplicates('name')['name']

# Get CV scheme
# The saved locations are flattened to one row per location; each row is read
# below as (longitude, latitude).
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
cv_locs = np.load('/Users/kenzatazi/Documents/CDT/Code/mfdgp/mfdgp/experiments/exp1/exp1_cv_locs.npy')
cv_locs = cv_locs.reshape(-1, 2)

# Map every CV location to the name of the gauge station at that location.
station_list = []
for lon, lat in cv_locs:
    # Both coordinates must match: matching on latitude OR longitude alone can
    # select a different station that merely shares one coordinate.
    # A small absolute tolerance guards against float round-off in the saved array.
    at_location = (
        np.isclose(gauge_df['latitude'], lat, rtol=0, atol=1e-4)
        & np.isclose(gauge_df['longitude'], lon, rtol=0, atol=1e-4)
    )
    if not at_location.any():
        raise ValueError(
            f'No gauge station found at latitude={lat}, longitude={lon}')
    station_list.append(gauge_df.loc[at_location, 'name'].iloc[0])
station_arr = np.array(station_list)

# Split indexes
# KFold is used without shuffling, so the folds follow the order of cv_locs
# and are identical on every run.
kf = KFold(n_splits=5)

cv_train_list = []
cv_test_list = []

for train_index, test_index in kf.split(station_arr):
    cv_train_list.append(station_arr[train_index])
    cv_test_list.append(station_arr[test_index])

In [None]:
# MFDGP
# Cross-validated evaluation of the linear multi-fidelity GP. For every fold the
# model is trained on ERA5 (low fidelity) and gauge (high fidelity) data and
# scored on the held-out gauge stations at both fidelity levels.

# Per-fold metrics of the high-fidelity prediction.
R2_all = []
RMSE_all = []
RMSE_p5 = []
RMSE_p95 = []
MSLL = []

# Per-fold metrics of the low-fidelity prediction.
R2_all_low = []
RMSE_all_low = []
RMSE_p5_low = []
RMSE_p95_low = []
MSLL_low = []


for i in range(len(cv_train_list)):

    # Gauge records of the training stations (rows with missing values dropped).
    hf_train_list = []
    for station in cv_train_list[i]:
        station_ds = value.gauge_download(
            station, minyear=minyear, maxyear=maxyear)
        hf_train_list.append(station_ds.dropna().reset_index())
    hf_train_df = pd.concat(hf_train_list)

    # Gauge records of the held-out stations.
    val_list = []
    for station in cv_test_list[i]:
        station_ds = value.gauge_download(
            station, minyear=minyear, maxyear=maxyear)
        val_list.append(station_ds.dropna().reset_index())
    val_df = pd.concat(val_list)

    # ERA5 is requested for the test AND the training stations of this fold.
    era5_df = era5.value_gauge_download(
        list(cv_test_list[i]) + list(cv_train_list[i]), minyear=minyear, maxyear=maxyear)

    lf_train_df = era5_df.reset_index()

    # Prepare data

    # Transformations
    # The Box-Cox lambda is fitted on the ERA5 data and reused for the gauge data
    # so that all targets live in the same transformed space. The 0.01 offset
    # keeps the input strictly positive, as Box-Cox requires.
    lf_train_df['tp_tr'], lf_lambda = sp.stats.boxcox(
        lf_train_df['tp'].values + 0.01)
    hf_train_df['tp_tr'] = sp.stats.boxcox(
        hf_train_df['tp'].values + 0.01, lmbda=lf_lambda)
    val_df['tp_tr'] = sp.stats.boxcox(
        val_df['tp'].values + 0.01, lmbda=lf_lambda)

    # Splitting
    x_train_lf = lf_train_df[['time', 'lat', 'lon', 'z']].values.reshape(-1, 4)
    y_train_lf = lf_train_df['tp_tr'].values.reshape(-1, 1)
    x_train_hf = hf_train_df[['time', 'latitude', 'longitude', 'altitude']].values.reshape(-1, 4)
    y_train_hf = hf_train_df[['tp_tr']].values.reshape(-1, 1)
    x_val = val_df[['time', 'latitude', 'longitude', 'altitude']].values.reshape(-1, 4)
    y_val = val_df['tp_tr'].values.reshape(-1, 1)

    # Scaling
    # The scaler is fitted on the high-fidelity training inputs only and then
    # applied to every input set.
    scaler = StandardScaler().fit(x_train_hf)
    x_train_hf1 = scaler.transform(x_train_hf)
    x_train_lf1 = scaler.transform(x_train_lf)
    x_val1 = scaler.transform(x_val)

    # Input data
    # Low fidelity is listed first, so it receives fidelity index 0.
    X_train, Y_train = convert_xy_lists_to_arrays(
        [x_train_lf1, x_train_hf1], [y_train_lf, y_train_hf])

    # Train and evaluate
    kern1 = GPy.kern.RBF(input_dim=4, ARD=True)
    kernels = [kern1, GPy.kern.RBF(input_dim=4, ARD=True)]
    lin_mf_kernel = emukit.multi_fidelity.kernels.LinearMultiFidelityKernel(kernels)
    gpy_lin_mf_model = GPyLinearMultiFidelityModel(X_train, Y_train, lin_mf_kernel, n_fidelities=2,)
    # The observation noise of both fidelities is fixed at zero.
    gpy_lin_mf_model.mixed_noise.Gaussian_noise.fix(0)
    gpy_lin_mf_model.mixed_noise.Gaussian_noise_1.fix(0)
    lin_mf_model = GPyMultiOutputWrapper(gpy_lin_mf_model, 2, n_optimization_restarts=5)
    lin_mf_model.optimize()

    # ALL
    # The validation inputs are stacked twice: the first n rows are queried at
    # the low fidelity, the remaining n rows at the high fidelity.
    n = x_val.shape[0]
    x_met = convert_x_list_to_array([x_val1, x_val1])
    y_pred0, y_var0 = lin_mf_model.predict(x_met[n:])
    y_pred_low0, y_var_low0 = lin_mf_model.predict(x_met[:n])

    # ALL
    # R2 and RMSE are computed after inverting the Box-Cox transform.
    # RMSE is np.sqrt(MSE): the `squared=False` argument of mean_squared_error
    # is deprecated and no longer accepted by recent scikit-learn releases.
    y_pred = sp.special.inv_boxcox(y_pred0, lf_lambda).reshape(-1)
    y_true = sp.special.inv_boxcox(y_val, lf_lambda).reshape(-1)
    R2_all.append(r2_score(y_true, y_pred))
    RMSE_all.append(np.sqrt(mean_squared_error(y_true, y_pred)))

    y_pred_low = sp.special.inv_boxcox(y_pred_low0, lf_lambda).reshape(-1)
    R2_all_low.append(r2_score(y_true, y_pred_low))
    RMSE_all_low.append(np.sqrt(mean_squared_error(y_true, y_pred_low)))

    # 5th PERCENTILE
    # Error restricted to the observations at or below the 5th percentile.
    p5 = np.percentile(y_true, 5.0)
    indx = y_true <= p5
    y_true_p5 = y_true[indx]
    y_pred_p5 = y_pred[indx]
    y_pred_p5_low = y_pred_low[indx]
    RMSE_p5.append(np.sqrt(mean_squared_error(y_true_p5, y_pred_p5)))
    RMSE_p5_low.append(np.sqrt(mean_squared_error(y_true_p5, y_pred_p5_low)))

    # 95th PERCENTILE
    # Error restricted to the observations at or above the 95th percentile.
    p95 = np.percentile(y_true, 95.0)
    indx = y_true >= p95
    y_true_p95 = y_true[indx]
    y_pred_p95 = y_pred[indx]
    y_pred_p95_low = y_pred_low[indx]
    RMSE_p95.append(np.sqrt(mean_squared_error(y_true_p95, y_pred_p95)))
    RMSE_p95_low.append(np.sqrt(mean_squared_error(y_true_p95, y_pred_p95_low)))

    # MSLL
    # Computed on the Box-Cox transformed values, i.e. the space the model
    # predicts in.
    ll = msll(y_val, y_pred0, y_var0)
    ll_low = msll(y_val, y_pred_low0, y_var_low0)
    MSLL.append(ll)
    MSLL_low.append(ll_low)

# Mean and standard deviation over the folds; the header lines distinguish the
# two otherwise identically labelled groups.
print('MFDGP (high fidelity)')
print('Mean RMSE = ', np.mean(RMSE_all), '±', np.std(RMSE_all))
print('Mean R2 = ', np.mean(R2_all), '±', np.std(R2_all))
print('5th RMSE = ', np.mean(RMSE_p5), '±', np.std(RMSE_p5))
print('95th RMSE = ', np.mean(RMSE_p95), '±', np.std(RMSE_p95))
print('MSLL= ', np.mean(MSLL), '±', np.std(MSLL))

print('MFDGP (low fidelity)')
print('Mean RMSE = ', np.mean(RMSE_all_low), '±', np.std(RMSE_all_low))
print('Mean R2 = ', np.mean(R2_all_low), '±', np.std(R2_all_low))
print('5th RMSE = ', np.mean(RMSE_p5_low), '±', np.std(RMSE_p5_low))
print('95th RMSE = ', np.mean(RMSE_p95_low), '±', np.std(RMSE_p95_low))
print('MSLL= ', np.mean(MSLL_low), '±', np.std(MSLL_low))


value
/Users/kenzatazi/Documents/CDT/Code/data/ERA5/combi_data_value_02-2023.csv




Optimization restart 1/5, f = 4338.868259675905
Optimization restart 2/5, f = 3938.1830893083866
Optimization restart 3/5, f = 16680.705132164818
Optimization restart 4/5, f = 3938.1830898061535




Optimization restart 5/5, f = 185330.74979393266
value
/Users/kenzatazi/Documents/CDT/Code/data/ERA5/combi_data_value_02-2023.csv




Optimization restart 1/5, f = 757545987.1626427
Optimization restart 2/5, f = 3907.330487772175
Optimization restart 3/5, f = 3840.932978295172




Optimization restart 4/5, f = 7204.123231861449


In [None]:
# Save the per-fold R2 scores currently held in R2_all_low / R2_all.
# The file names are derived from minyear/maxyear so they cannot drift from the
# period actually evaluated (for 2000-2005 the names are unchanged).
np.savetxt(f'table1_ypred_lf_r2_{minyear}-{maxyear}.csv', R2_all_low)
np.savetxt(f'table1_ypred_hf_r2_{minyear}-{maxyear}.csv', R2_all)

In [None]:
# GP and linear regression
# Cross-validated baselines on the same folds as the MFDGP cell: a GP trained on
# the gauge (high-fidelity) data and a linear regression trained on the ERA5
# (low-fidelity) data, both scored on the held-out gauge stations.

# Per-fold metrics of the linear regression (printed under 'Lin Reg').
R2_all = []
RMSE_all = []
RMSE_p5 = []
RMSE_p95 = []
MSLL = []

# Per-fold metrics of the GP (printed under 'GP'); the `_low` suffix is kept
# only for symmetry with the MFDGP cell.
R2_all_low = []
RMSE_all_low = []
RMSE_p5_low = []
RMSE_p95_low = []
MSLL_low = []


for i in range(len(cv_train_list)):

    # Gauge records of the training stations (rows with missing values dropped).
    hf_train_list = []
    for station in cv_train_list[i]:
        station_ds = value.gauge_download(
            station, minyear=minyear, maxyear=maxyear)
        hf_train_list.append(station_ds.dropna().reset_index())
    hf_train_df = pd.concat(hf_train_list)

    # Gauge records of the held-out stations.
    val_list = []
    for station in cv_test_list[i]:
        station_ds = value.gauge_download(
            station, minyear=minyear, maxyear=maxyear)
        val_list.append(station_ds.dropna().reset_index())
    val_df = pd.concat(val_list)

    # ERA5 is requested for the test AND the training stations of this fold.
    era5_df = era5.value_gauge_download(
        list(cv_test_list[i]) + list(cv_train_list[i]), minyear=minyear, maxyear=maxyear)

    lf_train_df = era5_df.reset_index()

    # Prepare data

    # Transformations
    # The Box-Cox lambda is fitted on the ERA5 data and reused for the gauge
    # data; the 0.01 offset keeps the input strictly positive.
    lf_train_df['tp_tr'], lf_lambda = sp.stats.boxcox(
        lf_train_df['tp'].values + 0.01)
    hf_train_df['tp_tr'] = sp.stats.boxcox(
        hf_train_df['tp'].values + 0.01, lmbda=lf_lambda)
    val_df['tp_tr'] = sp.stats.boxcox(
        val_df['tp'].values + 0.01, lmbda=lf_lambda)

    # Splitting
    x_train_lf = lf_train_df[['time', 'lat', 'lon', 'z']].values.reshape(-1, 4)
    y_train_lf = lf_train_df['tp_tr'].values.reshape(-1, 1)
    x_train_hf = hf_train_df[['time', 'latitude', 'longitude', 'altitude']].values.reshape(-1, 4)
    y_train_hf = hf_train_df[['tp_tr']].values.reshape(-1, 1)
    x_val = val_df[['time', 'latitude', 'longitude', 'altitude']].values.reshape(-1, 4)
    y_val = val_df['tp_tr'].values.reshape(-1, 1)

    # Scaling
    # The scaler is fitted on the high-fidelity training inputs only and then
    # applied to every input set.
    scaler = StandardScaler().fit(x_train_hf)
    x_train_hf1 = scaler.transform(x_train_hf)
    x_train_lf1 = scaler.transform(x_train_lf)
    x_val1 = scaler.transform(x_val)

    # GP baseline
    # Kernel: (periodic * RBF) on the time column plus an ARD RBF on the three
    # remaining columns. Trained on the scaled high-fidelity data (the original
    # referenced the undefined names x_train_h1 / y_train_h).
    kernel = GPy.kern.StdPeriodic(1, active_dims=[0], period=1) * GPy.kern.RBF(1, active_dims=[0]) + GPy.kern.RBF(3, active_dims=[1,2,3], ARD=True)
    m = GPy.models.GPRegression(x_train_hf1, y_train_hf, kernel)
    m.optimize_restarts(num_restarts = 5)

    # Linear regression baseline
    # Fitted on the SCALED low-fidelity inputs so that training and prediction
    # (which uses the scaled x_val1) share the same input space.
    linear_m = LinearRegression()
    linear_m.fit(x_train_lf1, y_train_lf)

    # ALL
    y_pred0 = linear_m.predict(x_val1)
    # LinearRegression returns no predictive variance, so a constant variance
    # equal to the variance of the training residuals is used for the MSLL.
    # NOTE(review): modelling choice — confirm this is the intended variance
    # for the linear-regression MSLL. Previously y_var0 was undefined in this
    # cell and silently reused the value left over from the MFDGP cell.
    lin_residuals = y_train_lf - linear_m.predict(x_train_lf1)
    y_var0 = np.full_like(y_pred0, np.var(lin_residuals))
    y_pred_low0, y_var_low0 = m.predict(x_val1)

    # ALL
    # R2 and RMSE are computed after inverting the Box-Cox transform.
    # RMSE is np.sqrt(MSE): the `squared=False` argument of mean_squared_error
    # is deprecated and no longer accepted by recent scikit-learn releases.
    y_pred = sp.special.inv_boxcox(y_pred0, lf_lambda).reshape(-1)
    y_true = sp.special.inv_boxcox(y_val, lf_lambda).reshape(-1)
    R2_all.append(r2_score(y_true, y_pred))
    RMSE_all.append(np.sqrt(mean_squared_error(y_true, y_pred)))

    y_pred_low = sp.special.inv_boxcox(y_pred_low0, lf_lambda).reshape(-1)
    R2_all_low.append(r2_score(y_true, y_pred_low))
    RMSE_all_low.append(np.sqrt(mean_squared_error(y_true, y_pred_low)))

    # 5th PERCENTILE
    # Error restricted to the observations at or below the 5th percentile.
    p5 = np.percentile(y_true, 5.0)
    indx = y_true <= p5
    y_true_p5 = y_true[indx]
    y_pred_p5 = y_pred[indx]
    y_pred_p5_low = y_pred_low[indx]
    RMSE_p5.append(np.sqrt(mean_squared_error(y_true_p5, y_pred_p5)))
    RMSE_p5_low.append(np.sqrt(mean_squared_error(y_true_p5, y_pred_p5_low)))

    # 95th PERCENTILE
    # Error restricted to the observations at or above the 95th percentile.
    p95 = np.percentile(y_true, 95.0)
    indx = y_true >= p95
    y_true_p95 = y_true[indx]
    y_pred_p95 = y_pred[indx]
    y_pred_p95_low = y_pred_low[indx]
    RMSE_p95.append(np.sqrt(mean_squared_error(y_true_p95, y_pred_p95)))
    RMSE_p95_low.append(np.sqrt(mean_squared_error(y_true_p95, y_pred_p95_low)))

    # MSLL
    # Computed on the Box-Cox transformed values, i.e. the space both models
    # predict in.
    ll = msll(y_val, y_pred0, y_var0)
    ll_low = msll(y_val, y_pred_low0, y_var_low0)
    MSLL.append(ll)
    MSLL_low.append(ll_low)

# Mean and standard deviation over the folds.
print('Lin Reg')
print('Mean RMSE = ', np.mean(RMSE_all), '±', np.std(RMSE_all))
print('Mean R2 = ', np.mean(R2_all), '±', np.std(R2_all))
print('5th RMSE = ', np.mean(RMSE_p5), '±', np.std(RMSE_p5))
print('95th RMSE = ', np.mean(RMSE_p95), '±', np.std(RMSE_p95))
print('MSLL= ', np.mean(MSLL), '±', np.std(MSLL))

print('GP')
print('Mean RMSE = ', np.mean(RMSE_all_low), '±', np.std(RMSE_all_low))
print('Mean R2 = ', np.mean(R2_all_low), '±', np.std(R2_all_low))
print('5th RMSE = ', np.mean(RMSE_p5_low), '±', np.std(RMSE_p5_low))
print('95th RMSE = ', np.mean(RMSE_p95_low), '±', np.std(RMSE_p95_low))
print('MSLL= ', np.mean(MSLL_low), '±', np.std(MSLL_low))


In [None]:
ical