# Make Dataset
7th August 2023

Make dataset from csv run ouputs.

The dataset already includes:
* time
* latitude
* longitude
* elevation

Outputs need to be transformed using Box-Cox lambdas to include:
* Transformed mean posterior predictions for the high fidelity level
* Upper bound of the transformed posterior distribution for the high fidelity level
* Lower bound of the transformed posterior distribution for the high fidelity level

The data also needs to be converted to a netcdf file and include the appropriate metadata.

In [1]:
import pandas as pd
import scipy as sp
import numpy as np
import glob
import xarray as xr

## For 1 year

In [2]:
df = pd.read_csv('outputs/priors_lat_lon_mat52/preds_1980_1981.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,time,lon,lat,elevation,pred0,pred_low0,y_var0,y_var_low0
0,1980.041667,81.90625,30.28125,5383.0,0.314634,0.289611,2.243733,0.606565
1,1980.041667,81.96875,30.28125,5071.0,0.254223,0.234005,2.151837,0.528705
2,1980.041667,81.46875,30.34375,5446.0,0.133395,0.122786,2.498563,0.822474
3,1980.041667,81.53125,30.34375,5237.0,0.019359,0.017819,2.273537,0.631817
4,1980.041667,81.71875,30.34375,4978.0,-0.064907,-0.059745,2.139734,0.51845


In [34]:
scaling_df = pd.read_csv('dataset_w_metadata/lambdas_1980_2010.csv', index_col=0)

In [35]:
scaling_df.head()

Unnamed: 0,year,lambdas
0,1980,0.058104
1,1981,0.120056
2,1982,0.12986
3,1983,0.135035
4,1984,0.111865


In [39]:
lambdas = scaling_df[['lambdas']]

In [40]:
lambdas

Unnamed: 0,lambdas
0,0.058104
1,0.120056
2,0.12986
3,0.135035
4,0.111865
5,0.086285
6,0.113906
7,0.182353
8,0.095554
9,0.059386


In [41]:
df['pred_tr'] = sp.special.inv_boxcox(df['pred0'], lambdas.iloc[0].values)

In [42]:
lambdas.iloc[0].values

array([0.05810437])

In [43]:
df.head()

Unnamed: 0,time,lon,lat,elevation,pred0,pred_low0,y_var0,y_var_low0,pred_tr,pred_tr_CI_upper,pred_tr_CI_lower
0,2009.041667,81.90625,30.28125,5383.0,-0.409132,-0.39683,2.024798,0.615223,0.660952,9.29118,0.029068
1,2009.041667,81.96875,30.28125,5071.0,-0.518415,-0.502826,1.959358,0.553659,0.590737,8.105959,0.026871
2,2009.041667,81.46875,30.34375,5446.0,-0.419634,-0.407015,2.217617,0.79662,0.653878,10.314323,0.024446
3,2009.041667,81.53125,30.34375,5237.0,-0.564382,-0.547411,2.052127,0.640933,0.563356,8.237822,0.023443
4,2009.041667,81.71875,30.34375,4978.0,-0.634633,-0.615549,1.953001,0.547679,0.523808,7.282034,0.023397


In [9]:
pred_CI_upper = df['pred0'] + 1.96 * np.sqrt(df['y_var0'])
df['pred_tr_CI_upper'] = sp.special.inv_boxcox(pred_CI_upper, lambdas.iloc[0].values)

In [10]:
pred_CI_lower = df['pred0'] - 1.96 * np.sqrt(df['y_var0'])
df['pred_tr_CI_lower'] = sp.special.inv_boxcox(pred_CI_lower, lambdas.iloc[0].values)

## For all years

In [44]:
directory = 'outputs/priors_lat_lon_mat52/*'
paths = glob.glob(directory)

In [45]:
i = 0

for path in paths:
    print(path)
    df = pd.read_csv(path, index_col=0)
    
    # Transformed mean
    df['pred_tr'] = sp.special.inv_boxcox(df['pred0'], lambdas.iloc[i].values)
    i =+ 1
    
    # Transformed upper confidence interval
    pred_CI_upper = df['pred0'] + 1.96 * np.sqrt(df['y_var0'])
    df['pred_tr_CI_upper'] = sp.special.inv_boxcox(pred_CI_upper, lambdas.iloc[0].values)
    
    # Transformed lower confidence interval
    pred_CI_lower = df['pred0'] - 1.96 * np.sqrt(df['y_var0'])
    df['pred_tr_CI_lower'] = sp.special.inv_boxcox(pred_CI_lower, lambdas.iloc[0].values)
    
    p = path.split('/')
    df.to_csv(p[0] + '/tr_'+ p[1] + '/' + p[2])

outputs/priors_lat_lon_mat52/preds_1980_1981.csv
outputs/priors_lat_lon_mat52/preds_1981_1982.csv
outputs/priors_lat_lon_mat52/preds_1982_1983.csv
outputs/priors_lat_lon_mat52/preds_1983_1984.csv
outputs/priors_lat_lon_mat52/preds_1984_1985.csv
outputs/priors_lat_lon_mat52/preds_1985_1986.csv
outputs/priors_lat_lon_mat52/preds_1986_1987.csv
outputs/priors_lat_lon_mat52/preds_1987_1988.csv
outputs/priors_lat_lon_mat52/preds_1988_1989.csv
outputs/priors_lat_lon_mat52/preds_1989_1990.csv
outputs/priors_lat_lon_mat52/preds_1990_1991.csv
outputs/priors_lat_lon_mat52/preds_1991_1992.csv
outputs/priors_lat_lon_mat52/preds_1992_1993.csv
outputs/priors_lat_lon_mat52/preds_1993_1994.csv
outputs/priors_lat_lon_mat52/preds_1994_1995.csv
outputs/priors_lat_lon_mat52/preds_1995_1996.csv
outputs/priors_lat_lon_mat52/preds_1996_1997.csv
outputs/priors_lat_lon_mat52/preds_1997_1998.csv
outputs/priors_lat_lon_mat52/preds_1998_1999.csv
outputs/priors_lat_lon_mat52/preds_1999_2000.csv
outputs/priors_lat_l

## To netcdf

In [126]:
tr_directory = 'outputs/tr_priors_lat_lon_mat52/*'
tr_paths = glob.glob(tr_directory)
tr_df_all = pd.concat(map(pd.read_csv, tr_paths))

In [127]:
tr_df_all.drop(columns=['Unnamed: 0'], inplace=True)

In [128]:
tr_df_all.rename(columns={'elevation':'elev'}, inplace=True)

In [129]:
tr_df_all['time'] = pd.to_datetime((tr_df_all['time']-1970)*365*24*60*60*1e9).astype("datetime64[M]")

In [130]:
tr_df_all.isna().sum()

time                0
lon                 0
lat                 0
elev                0
pred0               0
pred_low0           0
y_var0              0
y_var_low0          0
pred_tr             0
pred_tr_CI_upper    0
pred_tr_CI_lower    0
dtype: int64

In [125]:
tr_df_all

Unnamed: 0,time,lat,lon,elev,pred0,pred_low0,y_var0,y_var_low0,pred_tr,pred_tr_CI_upper,pred_tr_CI_lower
0,1980-01-01,30.28125,81.90625,5383.0,0.314634,0.289611,2.243733,0.606565,1.365871,19.637924,0.058204
1,1980-01-01,30.28125,81.96875,5071.0,0.254223,0.234005,2.151837,0.528705,1.287064,17.729793,0.058228
2,1980-01-01,30.34375,81.46875,5446.0,0.133395,0.122786,2.498563,0.822474,1.142114,19.326433,0.038626
3,1980-01-01,30.34375,81.53125,5237.0,0.019359,0.017819,2.273537,0.631817,1.019536,15.546966,0.039991
4,1980-01-01,30.34375,81.71875,4978.0,-0.064907,-0.059745,2.139734,0.518450,0.937040,13.411510,0.040185
...,...,...,...,...,...,...,...,...,...,...,...
644035,2009-12-01,32.84375,78.53125,4833.0,-0.560862,-0.543997,2.003458,0.595147,0.559541,8.022076,0.024546
644036,2009-12-01,32.84375,78.59375,5025.0,-0.657082,-0.637323,1.944016,0.539226,0.504368,7.097675,0.022931
644037,2009-12-01,32.84375,78.65625,5352.0,-0.614497,-0.596019,2.101486,0.687368,0.528166,8.118042,0.021119
644038,2009-12-01,32.90625,78.40625,5257.0,-0.475838,-0.461530,1.977175,0.570421,0.612644,8.510102,0.027888


In [118]:
tr_df_all.to_csv('dataset_w_metadata/mfgp_predictions_1980_2010.csv')

In [75]:
tr_df_all.set_index(['time', 'lat', 'lon'], inplace=True)

In [119]:
ds = tr_df_all.to_xarray()

In [120]:
ds

In [121]:
ds.attrs['title'] = 'Downscaled ERA5 monthly precipitation data using Multi-Fidelity Gaussian Processes between 1980 and 2010 for the Upper Beas and Sutlej Basins, Himalayas'   

In [122]:
ds.attrs['institutions'] = 'University of Cambridge, British Antactic Survey'

In [50]:
#ds.attrs['source'] = 'Statiscally downscaled climate reanalysis data'

In [123]:
ds.to_netcdf('dataset_w_metadata/mfgp_predictions_1980_2010.nc')

In [61]:
list(ds)

['elev',
 'pred0',
 'pred_low0',
 'y_var0',
 'y_var_low0',
 'pred_tr',
 'pred_tr_CI_upper',
 'pred_tr_CI_lower']

In [6]:
import pandas as pd
import numpy as np
import scipy as sp

# Read in the data
df = pd.read_csv('dataset_w_metadata/mfgp_predictions_1980_2010.csv', index_col=0)

# Read in the scaling factors
scaling_df = pd.read_csv(
    'dataset_w_metadata/lambdas_1980_2010.csv', index_col=0)

# For 1980
df_subset = df.set_index('time')['1980-01-01' :'1981-01-01']


In [15]:
lambda_1980 = scaling_df[scaling_df['year'] == 1980]['lambdas'].values[0]

In [14]:
lambda_1980.values[0]

0.0581043744313327

In [16]:
# Mean
df_subset['pred_tr'] = sp.special.inv_boxcox(df_subset['pred0'], lambda_1980)

# Upper 95% confidence interval bound
pred_CI_upper = df_subset['pred0'] + 1.96 * np.sqrt(df_subset['y_var0'])
df_subset['pred_tr_CI_upper'] = sp.special.inv_boxcox(pred_CI_upper, lambda_1980)

# Lower 95% confidence interval bound
pred_CI_lower = df_subset['pred0'] - 1.96 * np.sqrt(df_subset['y_var0'])
df_subset['pred_tr_CI_lower'] = sp.special.inv_boxcox(pred_CI_lower, lambda_1980)

In [17]:
df_subset

Unnamed: 0_level_0,lat,lon,elev,pred0,pred_low0,y_var0,y_var_low0,pred_tr,pred_tr_CI_upper,pred_tr_CI_lower
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-01-01,30.28125,81.90625,5383.0,0.314634,0.289611,2.243733,0.606565,1.365871,19.637924,0.058204
1980-01-01,30.28125,81.96875,5071.0,0.254223,0.234005,2.151837,0.528705,1.287064,17.729793,0.058228
1980-01-01,30.34375,81.46875,5446.0,0.133395,0.122786,2.498563,0.822474,1.142114,19.326433,0.038626
1980-01-01,30.34375,81.53125,5237.0,0.019359,0.017819,2.273537,0.631817,1.019536,15.546966,0.039991
1980-01-01,30.34375,81.71875,4978.0,-0.064907,-0.059745,2.139734,0.518450,0.937040,13.411510,0.040185
...,...,...,...,...,...,...,...,...,...,...
1981-01-01,32.84375,78.53125,4833.0,-0.375613,-0.337777,2.462890,1.162913,0.684017,12.284404,0.021232
1981-01-01,32.84375,78.59375,5025.0,-0.466231,-0.419267,2.389821,1.103823,0.623340,10.911946,0.020077
1981-01-01,32.84375,78.65625,5352.0,-0.442951,-0.398332,2.549374,1.232851,0.638425,12.138703,0.018240
1981-01-01,32.90625,78.40625,5257.0,-0.369841,-0.332587,2.430820,1.136978,0.688064,12.133227,0.021930
