In [52]:
def get_list_of_files(start = '2012-01-01', stop = '2012-01-31', include_start = True, include_stop = True, var = 'tcc',
                                        path_input = '/home/hanna/lagrings/ERA5_monthly/'):
    """Return the monthly files covering the requested period.

    Files are looked up by name: ``<year>_<month>_<var>.nc``, or
    ``<year>_<month>_<var>_era.nc`` when `path_input` is the ERA5_tcc
    directory. Only the year and month of `start` and `stop` are used;
    the day part of the date is ignored.

    Parameters
    ----------------------
    start : str
        Start of period, formatted 'YYYY-MM-DD'. (default '2012-01-01')

    stop : str or None
        End of period, formatted 'YYYY-MM-DD'. (default '2012-01-31')
        If None, only the files of the start month are returned.

    include_start : bool
        Whether the file of the start month is part of the result.
        Ignored when the period lies within a single month. (default True)

    include_stop : bool
        Whether the file of the stop month is part of the result.
        Ignored when the period lies within a single month. (default True)

    var : str
        Variable name that must occur in the file name. (default 'tcc')

    path_input : str
        Directory that is searched for files.

    Returns
    -----------------------
    subset : List[str] or numpy.ndarray of str
        Paths (`path_input` joined with the file name) of the files holding
        data in the requested period. A plain list in glob order when a
        single month is requested, otherwise a sorted NumPy string array.
    """
    def _year_month(date):
        # 'YYYY-MM-DD' -> 'YYYY_MM', the prefix used in the file names.
        parts = date.split('-')
        return '{}_{:02d}'.format(parts[0], int(parts[1]))

    start_search_str = _year_month(start)
    stop_search_str = _year_month(stop) if stop is not None else ''

    # Single month requested: match that month's files directly.
    if stop is None or start_search_str == stop_search_str:
        pattern = '{}*{}*.nc'.format(start_search_str, var)
        return glob.glob(os.path.join(path_input, pattern))

    # Forcing a string dtype keeps the comparisons below valid even when
    # nothing matches (np.sort([]) would otherwise be a float array).
    matches = glob.glob(os.path.join(path_input, '*{}*.nc'.format(var)))
    files = np.sort(np.asarray(matches, dtype = str))

    # Files in the ERA5_tcc directory carry an extra '_era' tag. Normalising
    # both sides makes the check insensitive to a trailing slash.
    era5_dir = '/home/hanna/lagrings/ERA5_tcc/'
    if os.path.normpath(path_input) == os.path.normpath(era5_dir):
        suffix = '_{}_era.nc'.format(var)
    else:
        suffix = '_{}.nc'.format(var)

    min_fil = os.path.join(path_input, start_search_str + suffix)
    max_fil = os.path.join(path_input, stop_search_str + suffix)

    # The month ordering follows from the lexicographic ordering of the
    # zero-padded '<year>_<month>' prefix. Every combination of the two
    # include flags is valid, including excluding both end months.
    after_start = (files >= min_fil) if include_start else (files > min_fil)
    before_stop = (files <= max_fil) if include_stop else (files < max_fil)
    subset = files[after_start & before_stop]
    return subset



In [53]:
def mean_squared_error(y_true, y_pred):
    """Mean of the squared differences between truth and prediction.

    NaN entries in the squared differences are ignored, and the mean is
    taken along the first axis.

    Parameters
    ------------------
    y_true : array-like
        Actual values of y.
    y_pred : array-like
        Predicted values of y.

    Returns
    -------------------
    mse : float or ndarray
        Mean squared error; a scalar for 1-D input, otherwise one value
        per position along the remaining axes.
    """
    residuals = np.subtract(y_true, y_pred)
    squared_residuals = np.square(residuals)
    return np.nanmean(squared_residuals, axis = 0)


def accumulated_squared_error(y_true, y_pred):
    """Sum of the squared differences between truth and prediction.

    NaN entries in the squared differences are treated as zero, and the
    sum is taken along the first axis.

    Parameters
    ----------------
    y_true : array-like
        Actual values of y.
    y_pred : array-like
        Predicted values of y.

    Returns
    ----------------
    ase : float or ndarray
        Accumulated squared error between y_true and y_pred; a scalar for
        1-D input, otherwise one value per position along the remaining axes.
    """
    residuals = np.subtract(y_true, y_pred)
    squared_residuals = np.square(residuals)
    return np.nansum(squared_residuals, axis = 0)


def r2_score(y_true, y_pred):
    """Compute the R2 score (coefficient of determination).

    Parameters
    ---------------------------
    y_true : array-like
        Actual values of y.
    y_pred : array-like
        Predicted values of y.

    Returns
    ----------------------------
    r2 : float or ndarray
        Coefficient of determination; a scalar for 1-D input, otherwise one
        value per position along the remaining axes.

    Notes
    -----------
    Describes variation of data captured by the model. Computed as
    1 - SS_res / SS_tot, where both sums run along the first axis and ignore
    NaN terms. SS_tot is centred on the mean of y_true along that same axis,
    so predicting the per-column mean yields a score of 0 for every column.
    """
    residual_ss = np.nansum(np.square(np.subtract(y_true, y_pred)), axis = 0)
    # The mean is reduced over the same axis as the sums; a mean over the
    # whole array would centre each column on the wrong value.
    centre = np.nanmean(y_true, axis = 0)
    total_ss = np.nansum(np.square(np.subtract(y_true, centre)), axis = 0)
    return 1 - residual_ss / total_ss


In [54]:
# Directory whose files are named <year>_<month>_tcc_era.nc.
read_era5 = '/home/hanna/lagrings/ERA5_tcc/'
# Directory with the monthly per-variable files, e.g. <year>_<month>_tcc.nc and <year>_<month>_r.nc.
read_tcc  = '/home/hanna/lagrings/ERA5_monthly/'

import os 
import glob

import numpy as np
import xarray as xr

from sclouds.io.utils import merge

# Candidate (start, stop) periods.
# NOTE(review): `array` is not referenced in the cells visible here - confirm it is still needed.
array = [('2004-04-01', '2008-12-31'),
          ('2009-01-01', '2013-12-31'),
          ('2014-01-01', '2018-12-31')]

# Load Data
start, stop = '2009-04-01', '2009-05-31'

# Both lookups cover the same period and variable; only the directory differs.
# The directories come from the read_tcc / read_era5 constants defined at the
# top of this cell instead of repeating the path literals.
files_tcc = get_list_of_files(start = start, stop = stop,
                              include_start = True, include_stop = True,
                              var = 'tcc', path_input = read_tcc)
files_era5 = get_list_of_files(start = start, stop = stop,
                               include_start = True, include_stop = True,
                               var = 'tcc', path_input = read_era5)



In [55]:
# Show the 'tcc' files found in the ERA5_monthly directory for the selected period.
files_tcc

array(['/home/hanna/lagrings/ERA5_monthly/2009_04_tcc.nc',
       '/home/hanna/lagrings/ERA5_monthly/2009_05_tcc.nc'], dtype='<U48')

In [56]:
# Files of variable 'r' for the same period, taken from the directory held in
# the read_tcc constant rather than a repeated path literal.
files_r = get_list_of_files(start = start, stop = stop,
                            include_start = True, include_stop = True,
                            var = 'r', path_input = read_tcc)

In [57]:
# Show the 'r' files found in the ERA5_monthly directory for the selected period.
files_r

array(['/home/hanna/lagrings/ERA5_monthly/2009_04_r.nc',
       '/home/hanna/lagrings/ERA5_monthly/2009_05_r.nc'], dtype='<U46')

In [58]:
# Show the 'tcc' files found in the ERA5_tcc directory for the selected period.
files_era5

array(['/home/hanna/lagrings/ERA5_tcc/2009_04_tcc_era.nc',
       '/home/hanna/lagrings/ERA5_tcc/2009_05_tcc_era.nc'], dtype='<U48')

In [59]:
# Open the 'tcc' and 'r' files from ERA5_monthly together as one dataset.
monthly_files = np.concatenate([files_tcc, files_r])
true_data = merge(monthly_files)

# Open the files from the ERA5_tcc directory as a second dataset.
era5_data = merge(files_era5)

# The ERA5 variable 'tcc' is renamed to 'era' before merging - presumably so it
# does not clash with the 'tcc' variable in true_data (TODO confirm).
era5_renamed = era5_data.rename({'tcc':'era'})
data = xr.merge([era5_renamed, true_data])

will change. To retain the existing behavior, pass
combine='nested'. To use future default behavior, pass
combine='by_coords'. See
http://xarray.pydata.org/en/stable/combining.html#combining-multi

  return xr.open_mfdataset(files, compat='no_conflicts') # , join='outer'
to use the new `combine_by_coords` function (or the
`combine='by_coords'` option to `open_mfdataset`) to order the datasets
before concatenation. Alternatively, to continue concatenating based
on the order the datasets are supplied in future, please use the new
`combine_nested` function (or the `combine='nested'` option to
open_mfdataset).The datasets supplied require both concatenation and merging. From
xarray version 0.15 this will operation will require either using the
new `combine_nested` function (or the `combine='nested'` option to
open_mfdataset), with a nested list structure such that you can combine
along the dimensions None. Alternatively if your datasets have global
dimension coordinates then you can use th

In [68]:
# NOTE(review): this repeats the merge statement from the In [59] cell, and the
# execution count (68) is out of sequence - confirm whether this cell is still needed.
era5_renamed = era5_data.rename({'tcc':'era'})
data = xr.merge([era5_renamed, true_data])

Unnamed: 0,Array,Chunk
Bytes,11.71 kB,5.95 kB
Shape,"(1464,)","(744,)"
Count,18 Tasks,2 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 11.71 kB 5.95 kB Shape (1464,) (744,) Count 18 Tasks 2 Chunks Type datetime64[ns] numpy.ndarray",1464  1,

Unnamed: 0,Array,Chunk
Bytes,11.71 kB,5.95 kB
Shape,"(1464,)","(744,)"
Count,18 Tasks,2 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,76.37 MB,38.81 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,8 Tasks,2 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 76.37 MB 38.81 MB Shape (1464, 81, 161) (744, 81, 161) Count 8 Tasks 2 Chunks Type float32 numpy.ndarray",161  81  1464,

Unnamed: 0,Array,Chunk
Bytes,76.37 MB,38.81 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,8 Tasks,2 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,152.74 MB,77.62 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,25 Tasks,2 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 152.74 MB 77.62 MB Shape (1464, 81, 161) (744, 81, 161) Count 25 Tasks 2 Chunks Type float64 numpy.ndarray",161  81  1464,

Unnamed: 0,Array,Chunk
Bytes,152.74 MB,77.62 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,25 Tasks,2 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,152.74 MB,77.62 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,25 Tasks,2 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 152.74 MB 77.62 MB Shape (1464, 81, 161) (744, 81, 161) Count 25 Tasks 2 Chunks Type float64 numpy.ndarray",161  81  1464,

Unnamed: 0,Array,Chunk
Bytes,152.74 MB,77.62 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,25 Tasks,2 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,76.37 MB,38.81 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,8 Tasks,2 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 76.37 MB 38.81 MB Shape (1464, 81, 161) (744, 81, 161) Count 8 Tasks 2 Chunks Type float32 numpy.ndarray",161  81  1464,

Unnamed: 0,Array,Chunk
Bytes,76.37 MB,38.81 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,8 Tasks,2 Chunks
Type,float32,numpy.ndarray


In [63]:
# Display the ERA5 dataset with its 'tcc' variable renamed to 'era'.
# The result is not assigned, so era5_data keeps its original variable name.
era5_data.rename({'tcc':'era'})

Unnamed: 0,Array,Chunk
Bytes,11.71 kB,5.95 kB
Shape,"(1464,)","(744,)"
Count,6 Tasks,2 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 11.71 kB 5.95 kB Shape (1464,) (744,) Count 6 Tasks 2 Chunks Type datetime64[ns] numpy.ndarray",1464  1,

Unnamed: 0,Array,Chunk
Bytes,11.71 kB,5.95 kB
Shape,"(1464,)","(744,)"
Count,6 Tasks,2 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,76.37 MB,38.81 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,6 Tasks,2 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 76.37 MB 38.81 MB Shape (1464, 81, 161) (744, 81, 161) Count 6 Tasks 2 Chunks Type float32 numpy.ndarray",161  81  1464,

Unnamed: 0,Array,Chunk
Bytes,76.37 MB,38.81 MB
Shape,"(1464, 81, 161)","(744, 81, 161)"
Count,6 Tasks,2 Chunks
Type,float32,numpy.ndarray
