# Data Preparation

In [1]:
import netCDF4
import xarray as xr
import datetime

In [2]:
path_file = "../data/train/2016_12_ERA5.nc"

In [3]:
ds = xr.open_dataset(path_file)

In [48]:
(ds['time'])

In [5]:
ds.to_array().values

array([[[[ 6.43769503e-01,  1.15743065e+00,  2.22177410e+00, ...,
          -8.36019516e-02,  1.50721788e-01,  2.91231871e-01],
         [ 7.34638214e-01,  1.08212733e+00,  2.73459387e+00, ...,
           3.05956125e-01,  4.21839952e-02, -4.65813875e-02],
         [ 8.81037951e-01,  1.32065821e+00,  2.81073856e+00, ...,
           8.33079576e-01,  6.26100540e-01,  1.35997653e-01],
         ...,
         [ 1.52847838e+00,  1.85282969e+00,  2.02951908e+00, ...,
           8.41913939e-01,  1.62501097e-01, -4.33194637e-01],
         [ 1.42877507e+00,  1.67109203e+00,  1.84609866e+00, ...,
           2.20999479e+00,  8.92817259e-01, -1.05057120e-01],
         [ 1.39469934e+00,  1.52637506e+00,  1.69759560e+00, ...,
           4.02442646e+00,  2.09598804e+00,  7.37583160e-01]],

        [[ 2.68514872e-01,  5.11672974e-01,  1.13555503e+00, ...,
           8.38322639e-02,  4.48148966e-01,  4.94845390e-01],
         [ 3.87569666e-01,  6.28624678e-01,  1.72662306e+00, ...,
           4.68762636e

## Add variable doy with day of the year

In [None]:
def conv_npdt64_to_doy(npdt):
    """
    Input: numpy datetime64 object
    """
    dt = datetime.datetime.utcfromtimestamp(npdt.astype('O')/1e9)
    return int(dt.strftime('%j'))

In [None]:
print(ds['time'][0].values)
conv_npdt64_to_doy(ds['time'][0].values)

In [None]:
conv_npdt64_to_doy(ds['time'])

In [None]:
ds.assign(doy=conv_npdt64_to_doy(ds['time']))

___

## Check how many days are in the data

In [47]:
num_time_steps = 0
for file_path in netcdf_dirs:
    ds = xr.open_dataset(file_path)
    num_time_steps += len(ds['time'])
    print(file_path, " total days so far: ", num_time_steps/24)

../data/train/2016_01_ERA5.nc  total time steps so far:  31.0
../data/train/2016_02_ERA5.nc  total time steps so far:  60.0
../data/train/2016_03_ERA5.nc  total time steps so far:  91.0
../data/train/2016_04_ERA5.nc  total time steps so far:  121.0
../data/train/2016_05_ERA5.nc  total time steps so far:  152.0
../data/train/2016_06_ERA5.nc  total time steps so far:  182.0
../data/train/2016_07_ERA5.nc  total time steps so far:  213.0
../data/train/2016_08_ERA5.nc  total time steps so far:  244.0
../data/train/2016_09_ERA5.nc  total time steps so far:  274.0
../data/train/2016_10_ERA5.nc  total time steps so far:  305.0
../data/train/2016_11_ERA5.nc  total time steps so far:  335.0
../data/train/2016_12_ERA5.nc  total time steps so far:  366.0
../data/train/2017_01_ERA5.nc  total time steps so far:  397.0
../data/train/2017_02_ERA5.nc  total time steps so far:  425.0
../data/train/2017_03_ERA5.nc  total time steps so far:  456.0
../data/train/2017_04_ERA5.nc  total time steps so far:  4

## generate slices of arrays

In [49]:
img_dir = "../data/train"
netcdf_dirs = sorted(glob.glob(img_dir+"/*"))

In [50]:
frames = 24
channels = 5
pixels_x = 21
pixels_y = 21

In [69]:
import glob
import xarray as xr
def generate_arrays(img_dir, slice_size=24):
    """
    A generator that returns one 24-hour slice as input, and the subsequent 24-hour slice as output
    """
    # get list of netcdf files in img_dir
    netcdf_dirs = sorted(glob.glob(img_dir+"/*.nc"))
    file_index = 0
    # open first netcdf file
    ds = xr.open_dataset(netcdf_dirs[file_index])
    # counter is for hourly time slices. months with 31 days have 744 hours
    counter = 0
    while True: # generator needs to run infinitely
        # get input slice
        input_images = ds.isel( time=slice(counter, counter + slice_size))#.to_array().values
        
        # check if we're at the end of the month
        if counter+2*slice_size > ds.sizes['time']: # if output is not in file
            # reset slice counter, increment to next netcdf file, open it, get output images
            counter = 0
            file_index += 1
            if file_index == len(netcdf_dirs):
                print("End of the line. Last file at ", netcdf_dirs[file_index-1])
                break
            ds = xr.open_dataset(netcdf_dirs[file_index])
            # take slice 0-24 as output-image
            output_images = ds.isel( time=slice(counter, counter + slice_size))#.to_array().values
            # set counter to -slice_size to reset for input on next iteration
            counter -= slice_size
        # get output slice right after input slice
        else:
            output_images = ds.isel( time=slice(counter+slice_size, counter + 2*slice_size))#.to_array().values
        
#         # switch frames and channel axes
#         input_images = np.moveaxis(input_images, 0, 1)
#         output_images = np.moveaxis(output_images, 0, 1)
#         # reshape values
#         input_images = input_images.reshape(-1, frames, channels, pixels_x, pixels_y)
#         output_images = output_images.reshape(-1, frames, channels, pixels_x, pixels_y)

        yield (input_images, output_images)
        counter += slice_size

In [70]:
gen = generate_arrays("../data/train")

In [74]:
for i in range(1):
    in_, out_ = next(gen)
print(in_['time'])
print(out_['time'])

<xarray.DataArray 'time' (time: 24)>
array(['2016-02-01T00:00:00.000000000', '2016-02-01T01:00:00.000000000',
       '2016-02-01T02:00:00.000000000', '2016-02-01T03:00:00.000000000',
       '2016-02-01T04:00:00.000000000', '2016-02-01T05:00:00.000000000',
       '2016-02-01T06:00:00.000000000', '2016-02-01T07:00:00.000000000',
       '2016-02-01T08:00:00.000000000', '2016-02-01T09:00:00.000000000',
       '2016-02-01T10:00:00.000000000', '2016-02-01T11:00:00.000000000',
       '2016-02-01T12:00:00.000000000', '2016-02-01T13:00:00.000000000',
       '2016-02-01T14:00:00.000000000', '2016-02-01T15:00:00.000000000',
       '2016-02-01T16:00:00.000000000', '2016-02-01T17:00:00.000000000',
       '2016-02-01T18:00:00.000000000', '2016-02-01T19:00:00.000000000',
       '2016-02-01T20:00:00.000000000', '2016-02-01T21:00:00.000000000',
       '2016-02-01T22:00:00.000000000', '2016-02-01T23:00:00.000000000'],
      dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 2016-02-

In [30]:
in_, out_ = next(gen)
in_

End of the line. Last file at  ../data/train/2018_12_ERA5.nc


StopIteration: 

In [57]:
print(out_['time'])

<xarray.DataArray 'time' (time: 24)>
array(['2016-01-02T00:00:00.000000000', '2016-01-02T01:00:00.000000000',
       '2016-01-02T02:00:00.000000000', '2016-01-02T03:00:00.000000000',
       '2016-01-02T04:00:00.000000000', '2016-01-02T05:00:00.000000000',
       '2016-01-02T06:00:00.000000000', '2016-01-02T07:00:00.000000000',
       '2016-01-02T08:00:00.000000000', '2016-01-02T09:00:00.000000000',
       '2016-01-02T10:00:00.000000000', '2016-01-02T11:00:00.000000000',
       '2016-01-02T12:00:00.000000000', '2016-01-02T13:00:00.000000000',
       '2016-01-02T14:00:00.000000000', '2016-01-02T15:00:00.000000000',
       '2016-01-02T16:00:00.000000000', '2016-01-02T17:00:00.000000000',
       '2016-01-02T18:00:00.000000000', '2016-01-02T19:00:00.000000000',
       '2016-01-02T20:00:00.000000000', '2016-01-02T21:00:00.000000000',
       '2016-01-02T22:00:00.000000000', '2016-01-02T23:00:00.000000000'],
      dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 2016-01-

In [None]:
type(out_)

In [None]:
out_ = np.moveaxis(out_, 0, 1)

In [None]:
out_.shape

In [41]:
datetime.datetime.now().strftime("%Y_%M_%d_%H%M_")

'2020_26_08_1926_'