In [1]:
from nowcasting_dataset.data_sources import NWPDataSource
from nowcasting_dataset.data_sources.nwp_data_source import NWP_VARIABLE_NAMES, NWP_MEAN, NWP_STD
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Bucket that holds the data; it is turned into a gs:// URL when the data
# source is constructed.
BUCKET = Path('solar-pv-nowcasting-data')

# Zarr store to read, relative to the bucket.  Other stores (disabled):
#   BUCKET / 'NWP/UK_Met_Office/UKV_zarr'
#   BUCKET / 'NWP/UK_Met_Office/UKV_single_step_and_single_timestep_all_vars.zarr'
NWP_BASE_PATH = BUCKET.joinpath(
    'NWP/UK_Met_Office/UKV_single_step_and_single_timestep_all_vars_2018_7-12.zarr')

In [3]:
# Data source under test, pointed at the Zarr store configured above.
# Spatial settings are grouped first, followed by the temporal ones.
nwp_ds = NWPDataSource(
    filename=f'gs://{NWP_BASE_PATH}',
    image_size_pixels=2,
    meters_per_pixel=2_000,
    history_len=0,
    forecast_len=1,
    n_timesteps_per_batch=4,
)

In [4]:
%%time
nwp_ds.open()

CPU times: user 730 ms, sys: 40 ms, total: 770 ms
Wall time: 1.26 s


In [5]:
%%time
datetimes = nwp_ds.datetime_index()

CPU times: user 10.5 ms, sys: 369 µs, total: 10.9 ms
Wall time: 9.15 ms


In [6]:
# Draw 4 distinct timestamps from the data source's index.  A seeded
# generator is used (instead of the global, unseeded np.random state) so that
# Restart & Run All reproduces the same sample and therefore the same batch.
random_dt = np.random.default_rng(seed=42).choice(datetimes, size=4, replace=False)

In [7]:
# Repeat the 4 sampled timestamps 4 times (16 entries in total).  Slicing to
# the first 4 entries keeps this cell idempotent: if it is re-run after
# random_dt has already been tiled, the result is still the same 16 entries
# instead of growing by a factor of 4 on every execution.
random_dt = np.tile(random_dt[:4], reps=4)

In [8]:
random_dt = pd.DatetimeIndex(random_dt)

In [9]:
random_dt

DatetimeIndex(['2018-09-23 19:45:00', '2018-10-28 17:35:00',
               '2018-10-02 13:15:00', '2018-07-19 19:30:00',
               '2018-09-23 19:45:00', '2018-10-28 17:35:00',
               '2018-10-02 13:15:00', '2018-07-19 19:30:00',
               '2018-09-23 19:45:00', '2018-10-28 17:35:00',
               '2018-10-02 13:15:00', '2018-07-19 19:30:00',
               '2018-09-23 19:45:00', '2018-10-28 17:35:00',
               '2018-10-02 13:15:00', '2018-07-19 19:30:00'],
              dtype='datetime64[ns]', freq=None)

In [10]:
loc_x, loc_y = nwp_ds.get_locations_for_batch(random_dt[:1])

In [11]:
# Use the same single location for every entry of the batch.
# - Taking the first element before tiling keeps the cell idempotent
#   (re-running it does not grow the arrays).
# - Sizing by len(random_dt) instead of a literal 16 keeps the locations
#   aligned with the timestamps that are passed to get_batch alongside them.
n_examples = len(random_dt)
loc_x = np.tile(np.ravel(loc_x)[:1], reps=n_examples)
loc_y = np.tile(np.ravel(loc_y)[:1], reps=n_examples)

In [12]:
%%time
examples = nwp_ds.get_batch(random_dt, loc_x, loc_y)

2018-09-23 19:45:00 2018-09-23 19:45:00 2018-09-23 19:50:00
2018-10-28 17:35:00 2018-10-28 17:35:00 2018-10-28 17:40:00
2018-10-02 13:15:00 2018-10-02 13:15:00 2018-10-02 13:20:00
2018-07-19 19:30:00 2018-07-19 19:30:00 2018-07-19 19:35:00
2018-09-23 19:45:00 2018-09-23 19:45:00 2018-09-23 19:50:00
2018-10-28 17:35:00 2018-10-28 17:35:00 2018-10-28 17:40:00
2018-10-02 13:15:00 2018-10-02 13:15:00 2018-10-02 13:20:00
2018-07-19 19:30:00 2018-07-19 19:30:00 2018-07-19 19:35:00
2018-09-23 19:45:00 2018-09-23 19:45:00 2018-09-23 19:50:00
2018-10-28 17:35:00 2018-10-28 17:35:00 2018-10-28 17:40:00
2018-10-02 13:15:00 2018-10-02 13:15:00 2018-10-02 13:20:00
2018-07-19 19:30:00 2018-07-19 19:30:00 2018-07-19 19:35:00
2018-09-23 19:45:00 2018-09-23 19:45:00 2018-09-23 19:50:00
2018-10-28 17:35:00 2018-10-28 17:35:00 2018-10-28 17:40:00
2018-10-02 13:15:00 2018-10-02 13:15:00 2018-10-02 13:20:00
2018-07-19 19:30:00 2018-07-19 19:30:00 2018-07-19 19:35:00
CPU times: user 642 ms, sys: 69.9 ms, to

In [14]:
# Print the shape of the 'nwp' array of every example, one per line.
nwp_shapes = (ex['nwp'].shape for ex in examples)
for nwp_shape in nwp_shapes:
    print(nwp_shape)

(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)
(10, 2, 2, 2)


In [14]:
examples

[{'nwp': array([[[[ 2.852e+02,  2.865e+02],
           [ 2.858e+02,  2.865e+02]],
  
          [[ 2.840e+02,  2.860e+02],
           [ 2.852e+02,  2.858e+02]]],
  
  
         [[[ 1.074e+02,  1.210e+02],
           [ 1.045e+02,  1.345e+02]],
  
          [[ 2.659e+01,  2.870e+01],
           [ 2.450e+01,  3.030e+01]]],
  
  
         [[[ 0.000e+00,  0.000e+00],
           [ 0.000e+00,  0.000e+00]],
  
          [[ 0.000e+00,  0.000e+00],
           [ 0.000e+00,  0.000e+00]]],
  
  
         [[[ 5.603e+01,  5.522e+01],
           [ 5.584e+01,  5.794e+01]],
  
          [[ 6.147e+01,  5.688e+01],
           [ 5.528e+01,  5.769e+01]]],
  
  
         [[[-0.000e+00, -0.000e+00],
           [-0.000e+00, -0.000e+00]],
  
          [[-0.000e+00, -0.000e+00],
           [-0.000e+00, -0.000e+00]]],
  
  
         [[[ 6.750e+00,  6.648e+00],
           [ 6.148e+00,  6.852e+00]],
  
          [[ 4.352e+00,  5.352e+00],
           [ 4.648e+00,  6.051e+00]]],
  
  
         [[[ 4.915e+04,  4.995e+0

In [13]:
np.isnan(examples[0]['nwp']).any()

False

In [14]:
examples[7]['nwp'].mean()

0.24319429287376507

In [15]:
# NOTE(review): this cell fails when executed (see the NameError recorded as
# its output): `dask` is not imported by any cell shown in this notebook, and
# `examples_delayed` is not assigned by any cell shown either.
dask.visualize([example['nwp'] for example in examples_delayed], optimize_graph=True)

NameError: name 'dask' is not defined

In [None]:
loc_x, loc_y

In [None]:
from concurrent import futures

In [None]:
import xarray as xr

In [None]:
def _get_time_slice(self, t0_dt):
    start_dt = self._get_start_dt(t0_dt)
    end_dt = self._get_end_dt(t0_dt)

    start_hourly = start_dt.floor('H')
    end_hourly = end_dt.ceil('H')
    
    init_time_i = np.searchsorted(self.data.init_time, start_hourly.to_numpy(), side='right')
    init_time_i -= 1
    init_time = self.data.init_time.values[init_time_i]
    
    step_start = start_hourly - init_time
    step_end = end_hourly - init_time
    
    selected = self.data.sel(init_time=init_time, step=slice(step_start, step_end))
    selected = selected.swap_dims({'step': 'target_time'})
    selected['target_time'] = init_time + selected.step
    return selected

In [None]:
%%time
# NOTE(review): t0_dt is not assigned by any cell above this one.  In the
# cells shown it is only bound as a loop variable in the batch-loading cell
# further down, so this cell depends on that cell having been run first.
d = _get_time_slice(self=nwp_ds, t0_dt=t0_dt)

In [None]:
%%time


In [None]:
####################
# TODO: Figure out why _get_time_slice is actually loading data.  Then re-write NOT using dask; and using threads to load data :)

In [None]:
selections[0].chunks

In [None]:
%%time
_ = selections[0].load()

In [None]:
%%time


In [None]:
def _post_process_example(
        self,
        selected_data: xr.DataArray,
        t0_dt: pd.Timestamp) -> xr.DataArray:
    """Normalise a selection, resample it to 5-minute steps and crop it.

    Args:
        self: Object providing ``_get_start_dt`` and ``_get_end_dt``.
        selected_data: Data with a ``target_time`` coordinate.
        t0_dt: Timestamp forwarded to ``_get_start_dt`` / ``_get_end_dt`` to
            obtain the window that is kept.

    Returns:
        ``(selected_data - NWP_MEAN) / NWP_STD``, resampled along
        ``target_time`` to 5-minute steps with the resampler's default
        interpolation, restricted to ``[start_dt, end_dt]``.
    """
    start_dt = self._get_start_dt(t0_dt)
    end_dt = self._get_end_dt(t0_dt)

    normalised = (selected_data - NWP_MEAN) / NWP_STD

    # '5min' is the same frequency as the old '5T' alias, but '5T' is
    # deprecated in newer pandas releases while '5min' is accepted everywhere.
    resampled = normalised.resample({'target_time': '5min'}).interpolate()

    return resampled.sel(target_time=slice(start_dt, end_dt))

In [None]:
from nowcasting_dataset.example import to_numpy

In [None]:
%%time
# Forecast times (t0) to build examples for.  They are kept in a named
# variable so that each loaded time slice can be paired with its own t0 below.
t0_datetimes = datetimes[100:104]

# One selection per t0.
selections = [_get_time_slice(self=nwp_ds, t0_dt=t0_dt) for t0_dt in t0_datetimes]

# Call .load() on every selection from a worker thread.  executor.map yields
# results in the same order as `selections`, so data[i] belongs to
# t0_datetimes[i].
with futures.ThreadPoolExecutor(max_workers=8) as executor:
    data = list(executor.map(lambda selection: selection.load(), selections))

square = nwp_ds._square
examples = []
# Pair every loaded slice with the t0 it was selected for.  (Previously the
# stale t0_dt left over from the selection loop was used for all slices.)
for t0_dt, time_slice in zip(t0_datetimes, data):
    for x_meters_center, y_meters_center in zip(loc_x[:4], loc_y[:4]):
        bounding_box = square.bounding_box_centered_on(
            x_meters_center=x_meters_center, y_meters_center=y_meters_center)

        # Always crop from the untouched time slice.  (Previously the crop
        # was written back to the loop variable, so every location after the
        # first was cut from - and re-normalised on top of - the previous
        # location's already processed output.)
        cropped = time_slice.sel(
            x=slice(bounding_box.left, bounding_box.right),
            y=slice(bounding_box.top, bounding_box.bottom))

        # The crop is likely to have 1 too many pixels in x and y
        # because sel(x=slice(a, b)) is [a, b], not [a, b).  So trim:
        cropped = cropped.isel(
            x=slice(0, square.size_pixels),
            y=slice(0, square.size_pixels))

        cropped = _post_process_example(nwp_ds, cropped, t0_dt)

        example = nwp_ds._put_data_into_example(cropped)
        examples.append(to_numpy(example))

In [None]:
len(examples)

In [None]:
examples[0]

In [None]:
%%time


In [None]:
# NOTE(review): `dask` is not imported by any cell shown in this notebook, so
# unless it is imported elsewhere this cell raises NameError as written.
dask.visualize([example['nwp'] for example in examples], optimize_graph=True)

In [None]:
%%time
