In [1]:
import xarray as xr
import dask
import dask.array as da
import numpy as np
import pandas as pd
import dask.multiprocessing
import blosc
from time import sleep

import zarr
import h5py
import gcsfs
from dask_kubernetes import KubeCluster
from dask.distributed import Client, progress, LocalCluster

  from ._conv import register_converters as _register_converters


In [2]:
# Open a GCS filesystem with token='cache', then build a second filesystem
# handle from that session's credentials object.
fs = gcsfs.GCSFileSystem(project='pangeo-181919', token='cache')
token = fs.session.credentials
fs2 = gcsfs.GCSFileSystem(project='pangeo-181919', token=token)
# Mapping over the 'storage-benchmarks/llc4320_zarr' path, backed by fs2.
gcsmap_zarr = gcsfs.GCSMap('storage-benchmarks/llc4320_zarr', gcs=fs2)

In [9]:
# Open the zarr store through xarray and time the open call.
%time llc_zarr = xr.open_zarr(gcsmap_zarr)

CPU times: user 1.72 s, sys: 129 ms, total: 1.85 s
Wall time: 9.72 s


In [8]:
# Time persisting everything opened from the zarr store, then time reducing
# variable W to its maximum.
# NOTE(review): `df` holds the persisted xarray object, not a DataFrame.
%time df = llc_zarr.persist()
#%time df.mean(dim='time').max().compute()
%time df.W.max().compute()

CPU times: user 628 ms, sys: 18 ms, total: 646 ms
Wall time: 635 ms
CPU times: user 35.4 s, sys: 2.22 s, total: 37.6 s
Wall time: 53 s


<xarray.DataArray 'W' ()>
array(0.128032, dtype=float32)

In [10]:
# Persist only variable W (rebinding `df`), then time the mean of the
# slice taken at index 0 of its second axis.
df = llc_zarr.W.persist()
%time df[:,0].mean().compute()

CPU times: user 4 s, sys: 249 ms, total: 4.25 s
Wall time: 8.28 s


<xarray.DataArray 'W' ()>
array(-4.73068e-06, dtype=float32)
Coordinates:
    Zl       float32 0.0

In [10]:
# Create a KubeCluster asking for n_workers workers; the bare `cluster`
# expression makes it the cell's displayed output.
n_workers = 40
cluster = KubeCluster(n_workers=n_workers)
cluster

In [9]:
# Close the cluster.
# NOTE(review): read top to bottom, this cell sits between the cluster's
# creation and the `Client(cluster)` cell that follows — confirm it is meant
# to be run only when tearing down.
cluster.close()

In [11]:
# Connect a distributed client to the cluster (Client is imported in the
# notebook's first cell); the bare `client` expression displays its summary.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://10.20.69.40:36074  Dashboard: /user/kaipak/proxy/36169/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [None]:
# Poll the scheduler every 2 seconds until at least n_workers workers are
# registered, printing progress on each pass. There is no timeout.
while True:
    n_ready = len(client.cluster.scheduler.workers)
    if n_ready >= n_workers:
        break
    print("Provisioning worker pods. %s/%s " % (n_ready, n_workers))
    sleep(2)

# Final worker count, shown as the cell output.
len(client.cluster.scheduler.workers)

In [None]:
def randn(shape, frac_nan=None, chunks=None, seed=0):
    """
    Return standard-normal samples of the given shape, optionally with NaNs.

    :param shape: shape of the array to generate.
    :param frac_nan: if not None, ``int(size * frac_nan)`` flat positions are
                     drawn by ``rng.choice`` and set to NaN.
    :param chunks: if None a NumPy array is generated; otherwise a dask array
                   with these chunks.
    :param seed: seed for the random state; the same seed gives the same
                 values.
    :returns: the generated array.
    """
    if chunks is None:
        rng = np.random.RandomState(seed)
        x = rng.standard_normal(shape)
    else:
        import dask.array as da
        rng = da.random.RandomState(seed)
        x = rng.standard_normal(shape, chunks=chunks)

    if frac_nan is not None:
        # Passing the population size as an int draws the same indices as
        # sampling from range(x.size) without first converting every index
        # of a potentially huge range into an array.
        inds = rng.choice(x.size, int(x.size * frac_nan))
        # NOTE(review): this relies on `x.flat`; dask arrays may not provide
        # it — confirm before combining `chunks` with `frac_nan`.
        x.flat[inds] = np.nan

    return x

def randint(low, high=None, size=None, frac_minus=None, seed=0):
    """
    Return seeded random integers, optionally overwriting some with -1.

    :param low: forwarded to ``RandomState.randint`` as ``low``.
    :param high: forwarded to ``RandomState.randint`` as ``high``.
    :param size: forwarded to ``RandomState.randint`` as ``size``.
    :param frac_minus: if not None, ``int(x.size * frac_minus)`` flat
                       positions are drawn by ``rng.choice`` and set to -1.
    :param seed: seed for the random state; the same seed gives the same
                 values.
    :returns: the generated integers.
    """
    rng = np.random.RandomState(seed)
    x = rng.randint(low, high, size)
    if frac_minus is not None:
        # An int population draws the same indices as range(x.size) but
        # skips converting the whole range into an array first.
        inds = rng.choice(x.size, int(x.size * frac_minus))
        x.flat[inds] = -1

    return x

def rand_numpy(nz=None, empty=True):
    """
    Generate a random 3D NumPy array of shape (nz, 1000, 1000).

    :param nz: size of the leading axis. When None, the value is looked up
               with ``getTestConfigValue("num_slices")``.
    :param empty: not used by this function; kept so existing calls still work.
    :returns: array filled by ``np.random.rand``.
    :raises NotImplementedError: if ``nz`` is falsy or not positive.
    """
    if nz is None:
        # NOTE(review): getTestConfigValue is not defined anywhere in the
        # visible notebook — confirm it is provided before relying on nz=None.
        nz = getTestConfigValue("num_slices")
    if not nz or nz <= 0:
        raise NotImplementedError("num_slices invalid")
    ny = 1000
    nx = 1000

    return np.random.rand(nz, nx, ny)

def rand_xarray(nt=None):
    """
    Generate a synthetic geoscience-like xarray Dataset filled with random
    data.

    The dataset holds three variables on a 1000 x 1000 lon/lat grid:
    ``foo`` and ``bar`` with dims (time, lon, lat), and ``baz`` (cast to
    float32) with dims (lon, lat). Each is built by
    ``randn(..., frac_nan=0.2)``. Every call uses randn's default seed, so
    ``foo`` and ``bar`` contain identical values.

    :param nt: number of daily timesteps starting at 1970-01-01. Primary
               control over how large the dataset is. When None, the value
               is looked up with ``getTestConfigValue("ntime_slices")``.
    :returns: the populated xarray Dataset.
    """
    if nt is None:
        # NOTE(review): getTestConfigValue is not defined anywhere in the
        # visible notebook — confirm it is provided before relying on nt=None.
        nt = getTestConfigValue("ntime_slices")
    ny = 1000
    nx = 1000

    times = pd.date_range('1970-01-01', periods=nt, freq='D')
    lons = xr.DataArray(np.linspace(0, 360, nx), dims=('lon', ),
                        attrs={'units': 'degrees east',
                               'long_name': 'longitude'})
    lats = xr.DataArray(np.linspace(-90, 90, ny), dims=('lat', ),
                        attrs={'units':'degrees north',
                               'long_name': 'latitude'})

    ds = xr.Dataset()
    # `encoding` is not passed: None was only ever the default, and leaving
    # it out keeps the constructor call valid where the keyword is absent.
    ds['foo'] = xr.DataArray(randn((nt, nx, ny), frac_nan=0.2),
                             coords={'lon': lons, 'lat': lats,'time': times},
                             dims=('time', 'lon', 'lat'),
                             name='foo',
                             attrs={'units': 'foo units',
                                    'description': 'a description'})
    ds['bar'] = xr.DataArray(randn((nt, nx, ny), frac_nan=0.2),
                             coords={'lon': lons, 'lat': lats, 'time': times},
                             dims=('time', 'lon', 'lat'),
                             name='bar',
                             attrs={'units': 'bar units',
                                    'description': 'a description'})
    ds['baz'] = xr.DataArray(randn((nx, ny), frac_nan=0.2).astype(np.float32),
                             coords={'lon': lons, 'lat': lats},
                             dims=('lon', 'lat'),
                             name='baz',
                             attrs={'units': 'baz units',
                                    'description': 'a description'})

    ds.attrs = {'history': 'created for xarray benchmarking'}

    return ds



In [None]:
# Build a random array with 32 slices and show its size in MiB.
np_ds = rand_numpy(32)
np_ds.nbytes/ 1024**2

In [None]:
# Build the synthetic dataset with 16 timesteps and print its size in MiB.
xr_ds = rand_xarray(16)
print(xr_ds.nbytes/ 1024**2)

In [None]:
# Rebind fs2 (this time with token=None) and point gcsmap_zarr at the
# 'storage-benchmarks/test_zarr' path.
# NOTE(review): both names were already assigned earlier in the notebook;
# this cell replaces them.
fs2 = gcsfs.GCSFileSystem(project='pangeo-181919', token=None)
gcsmap_zarr = gcsfs.GCSMap('storage-benchmarks/test_zarr', gcs=fs2)

In [None]:
# Write the synthetic dataset to the zarr store behind gcsmap_zarr.
xr_ds.to_zarr(store=gcsmap_zarr)

In [None]:
# Time the mean reduction over the synthetic dataset.
%time xr_ds.mean()

In [None]:
# Dask array of normal samples (loc 10, scale 0.1) with shape
# (1350, 1000, 1000), chunked as one 1000 x 1000 slab per index of the
# first axis. The last line shows its size in GiB.
chunks = (1, 1000, 1000)
size = (1350, 1000, 1000)
dask_arr = da.random.normal(10, 0.1, size=size, chunks=chunks)
dask_arr.nbytes/ 1024**3

In [None]:
# Time computing the mean of the generated dask array.
%time dask_arr.mean().compute()

In [None]:
# Shell out to gsutil to remove gs://storage-benchmarks/gcsfs-test-nb-zarr
# (the path the zarr array is created at further down).
! gsutil -q -m rm -rf gs://storage-benchmarks/gcsfs-test-nb-zarr

In [None]:
# Re-create the token='cache' filesystem and capture its credentials.
fs = gcsfs.GCSFileSystem(project='pangeo-181919', token='cache')
# NOTE(review): this mapping is built without `gcs=`, so it does not use the
# `fs` created on the line above — confirm that is intended.
gcsmapcache =  gcsfs.GCSMap('storage-benchmarks/gcsfs-test-nb')
token = fs.session.credentials

In [None]:
# Same assignment as the last line of the previous cell.
token = fs.session.credentials

In [None]:
# Rebind fs2 to a filesystem built from the captured credentials object.
fs2 = gcsfs.GCSFileSystem(project='pangeo-181919', token=token)

In [None]:
# Rebind gcsmap_zarr to the 'gcsfs-test-nb-zarr' path, backed by fs2.
gcsmap_zarr = gcsfs.GCSMap('storage-benchmarks/gcsfs-test-nb-zarr', gcs=fs2)

In [None]:
# Create a zarr array in the GCS mapping with the dask array's shape, dtype
# and chunking; overwrite=True is passed for the target store.
zarr_ds = zarr.create(dask_arr.shape, chunks=chunks, 
                       dtype=dask_arr.dtype, store=gcsmap_zarr, overwrite=True)

In [None]:
def test_store(zarr_arr):
    for get in [dask.get, dask.threaded.get, dask.multiprocessing.get]:
        with dask.set_options(get=get):
            %time dask_arr.store(zarr_arr, lock=False)

In [None]:
# Time writing the dask array into the GCS-backed zarr array (lock=False).
%time dask_arr.store(zarr_ds, lock=False)

In [None]:
# Wrap the zarr array as a dask array using the same chunking.
my_data = da.from_array(zarr_ds, chunks=chunks)

In [None]:
# Compute speed: time a mean over the zarr-backed dask array.

%time my_data.mean().compute()

In [None]:
# Load into memory speed
#%time my_data.compute()
%time my_data.compute(chunks=chunks)

In [None]:
# Random (time, y, x) array of shape (100, 1000, 1000), chunked one time
# step per chunk and wrapped as a Dataset with the single variable 'foo'.
dsa = xr.DataArray(np.random.rand(100,1000,1000),
                  dims=['time', 'y', 'x']
                  ).chunk({'time': 1}).to_dataset(name='foo')

In [None]:
# Size of the dataset in MiB.
dsa.nbytes / 1024**2

In [None]:
%time dsa.to_zarr(gcsmapcache2)

In [None]:
# Earlier experiments with local paths, kept for reference:
#zarr_loc = zarr.create(dask_arr.shape, chunks=chunks,
#                       dtype=dask_arr.dtype, path='/home/jovyan/baz', overwrite=True)
#zarr_dir = zarr.DirectoryStore('/home/jovyan/foo')
# Create a zarr array at the filesystem path '/gcs/storage-benchmarks/foo'
# with the dask array's shape, dtype and chunking.
# NOTE(review): presumably '/gcs' is a FUSE mount of the bucket — confirm.
zloc = zarr.create(shape=dask_arr.shape, chunks=chunks, 
                   dtype=dask_arr.dtype, store='/gcs/storage-benchmarks/foo', 
                   overwrite=True)

In [None]:
# Write the dask array into the path-backed zarr array (lock=False).
dask_arr.store(zloc, lock=False)

In [None]:
# Wrap the path-backed zarr array as a dask array using the same chunking.
my_fusedata = da.from_array(zloc, chunks=chunks)

In [None]:
# Time a mean over the path-backed array, for comparison with the
# GCS-mapping timing above.
%time my_fusedata.mean().compute()

In [None]:
# Read from the FUSE mount: open the zarr store by filesystem path under /gcs.
zfuse = xr.open_zarr('/gcs/storage-benchmarks/llc4320_zarr_fuse')

In [None]:
# Display the opened dataset as the cell output.
zfuse

In [None]:
# Compute the mean of variable W read through the FUSE path.
zfuse.W.mean().compute()

In [5]:
# Opening NetCDF files on FUSE: combine every *.nc file under the mounted
# path into one dataset, with CF decoding disabled and chunking along
# 'k' (size 1) and 'k_l' (size 10).

llc_ds = xr.open_mfdataset('/gcs/storage-benchmarks/llc4320_netcdf/*.nc',
                                        decode_cf=False, autoclose=True,
                                        chunks={'k': 1, 'k_l': 10})

In [None]:
# Persist the NetCDF-backed dataset, then compute the maximum of variable Theta.
ds = llc_ds.persist()
ds.Theta.max().compute()