# Notebook emulating plots from our WHAFT proposal

In [1]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import os

import gcsfs
from tqdm import tqdm
import fsspec
xr.set_options(display_style="html");

# Dask Startup

In [2]:
## A chunk of code to start dask cluster
from dask.distributed import Client, progress
from dask_gateway import Gateway
gateway = Gateway()

In [3]:
if gateway.list_clusters():
    cluster = gateway.new_cluster() 
    cluster.adapt(minimum=2, maximum=12)
    print("Starting up and connecting to new cluster.")
else:
    cluster_name = gateway.list_clusters()[0].name
    cluster = gateway.connect(cluster_name)
    print(f"Connecting to pre-existing cluster with {len(cluster.scheduler_info['workers'])} workers.")
    
cluster
# cluster.scheduler_info

Starting up and connecting to new cluster.


VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [5]:
client = cluster.get_client()
client

0,1
Client  Scheduler: gateway://traefik-ooi-prod-dask-gateway.ooi-prod:80/ooi-prod.b02f08007be842bcb1f0d8b9fca1e98f  Dashboard: /services/dask-gateway/clusters/ooi-prod.b02f08007be842bcb1f0d8b9fca1e98f/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


### setup runtime parameters

In [6]:
gcsdir = 'gs://ldeo-glaciology'
ampsdir = 'AMPS'
amps_ver = 'WRF_24'
domain = 'domain_03'
filepattern = 'wrf-20161225*'


In [7]:
# pattern = 'gs://ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-20161225*'
pattern = os.path.join(gcsdir, ampsdir, amps_ver, domain, filepattern)
print(pattern)

gs://ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-20161225*


### list the netcdf files

In [8]:
fs = gcsfs.GCSFileSystem(project='ldeo-glaciology', mode='ab', cache_timeout = 0)

In [9]:
NCs = fs.glob(pattern)
print(f"Total of {len(NCs)} wrf files.")
print(NCs[0])

Total of 8 wrf files.
ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122500_f003-cf.nc


In [29]:
## load the first file to inialize the xarray
url = 'gs://' + NCs[0]
with  fsspec.open(url, mode='rb')  as openfile:  
    ds = xr.open_dataset(openfile, engine='h5netcdf', chunks={'lat': 200, 'lon': 200, 'time': 1})  # , decode_coords=False
    
## load the other files, each time concaternating them onto an xarray (AMPS) that grows in the time dimension each iteration. 
for i in tqdm(range(1, len(NCs)-1)):  
    url = 'gs://' + NCs[i]
    with  fsspec.open(url, mode='rb')  as openfile:  
        temp = xr.open_dataset(openfile, engine='h5netcdf', chunks={'lat': 200, 'lon': 200, 'time': 1})  # these chunk sizes produce chunks of reasonable data volumes and which stretch through all time
    ds = xr.concat([ds,temp],'time')

ValueError: some chunks keys are not dimensions on this object: {'lon', 'lat'}

In [None]:
print(type(ds.T_sfc.data))
print(ds.T_sfc.chunks)

In [None]:
ds.T_sfc


### Alternatively using `open_mfdatasets`

Build list of URLs to public data in the bucket

In [28]:
url = 'https://storage.googleapis.com/ldeo-glaciology/bedmachine/BedMachineAntarctica_2019-11-05_v01.nc#mode=bytes'  
ds = xr.open_dataset(url, engine='netcdf4', chunks=3000)
ds

FileNotFoundError: [Errno 2] No such file or directory: b'https://storage.googleapis.com/ldeo-glaciology/bedmachine/BedMachineAntarctica_2019-11-05_v01.nc#mode=bytes'

In [27]:
NCs_urls = ['https://storage.googleapis.com/' + x + '#mode=bytes' for x in NCs]
print(NCs_urls)
# xr.open_mfdataset(NCs_urls, parallel=True)
# ds = xr.open_mfdataset(NCs_urls, engine='netcdf4', chunks={'time': 1})
ds = xr.open_dataset(NCs_urls[0], engine='netcdf4', chunks={'time': -1})

['https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122500_f003-cf.nc#mode=bytes', 'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122500_f006-cf.nc#mode=bytes', 'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122500_f009-cf.nc#mode=bytes', 'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122500_f012-cf.nc#mode=bytes', 'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122512_f003-cf.nc#mode=bytes', 'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122512_f006-cf.nc#mode=bytes', 'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122512_f009-cf.nc#mode=bytes', 'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122512_f012-cf.nc#mode=bytes']


FileNotFoundError: [Errno 2] No such file or directory: b'https://storage.googleapis.com/ldeo-glaciology/AMPS/WRF_24/domain_03/wrf-2016122500_f003-cf.nc#mode=bytes'

In [16]:
xr.show_versions()




INSTALLED VERSIONS
------------------
commit: None
python: 3.7.8 | packaged by conda-forge | (default, Jul 31 2020, 02:23:50) 
[GCC 7.5.0]
python-bits: 64
OS: Linux
OS-release: 4.15.0-1096-azure
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: C.UTF-8
LANG: C.UTF-8
LOCALE: en_US.UTF-8
libhdf5: 1.10.5
libnetcdf: 4.7.4

xarray: 0.16.1
pandas: 1.1.4
numpy: 1.19.2
scipy: 1.5.3
netCDF4: 1.5.3
pydap: installed
h5netcdf: 0.8.1
h5py: 2.10.0
Nio: None
zarr: 2.5.0
cftime: 1.2.1
nc_time_axis: None
PseudoNetCDF: None
rasterio: 1.1.5
cfgrib: None
iris: 2.4.0
bottleneck: 1.3.2
dask: 2.30.0
distributed: 2.30.0
matplotlib: 3.3.2
cartopy: 0.17.0
seaborn: 0.11.0
numbagg: None
pint: 0.16.1
setuptools: 49.6.0.post20201009
pip: 20.2.4
conda: None
pytest: None
IPython: 7.19.0
sphinx: None


## Quick overview plot

In [None]:
ds['T_sfc'].mean(dim='time').plot()

## Test write to zarr

In [None]:
outdir = os.path.join(gcsdir, ampsdir, amps_ver, domain, 'zarr-cf/')
fs.glob(outdir)

In [None]:
import json

with open('secrets/ldeo-glaciology-bc97b12df06b.json') as token_file:
    token = json.load(token_file)
# gcs = gcsfs.GCSFileSystem(token=token)

amps_mapper = fsspec.get_mapper(outdir + 'test_20161225-cf.zarr', mode='ab',
                            token=token)
ds.to_zarr(amps_mapper, mode='w');

## Close your cluster, be a good denizen.

In [None]:
cluster.shutdown()