# Create Zarr of 9 years of daily CSSP China data [1851-1859]

In [1]:
import iris
import os
import sys
import xarray as xr
import numpy as np

import crd_utils as crd
import umdates_utils as um

from datetime import datetime, timedelta

## Create a list of all the files we want to process

In [2]:
# daily data filenames
filepath = '/data/cssp-china/mini-dataset-24-01-19/20CR/daily'
files = sorted(os.listdir(filepath))
print(len(files))

5850


In [3]:
# generate all possible filenames for the time period
runid = 'apepd'
startd = datetime(1851, 1, 1)   # 00Z on Jan 01 1851
endd = datetime(1860, 1, 1)     # 00Z on Jan 01 1860
freq = 'pa'

decade_filenames = um.UMFileList(runid, startd, endd, freq)
print(len(decade_filenames))

3288


In [4]:
# find the overlap of actual filenames with all possible filenames in that decade
filenames = list(set(files).intersection(set(decade_filenames)))
filenames.sort()
print(len(filenames))

329


In [5]:
filepaths = [os.path.join(filepath, filename) for filename in filenames]
print(len(filepaths))

329


## Set all the variables we need to process the cubes

In [6]:
%%time
cubelist = iris.load(filepaths[0:1])
cubelist[0]

CPU times: user 1.45 s, sys: 185 ms, total: 1.64 s
Wall time: 2.65 s


M01S05I217 (unknown),time,grid_latitude,grid_longitude
Shape,4,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
Attributes,,,


In [7]:
print(cubelist)

0: m01s05i217 / (unknown)              (time: 4; grid_latitude: 219; grid_longitude: 286)
1: air_pressure_at_sea_level / (Pa)    (time: 4; grid_latitude: 219; grid_longitude: 286)
2: air_temperature / (K)               (time: 4; grid_latitude: 219; grid_longitude: 286)
3: air_temperature / (K)               (time: 4; grid_latitude: 219; grid_longitude: 286)
4: air_temperature / (K)               (time: 4; grid_latitude: 219; grid_longitude: 286)
5: air_temperature / (K)               (time: 4; pressure: 14; grid_latitude: 219; grid_longitude: 286)
6: cloud_area_fraction / (1)           (time: 4; grid_latitude: 219; grid_longitude: 286)
7: geopotential_height / (m)           (time: 4; pressure: 14; grid_latitude: 219; grid_longitude: 286)
8: precipitation_flux / (kg m-2 s-1)   (time: 4; grid_latitude: 219; grid_longitude: 286)
9: relative_humidity / (%)             (time: 4; grid_latitude: 219; grid_longitude: 286)
10: relative_humidity / (%)             (time: 4; grid_latitude: 219; gr

In [8]:
CUBENAMES = [cube.name() for cube in cubelist]
UNIQUE_COORDS = crd.unique_coords_list(cubelist)
COORD_NAME_MAPPING = crd.get_new_coord_names(UNIQUE_COORDS)

In [9]:
print([coord.name() for coord in UNIQUE_COORDS])

['time', 'grid_latitude', 'grid_longitude', 'forecast_reference_time', 'forecast_period', 'height', 'pressure', 'grid_latitude', 'grid_longitude', 'height']


In [10]:
COORD_NAME_MAPPING[1]

('grid_latitude_1', 'grid_longitude_1', 'height_1')

In [11]:
ZARR = '/data/cssp-china/zarr_1851-1859'
CHUNKS = {'time': 200, 'pressure':1, 'grid_latitude': 219, 'grid_longitude': 286, 'grid_latitude_1': 218, 'grid_longitude_1': 286}

## Create a Zarr with 21 cubes, which definitely amounts to 200 time steps (~20 days)

In [12]:
%%time
cubelist0 = iris.load(filepaths[0:21])
cubelist0[7]

CPU times: user 1min 14s, sys: 6.76 s, total: 1min 20s
Wall time: 1min 44s


Geopotential Height (m),time,pressure,grid_latitude,grid_longitude
Shape,204,14,219,286
Dimension coordinates,,,,
time,x,-,-,-
pressure,-,x,-,-
grid_latitude,-,-,x,-
grid_longitude,-,-,-,x
Auxiliary coordinates,,,,
forecast_period,x,-,-,-
Scalar coordinates,,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00


In [13]:
%%time
crd.rename_cubes(cubelist0, CUBENAMES, COORD_NAME_MAPPING, dryrun=False)

m01s05i217 -> m01s05i217_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature -> air_temperature_at_2m_mean
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_min
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_pressure_mean
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
cloud_area_fraction
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
geopotential_height
  x time
  x 

In [14]:
%%time
dalist0 = crd.cubelist_to_dalist(cubelist0)
ds0 = xr.merge(dalist0)

CPU times: user 244 ms, sys: 4.26 ms, total: 249 ms
Wall time: 249 ms


In [15]:
ds0

In [16]:
%%time
crd.ds_to_zarr(ds0, ZARR, chunks=CHUNKS)

Written dataset to /data/cssp-china/zarr_1851-1859
CPU times: user 47.6 s, sys: 10.8 s, total: 58.3 s
Wall time: 2min 4s


In [17]:
dsz0 = xr.open_zarr(ZARR)
dsz0

## Append a couple of cubes without rechunking

In [15]:
%%time
cubelist1 = iris.load(filepaths[21:24])
cubelist1[0]

CPU times: user 11 s, sys: 523 ms, total: 11.5 s
Wall time: 14.8 s


M01S05I217 (unknown),time,grid_latitude,grid_longitude
Shape,30,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
Attributes,,,


In [16]:
%%time
crd.rename_cubes(cubelist1, CUBENAMES, COORD_NAME_MAPPING, dryrun=False)
dalist1 = crd.cubelist_to_dalist(cubelist1)
ds1 = xr.merge(dalist1)
ds1

m01s05i217 -> m01s05i217_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature -> air_temperature_at_2m_mean
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_min
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_pressure_mean
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
cloud_area_fraction
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
geopotential_height
  x time
  x 

In [17]:
%%time
crd.ds_to_zarr(ds1, zarr)

Appended dataset to ../zarr_append_subchunks
CPU times: user 30.8 s, sys: 12.2 s, total: 43.1 s
Wall time: 1min 36s


In [18]:
dsz1 = xr.open_zarr(zarr)
dsz1

## How big are the new Datasets?

In [21]:
dsz0.nbytes / 1e9

4.541737716

In [24]:
# loop through 
step = 12
start = 21
for i in range(start, len(filepaths), step):
    try:
        log(f'attempting to load filepaths {i}-{i+step}')
        cubelist = iris.load(filepaths[i:i+step])
        log(f'loaded files successfully')
        crd.rename_cubes(cubelist, CUBENAMES, COORD_NAME_MAPPING, dryrun=False, verbose=False)
        log(f'renamed files successfully')
        dalist = crd.cubelist_to_dalist(cubelist)
        ds = xr.merge(dalist)
        log(f'created dataset')
        crd.ds_to_zarr(ds, ZARR, chunks=CHUNKS)
        log(f'appended to zarr {ZARR}')
#         print(f'Number of cubes: {len(cubelist)}')
    except:
        print("Unexpected error:", sys.exc_info()[0])

Unexpected error: <class 'KeyboardInterrupt'>


In [29]:
step = 10
start = 21
len(list(range(start, len(filepaths), step)))

31