# How many unique dims are in the "mini" dataset?
### Where the "mini" dataset represents 10% of the total CSSP China dataset (~3.5TB) utilised by 90% of users.

Strategy:
1. For each of the time frequencies of the data
2. Run through all the cubes loaded from .pp
3. Comparing each dim to a list of unique_dims
4. Add any more unique dims found
5. At the end, eye-ball the unique dim list to determine how many 'domains' exist in the data

Hypothesis: Only two dims exist, which are based on two spatial grids (because vector grids are different from scalar grids).

In [1]:
import iris
import os
import xarray as xr
import numpy as np

import crd_utils as crd

### Let's start with daily data to develop the approach

In [2]:
# Variables for loading data
filepath = '/data/cssp-china/mini-dataset-24-01-19/20CR/daily'
files = sorted(os.listdir(filepath))

In [None]:
STASH_TEMP = 'm01s16i203'
STASH_PRECIP = 'm01s05i216'
STASH_UWIND = 'm01s03i225'
STASH_VWIND = 'm01s03i226'

In [4]:
cubelist = crd.file_to_cubelist(files[1], filepath)
cubelist

Cubelist loaded from apepda.pa511f0.pp


[<iris 'Cube' of m01s05i217 / (unknown) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of air_pressure_at_sea_level / (Pa) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of air_temperature / (K) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of air_temperature / (K) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of air_temperature / (K) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of air_temperature / (K) (time: 10; pressure: 14; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of cloud_area_fraction / (1) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of geopotential_height / (m) (time: 10; pressure: 14; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of precipitation_flux / (kg m-2 s-1) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of relative_humidity / (%) (time: 10; grid_latitude: 219; grid_longitude: 286)>,
<iris 'Cube' of r

In [32]:
for cube in cubelist:
    print(cube.name())
    for dim in cube.dim_coords:
        print(f'    {dim.name()}')

m01s05i217
    time
    grid_latitude
    grid_longitude
air_pressure_at_sea_level
    time
    grid_latitude
    grid_longitude
air_temperature
    time
    grid_latitude
    grid_longitude
air_temperature
    time
    grid_latitude
    grid_longitude
air_temperature
    time
    grid_latitude
    grid_longitude
air_temperature
    time
    pressure
    grid_latitude
    grid_longitude
cloud_area_fraction
    time
    grid_latitude
    grid_longitude
geopotential_height
    time
    pressure
    grid_latitude
    grid_longitude
precipitation_flux
    time
    grid_latitude
    grid_longitude
relative_humidity
    time
    grid_latitude
    grid_longitude
relative_humidity
    time
    grid_latitude
    grid_longitude
relative_humidity
    time
    grid_latitude
    grid_longitude
relative_humidity
    time
    pressure
    grid_latitude
    grid_longitude
specific_humidity
    time
    grid_latitude
    grid_longitude
surface_air_pressure
    time
    grid_latitude
    grid_longitude


In [26]:
unique_dims_daily = []
for cube in cubelist[0:4]:
    for dim in cube.dim_coords:
        if not dim in unique_dims_daily:
            unique_dims_daily.append(dim)

len(unique_dims_daily)

3

In [33]:
unique_dims_daily = []
for cube in cubelist:
    for dim in cube.dim_coords:
        if not dim in unique_dims_daily:
            unique_dims_daily.append(dim)

len(unique_dims_daily)

6

In [34]:
for dim in unique_dims_daily:
    print(dim.name())

time
grid_latitude
grid_longitude
pressure
grid_latitude
grid_longitude


### As suspected, there are only two grid systems for daily data
That was for daily data, now we can check it for all the other time frequencies

In [3]:
def unique_dims(cubelist):
    unique = []
    for cube in cubelist:
        for dim in cube.dim_coords:
            if not dim in unique:
                unique.append(dim)
    return unique

def unique_coords(cubelist):
    unique = []
    for cube in cubelist:
        for coord in cube.coords():
            if not coord in unique:
                unique.append(coord)
    return unique

In [4]:
os.listdir('/data/cssp-china/mini-dataset-24-01-19/20CR/')

['variableslist.pdf',
 '.ipynb_checkpoints',
 '3hrly',
 '6hrly',
 'daily',
 'hourly',
 'monthly']

In [5]:
# 1 hourly data
filepath_1h = '/data/cssp-china/mini-dataset-24-01-19/20CR/hourly'
files_1h = sorted(os.listdir(filepath_1h))
cubelist_1h = crd.file_to_cubelist(files_1h[1], filepath_1h)
print(f'Number of cubes: {len(cubelist_1h)}')

Cubelist loaded from apepda.pj511f0.pp
Number of cubes: 3


In [6]:
unique_coords_1h = unique_coords(cubelist_1h)
print(f'Number of unique coords in 1 hourly data: {len(unique_coords_1h)}')
for coord in unique_coords_1h:
    print(f'    {coord.name()}')

Number of unique coords in 1 hourly data: 8
    time
    grid_latitude
    grid_longitude
    forecast_reference_time
    forecast_period
    grid_latitude
    grid_longitude
    height


In [7]:
# 3 hourly data
filepath_3h = '/data/cssp-china/mini-dataset-24-01-19/20CR/3hrly'
files_3h = sorted(os.listdir(filepath_3h))
cubelist_3h = crd.file_to_cubelist(files_3h[1], filepath_3h)
print(f'Number of cubes: {len(cubelist_3h)}')

Cubelist loaded from apepda.pc511f0.pp
Number of cubes: 8


In [8]:
unique_coords_3h = unique_coords(cubelist_3h)
print(f'Number of unique coords in 3 hourly data: {len(unique_coords_3h)}')
for coord in unique_coords_3h:
    print(f'    {coord.name()}')

Number of unique coords in 3 hourly data: 8
    time
    grid_latitude
    grid_longitude
    forecast_reference_time
    forecast_period
    height
    time
    forecast_period


In [9]:
# 6 hourly data
filepath_6h = '/data/cssp-china/mini-dataset-24-01-19/20CR/6hrly'
files_6h = sorted(os.listdir(filepath_6h))
cubelist_6h = crd.file_to_cubelist(files_6h[1], filepath_6h)
print(f'Number of cubes: {len(cubelist_6h)}')

Cubelist loaded from apepda.pb511f0.pp
Number of cubes: 12


In [10]:
unique_coords_6h = unique_coords(cubelist_6h)
print(f'Number of unique coords in 6 hourly data: {len(unique_coords_6h)}')
for coord in unique_coords_6h:
    print(f'    {coord.name()}')

Number of unique coords in 6 hourly data: 12
    time
    grid_latitude
    grid_longitude
    forecast_reference_time
    forecast_period
    time
    pressure
    forecast_period
    grid_latitude
    grid_longitude
    height
    height


In [11]:
# daily data
filepath_d = '/data/cssp-china/mini-dataset-24-01-19/20CR/daily'
files_d = sorted(os.listdir(filepath_d))
cubelist_d = crd.file_to_cubelist(files_d[1], filepath_d)
print(f'Number of cubes: {len(cubelist_d)}')

Cubelist loaded from apepda.pa511f0.pp
Number of cubes: 24


In [12]:
unique_coords_d = unique_coords(cubelist_d)
print(f'Number of unique coords in daily data: {len(unique_coords_d)}')
for coord in unique_coords_d:
    print(f'    {coord.name()}')

Number of unique coords in daily data: 10
    time
    grid_latitude
    grid_longitude
    forecast_reference_time
    forecast_period
    height
    pressure
    grid_latitude
    grid_longitude
    height


In [13]:
# monthly data
filepath_m = '/data/cssp-china/mini-dataset-24-01-19/20CR/monthly'
files_m = sorted(os.listdir(filepath_m))
cubelist_m = crd.file_to_cubelist(files_m[1], filepath_m)
print(f'Number of cubes: {len(cubelist_m)}')

Cubelist loaded from apepda.pm51aug.pp
Number of cubes: 20


In [14]:
unique_coords_m = unique_coords(cubelist_m)
print(f'Number of unique coords in monthly data: {len(unique_coords_m)}')
for coord in unique_coords_m:
    print(f'    {coord.name()}')

Number of unique coords in monthly data: 14
    grid_latitude
    grid_longitude
    forecast_period
    forecast_reference_time
    time
    forecast_period
    forecast_reference_time
    height
    pressure
    forecast_period
    forecast_reference_time
    grid_latitude
    grid_longitude
    height


### Now we have lists of all the unique coords in each dataset, we can filter cubes based on their unique domains

In [15]:
cubelist_all = cubelist_1h + cubelist_3h + cubelist_6h + cubelist_d + cubelist_m
print(f'Total number of cubes: {len(cubelist_all)}')

Total number of cubes: 67


In [16]:
unique_coords_all = unique_coords(cubelist_all)
print(f'Number of unique coords in all data: {len(unique_coords_all)}')
for coord in unique_coords_all:
    print(f'    {coord.name()}')

Number of unique coords in all data: 27
    time
    grid_latitude
    grid_longitude
    forecast_reference_time
    forecast_period
    grid_latitude
    grid_longitude
    height
    time
    forecast_period
    height
    time
    forecast_period
    time
    forecast_period
    time
    pressure
    forecast_period
    time
    forecast_period
    forecast_period
    forecast_reference_time
    time
    forecast_period
    pressure
    forecast_period
    forecast_reference_time


## Q: If I rename a Dataset then load it into Iris, how does Iris deal with that?

In [17]:
cube = cubelist_d[2]
cube

Air Temperature (K),time,grid_latitude,grid_longitude
Shape,10,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
height,1.5 m,1.5 m,1.5 m


In [18]:
da = crd.cube_to_da(cube)
da

<xarray.DataArray 'air_temperature' (time: 10, grid_latitude: 219, grid_longitude: 286)>
dask.array<filled, shape=(10, 219, 286), dtype=float32, chunksize=(1, 219, 286), chunktype=numpy.ndarray>
Coordinates:
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-01-14T12:00:00
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
    forecast_reference_time  datetime64[ns] ...
    height                   float64 ...
    forecast_period          (time) timedelta64[ns] ...
Attributes:
    standard_name:  air_temperature
    units:          K
    source:         Data from Met Office Unified Model
    STASH:          m01s03i236
    cell_methods:   time: mean (interval: 1 hour)

In [21]:
da = da.rename('air_temperature_daily')
da

<xarray.DataArray 'air_temperature_daily' (time: 10, grid_latitude: 219, grid_longitude: 286)>
dask.array<filled, shape=(10, 219, 286), dtype=float32, chunksize=(1, 219, 286), chunktype=numpy.ndarray>
Coordinates:
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-01-14T12:00:00
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
    forecast_reference_time  datetime64[ns] ...
    height                   float64 ...
    forecast_period          (time) timedelta64[ns] ...
Attributes:
    standard_name:  air_temperature
    units:          K
    source:         Data from Met Office Unified Model
    STASH:          m01s03i236
    cell_methods:   time: mean (interval: 1 hour)

In [24]:
cube_ = da.to_iris()
cube_

Air Temperature (K),time,grid_latitude,grid_longitude
Shape,10,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
height,1.5 m,1.5 m,1.5 m


In [30]:
print(cube.standard_name)
print(cube.var_name)
print(cube.long_name)

air_temperature
None
None


## OK, then what happens if I write a renamed xr.DataArray to zarr?

In [31]:
da

<xarray.DataArray 'air_temperature_daily' (time: 10, grid_latitude: 219, grid_longitude: 286)>
dask.array<filled, shape=(10, 219, 286), dtype=float32, chunksize=(1, 219, 286), chunktype=numpy.ndarray>
Coordinates:
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-01-14T12:00:00
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
    forecast_reference_time  datetime64[ns] ...
    height                   float64 ...
    forecast_period          (time) timedelta64[ns] ...
Attributes:
    standard_name:  air_temperature
    units:          K
    source:         Data from Met Office Unified Model
    STASH:          m01s03i236
    cell_methods:   time: mean (interval: 1 hour)

In [32]:
ds = da.to_dataset()
ds

<xarray.Dataset>
Dimensions:                  (grid_latitude: 219, grid_longitude: 286, time: 10)
Coordinates:
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-01-14T12:00:00
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
    forecast_reference_time  datetime64[ns] ...
    height                   float64 ...
    forecast_period          (time) timedelta64[ns] ...
Data variables:
    air_temperature_daily    (time, grid_latitude, grid_longitude) float32 dask.array<chunksize=(1, 219, 286), meta=np.ndarray>

In [34]:
zarr_store = '../zarr_rename'
crd.ds_to_zarr(ds, zarr_store)

Written dataset to ../zarr_rename


In [36]:
ds_z = xr.open_zarr(zarr_store)
ds_z

<xarray.Dataset>
Dimensions:                  (grid_latitude: 219, grid_longitude: 286, time: 10)
Coordinates:
    forecast_period          (time) timedelta64[ns] dask.array<chunksize=(10,), meta=np.ndarray>
    forecast_reference_time  datetime64[ns] ...
  * grid_latitude            (grid_latitude) float32 22.88 22.66 ... -25.08
  * grid_longitude           (grid_longitude) float32 323.48 323.7 ... 386.18002
    height                   float64 ...
  * time                     (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-01-14T12:00:00
Data variables:
    air_temperature_daily    (time, grid_latitude, grid_longitude) float32 dask.array<chunksize=(10, 219, 286), meta=np.ndarray>

In [39]:
cube_z = ds_z.air_temperature_daily.to_iris()
print(cube_z.standard_name)
print(cube_z.var_name)
print(cube_z.long_name)
print(cube_z.name())
display(cube_z)

air_temperature
air_temperature_daily
None
air_temperature


Air Temperature (K),time,grid_latitude,grid_longitude
Shape,10,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
height,1.5 m,1.5 m,1.5 m


In [38]:
cube_z.name()

'air_temperature'

### Renaming in Xarray, saving to Zarr, then loading back into Xarray then Iris works without losing metadata or CF names
Therefore a strategy of renaming the dims and cubes before saving to zarr using xarray looks like it'll work