# Attempt full pipeline
Including:
1. Loading a .pp file
2. Renaming any repeated coords
3. Renaming any repeated cubes
4. Writing to Zarr using Xarray
5. Reading Zarr back using Xarray and pulling out Iris cube

In [1]:
import iris
import os
import xarray as xr
import numpy as np

import copy

import crd_utils as crd

In [2]:
def unique_coords_list(cubelist):
    '''
    Collect the distinct coords found across all cubes in ``cubelist``.

    Coords are compared by equality (``in``), so two coords sharing a name
    but differing in any compared property both appear in the result.
    First-seen order is preserved. The returned list holds deep copies, so
    later changes to the cubes' coords do not affect it.
    '''
    distinct = []
    every_coord = (coord for cube in cubelist for coord in cube.coords())
    for candidate in every_coord:
        if candidate in distinct:
            continue
        distinct.append(candidate)
    return copy.deepcopy(distinct)

def get_new_coord_names(coords, verbose=False):
    '''
    Work out new names for coords whose name repeats an earlier coord's name.

    The first coord with a given name keeps it; the k-th repeat (k >= 1) is
    assigned ``f'{name}_{k}'``.

    Parameters
    ----------
    coords : iterable of objects with a ``name()`` method
    verbose : bool
        If True, print the running list of names after each coord and once
        more at the end.

    Returns
    -------
    tuple
        A pair ``(coords_to_rename, new_names)`` of equal-length tuples.
        When no name is repeated the pair is ``((), ())``, so callers can
        always index ``[0]`` and ``[1]``.
    '''
    names = []
    renamed = []
    for coord in coords:
        name = coord.name()
        names.append(name)
        n = names.count(name)
        if n > 1:
            new_name = f'{name}_{n-1}'
            renamed.append((coord, new_name))
        if verbose:
            print(f'Names: {names}')
    if verbose:
        print(f'Names: {names}')
    if not renamed:
        # zip(*[]) would yield an empty tuple, which breaks indexing by callers.
        return ((), ())
    return tuple(zip(*renamed))

def get_new_cubename(cube):
    '''
    Build a descriptive name for ``cube`` from its identity, vertical coord
    and cell methods, joined with underscores.

    The base is ``cube.standard_name`` or, when that is falsy, the string
    form of ``cube.attributes['STASH']`` (a KeyError is raised if that
    attribute is missing). ``cube.name()`` is deliberately not used: it leads
    to repeated cell_method suffixes for anonymous cubes.

    Suffixes, in order:
      * ``at_pressure`` if the cube has a coord named 'pressure';
      * ``at_height`` if its 'height' coord has more than one point,
        otherwise ``at_<rounded height><units>``;
      * each cell method's ``method`` with 'imum' removed
        (e.g. 'maximum' -> 'max').
    '''
    base = cube.standard_name or str(cube.attributes['STASH'])
    parts = [base]
    present = {coord.name() for coord in cube.coords()}

    if 'pressure' in present:
        parts.append('at_pressure')

    if 'height' in present:
        height_coord = cube.coord('height')
        if len(height_coord.points) > 1:
            parts.append('at_height')
        else:
            level = str(int(height_coord.points[0].round()))
            unit_text = str(height_coord.units)
            parts.append('at_' + level + unit_text)

    parts.extend(cm.method.replace('imum', '') for cm in cube.cell_methods)

    return '_'.join(parts)

In [6]:
def rename_cubes(cubelist, cubenames=None, new_coordnames=None, dryrun=False, verbose=True):
    '''
    Rename cubes and coordinates in place where necessary.

    A cube gets a new ``var_name`` (from ``get_new_cubename``) when it has no
    ``standard_name`` or when its name occurs more than once in ``cubenames``.
    A coord gets a new ``var_name`` when it compares equal to an entry in the
    coord mapping.

    Parameters
    ----------
    cubelist : iterable of cubes
    cubenames : list of str, optional
        Names used to detect duplicates. Defaults to the current
        ``cube.name()`` of every cube in ``cubelist``.
    new_coordnames : tuple, optional
        Pair ``(coords, new_names)`` as returned by ``get_new_coord_names``.
        Defaults to the mapping derived from ``cubelist``. An empty mapping
        means no coord is renamed.
    dryrun : bool
        If True, print what would be renamed but modify nothing.
    verbose : bool
        If True, print each cube and coord as it is processed. A dry run
        always prints, regardless of this flag.
    '''
    if cubenames is None:
        cubenames = [cube.name() for cube in cubelist]

    if new_coordnames is None:
        new_coordnames = get_new_coord_names(unique_coords_list(cubelist))

    # An empty mapping (no repeated coord names) previously raised IndexError
    # on new_coordnames[0]; treat it as "nothing to rename".
    if len(new_coordnames) == 0:
        coords_to_rename, coord_new_names = (), ()
    else:
        coords_to_rename, coord_new_names = new_coordnames[0], new_coordnames[1]

    report = dryrun or verbose

    for cube in cubelist:
        # Rename cube if duplicate or unknown
        if cube.standard_name is None or cubenames.count(cube.name()) > 1:
            new_name = get_new_cubename(cube)
            if report:
                print(f'{cube.name()} -> {new_name}')
            if not dryrun:
                cube.var_name = new_name
        elif report:
            print(f'{cube.name()}')

        # Rename coords
        for coord in cube.coords():
            if coord in coords_to_rename:
                new_name = coord_new_names[coords_to_rename.index(coord)]
                if not dryrun:
                    coord.var_name = new_name
                if report:
                    print(f'    {new_name}')
            elif report:
                # 'x' marks a coord that is left unchanged
                print(f'  x {coord.name()}')

In [7]:
def cubelist_to_dalist(cubelist):
    '''
    Convert every Iris cube in ``cubelist`` to an ``xr.DataArray``.

    Returns a new list, in the same order as the input.
    '''
    return [xr.DataArray.from_iris(cube) for cube in cubelist]

## Load data
Daily to start with

In [22]:
# daily data
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
filepath_d = '/data/cssp-china/mini-dataset-24-01-19/20CR/daily'
files_d = sorted(os.listdir(filepath_d))
# Use the second entry (index 1) of the sorted directory listing.
# crd.file_to_cubelist is defined outside this notebook; presumably it loads
# the named file from filepath_d into an Iris cubelist — TODO confirm.
cubelist_d = crd.file_to_cubelist(files_d[1], filepath_d)
print(f'Number of cubes: {len(cubelist_d)}')

Cubelist loaded from apepda.pa511f0.pp
Number of cubes: 24


In [9]:
# Collect every distinct coord (compared by equality) across the daily cubes.
# A name printed more than once means several unequal coords share that name.
unique_coords_d = unique_coords_list(cubelist_d)
print(f'Number of unique coords in daily data: {len(unique_coords_d)}')
for coord in unique_coords_d:
    print(f'    {coord.name()}')

Number of unique coords in daily data: 10
    time
    grid_latitude
    grid_longitude
    forecast_reference_time
    forecast_period
    height
    pressure
    grid_latitude
    grid_longitude
    height


In [10]:
# Snapshot of the cube names before any renaming; passed to rename_cubes as `cubenames`.
CUBENAMES = [cube.name() for cube in cubelist_d]

In [11]:
# Pair of (coords to rename, new names) for coords whose name repeats an earlier one.
COORD_NAME_MAPPING = get_new_coord_names(unique_coords_d)
# Display just the new names.
COORD_NAME_MAPPING[1]

('grid_latitude_1', 'grid_longitude_1', 'height_1')

In [23]:
%%time
# Dry run with the precomputed name list and coord mapping: prints the planned
# renames (a dry run prints even with verbose=False) and modifies nothing.
rename_cubes(cubelist_d, CUBENAMES, COORD_NAME_MAPPING, dryrun=True, verbose=False)

m01s05i217 -> m01s05i217_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature -> air_temperature_at_2m_mean
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_min
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_pressure_mean
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
cloud_area_fraction
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
geopotential_height
  x time
  x 

In [24]:
%%time
# Same dry run, but rename_cubes derives the cube names and coord mapping itself;
# presumably timed to compare against the precomputed-arguments call.
rename_cubes(cubelist_d, dryrun=True, verbose=False)

m01s05i217 -> m01s05i217_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature -> air_temperature_at_2m_mean
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_min
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_pressure_mean
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
cloud_area_fraction
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
geopotential_height
  x time
  x 

In [14]:
# Inspect the unique coords: for each, print name(), standard_name and var_name.
# In notebook order this cell precedes the non-dry-run rename_cubes call.
u_coords_1 = unique_coords_list(cubelist_d)
print(len(u_coords_1))
for coord in u_coords_1:
    print(coord.name())
    print(f'    {coord.standard_name}')
    print(f'    {coord.var_name}')

10
time
    time
    None
grid_latitude
    grid_latitude
    None
grid_longitude
    grid_longitude
    None
forecast_reference_time
    forecast_reference_time
    None
forecast_period
    forecast_period
    None
height
    height
    None
pressure
    None
    None
grid_latitude
    grid_latitude
    None
grid_longitude
    grid_longitude
    None
height
    height
    None


In [15]:
# Apply the renames in place (dryrun=False). verbose defaults to True, so each
# cube and coord is also printed as it is processed.
rename_cubes(cubelist_d, CUBENAMES, COORD_NAME_MAPPING, dryrun=False)

m01s05i217 -> m01s05i217_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature -> air_temperature_at_2m_mean
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m_min
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
air_temperature -> air_temperature_at_pressure_mean
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
cloud_area_fraction
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
geopotential_height
  x time
  x 

In [16]:
# Re-inspect the unique coords after the in-place rename: renamed coords should
# now show a non-None var_name.
u_coords_2 = unique_coords_list(cubelist_d)
print(len(u_coords_2))
for coord in u_coords_2:
    print(coord.name())
    print(f'    {coord.standard_name}')
    print(f'    {coord.var_name}')

10
time
    time
    None
grid_latitude
    grid_latitude
    None
grid_longitude
    grid_longitude
    None
forecast_reference_time
    forecast_reference_time
    None
forecast_period
    forecast_period
    None
height
    height
    None
pressure
    None
    None
grid_latitude
    grid_latitude
    grid_latitude_1
grid_longitude
    grid_longitude
    grid_longitude_1
height
    height
    height_1


In [17]:
# Convert each cube to an xarray DataArray, then merge them into one Dataset.
dalist = cubelist_to_dalist(cubelist_d)
ds = xr.merge(dalist)
# Display the merged dataset.
ds

<xarray.Dataset>
Dimensions:                                    (grid_latitude: 219, grid_latitude_1: 218, grid_longitude: 286, grid_longitude_1: 286, pressure: 14, time: 10)
Coordinates:
  * time                                       (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-01-14T12:00:00
  * grid_latitude                              (grid_latitude) float32 22.88 ... -25.08
  * grid_longitude                             (grid_longitude) float32 323.48 ... 386.18002
    forecast_reference_time                    datetime64[ns] 1849-12-01
    forecast_period                            (time) timedelta64[ns] 400 days 12:00:00 ... 409 days 12:00:00
    height                                     float64 1.5
  * pressure                                   (pressure) float32 10.0 ... 1000.0
  * grid_latitude_1                            (grid_latitude_1) float32 22.77 ... -24.969997
  * grid_longitude_1                           (grid_longitude_1) float32 323.59003 ... 386.29004
  

In [19]:
# Target path for the Zarr store, relative to the notebook's working directory.
zarr = '../zarr_append_multidim_3'
# crd.ds_to_zarr is defined outside this notebook; presumably it writes `ds`
# to the given path — TODO confirm overwrite/append behaviour.
crd.ds_to_zarr(ds, zarr)
# Re-open the store with xarray to check the round trip.
dsz = xr.open_zarr(zarr)
dsz

Written dataset to ../zarr_append_multidim_3


<xarray.Dataset>
Dimensions:                                    (grid_latitude: 219, grid_latitude_1: 218, grid_longitude: 286, grid_longitude_1: 286, pressure: 14, time: 10)
Coordinates:
    forecast_period                            (time) timedelta64[ns] dask.array<chunksize=(10,), meta=np.ndarray>
    forecast_reference_time                    datetime64[ns] ...
  * grid_latitude                              (grid_latitude) float32 22.88 ... -25.08
  * grid_latitude_1                            (grid_latitude_1) float32 22.77 ... -24.969997
  * grid_longitude                             (grid_longitude) float32 323.48 ... 386.18002
  * grid_longitude_1                           (grid_longitude_1) float32 323.59003 ... 386.29004
    height                                     float64 ...
    height_1                                   float64 ...
  * pressure                                   (pressure) float32 10.0 ... 1000.0
  * time                                       (time) date

In [20]:
# Convert one variable of the re-opened dataset back into an Iris cube.
dsz.air_temperature_at_2m_mean.to_iris()

Air Temperature (K),time,grid_latitude,grid_longitude
Shape,10,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
height,1.5 m,1.5 m,1.5 m


In [21]:
# Same check for a variable that uses the renamed grid coords (grid_latitude_1 / grid_longitude_1).
dsz.y_wind_at_pressure_mean.to_iris()

Y Wind (m s-1),time,pressure,grid_latitude,grid_longitude
Shape,10,14,218,286
Dimension coordinates,,,,
time,x,-,-,-
pressure,-,x,-,-
grid_latitude,-,-,x,-
grid_longitude,-,-,-,x
Auxiliary coordinates,,,,
forecast_period,x,-,-,-
Scalar coordinates,,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00


## Renaming of cubes and coords so that they can be merged into a single `xr.Dataset` was a success!
#### One known bug remains: the scalar `height` coords are attributed to too many data variables