# Attempt full pipeline
Including:
1. Loading a .pp file
2. Renaming any repeated coords
3. Renaming any repeated cubes
4. Writing to Zarr using Xarray
5. Reading Zarr back using Xarray and pulling out Iris cube

In [41]:
import iris
import os
import xarray as xr
import numpy as np

import copy

import crd_utils as crd

In [42]:
def unique_coords_list(cubelist):
    unique = []
    for cube in cubelist:
        for coord in cube.coords():
            if not coord in unique:
                unique.append(coord)
    return copy.deepcopy(unique)

def get_new_coord_names(coords, verbose=False):
    names = [None]*len(coords)
    renamed = []
    for i, coord in enumerate(coords):
        name = coord.name()
        if verbose:
            print(f'Names: {names}')
            print(f'Name: {name} is in names = {name in names}')
        if name in names:
            n = sum(1 for val in names if val is not None and val.startswith(name))
            new_name = f'{name}_{n}'
            names[i] = new_name
            renamed.append((coord, new_name))
        else:
            names[i] = name
    print(f'Names: {names}')
    return tuple(zip(*renamed))

def rename_unknown_cubes(cubelist):
    for cube in cubelist:
        if cube.standard_name == None:
            cube.var_name = cube.name()
    return cubelist

In [54]:
def rename_cubes(cubelist, cubenames, renamed_coords, dryrun=False):
    for cube in cubelist:
        print(f'{cube.name()}')
        
        # Rename coords
        for coord in cube.coords():
            if coord in renamed_coords[0]:
                name = renamed_coords[1][renamed_coords[0].index(coord)]    #Does .index() behave how we wish?
                if not dryrun:
                    coord.var_name = name
                print(f'    {name}')
            else:
                print(f'  x {coord.name()}')
        
        # Rename cube if duplicate or unknown
        if cube.standard_name == None or cubenames.count(cube.name()) > 1:
            new_name = get_new_cubename(cube)
            print(f'    Cube renamed {cube.name()} -> {new_name}')
            cube.var_name = new_name


In [44]:
def cubelist_to_dalist(cubelist):
    dalist = []
    for cube in cubelist:
        dalist.append(xr.DataArray.from_iris(cube))
    return dalist

In [81]:
def get_new_cubename(cube):
    suffixes = [cube.standard_name or str(cube.attributes['STASH'])]  # cube.name() leads to repeated cell_method suffixes for anonymous cubes
    coord_names = [coord.name() for coord in cube.coords()]
    
    if 'pressure' in coord_names:
        suffixes.append('at_pressure')
    
    if 'height' in coord_names:
        heights = cube.coord('height')
        if len(heights.points) > 1:
            suffixes.append('at_height')
        else:
            height = str(int(heights.points[0].round()))
            units = str(heights.units)
            suffixes.append(f'at_{height}{units}')
    
    for cell_method in cube.cell_methods:
        method = cell_method.method.replace('imum', '')
        suffixes.append(method)
    
    return '_'.join(suffixes)

# CUBENAMES = (cube.name() for cube in cubelist)

# for cube in cubelist:
#     if cube.standard_name == None or CUBENAMES.count(cube.name()) > 1:
#         new_name = get_new_cubename(cube)
#         cube.var_name = new_name
#         display(new_name)

In [46]:
# daily data
filepath_d = '/data/cssp-china/mini-dataset-24-01-19/20CR/daily'
files_d = sorted(os.listdir(filepath_d))
cubelist_d = crd.file_to_cubelist(files_d[1], filepath_d)
print(f'Number of cubes: {len(cubelist_d)}')

Cubelist loaded from apepda.pa511f0.pp
Number of cubes: 24


In [47]:
unique_coords_d = unique_coords_list(cubelist_d)
print(f'Number of unique coords in daily data: {len(unique_coords_d)}')
for coord in unique_coords_d:
    print(f'    {coord.name()}')

Number of unique coords in daily data: 10
    time
    grid_latitude
    grid_longitude
    forecast_reference_time
    forecast_period
    height
    pressure
    grid_latitude
    grid_longitude
    height


In [52]:
CUBENAMES = [cube.name() for cube in cubelist_d]

In [48]:
UNIVERSAL_COORD_MAPPING = get_new_coord_names(unique_coords_d)
UNIVERSAL_COORD_MAPPING[1]

Names: ['time', 'grid_latitude', 'grid_longitude', 'forecast_reference_time', 'forecast_period', 'height', 'pressure', 'grid_latitude_1', 'grid_longitude_1', 'height_1']


('grid_latitude_1', 'grid_longitude_1', 'height_1')

In [55]:
rename_cubes(cubelist_d, CUBENAMES, UNIVERSAL_COORD_MAPPING, dryrun=True)

m01s05i217_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
    Cube renamed m01s05i217_max -> m01s05i217_max_max
air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
    Cube renamed air_temperature -> air_temperature_at_2m_mean
air_temperature
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
    Cube renamed air_temperature -> air_temperature_at_2m_max
air_temperature
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
    Cube renamed air_temperature -> air_temperature_at_2m_min
air_temperature
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
    Cube renamed air_temperature -> air_temp

In [56]:
u_coords_1 = unique_coords_list(cubelist_d)
print(len(u_coords_1))
for coord in u_coords_1:
    print(coord.name())
    print(f'    {coord.standard_name}')
    print(f'    {coord.var_name}')

10
time
    time
    None
grid_latitude
    grid_latitude
    None
grid_longitude
    grid_longitude
    None
forecast_reference_time
    forecast_reference_time
    None
forecast_period
    forecast_period
    None
height
    height
    None
pressure
    None
    None
grid_latitude
    grid_latitude
    None
grid_longitude
    grid_longitude
    None
height
    height
    None


In [58]:
rename_cubes(cubelist_d, CUBENAMES, UNIVERSAL_COORD_MAPPING, dryrun=False)

m01s05i217_max_max
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
    Cube renamed m01s05i217_max_max -> m01s05i217_max_max_max
air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
    Cube renamed air_temperature -> air_temperature_at_2m_mean
air_temperature
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
    Cube renamed air_temperature -> air_temperature_at_2m_max
air_temperature
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x height
  x forecast_period
    Cube renamed air_temperature -> air_temperature_at_2m_min
air_temperature
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
    Cube renamed air_temperature

In [59]:
u_coords_2 = unique_coords_list(cubelist_d)
print(len(u_coords_2))
for coord in u_coords_2:
    print(coord.name())
    print(f'    {coord.standard_name}')
    print(f'    {coord.var_name}')

10
time
    time
    None
grid_latitude
    grid_latitude
    None
grid_longitude
    grid_longitude
    None
forecast_reference_time
    forecast_reference_time
    None
forecast_period
    forecast_period
    None
height
    height
    None
pressure
    None
    None
grid_latitude
    grid_latitude
    grid_latitude_1
grid_longitude
    grid_longitude
    grid_longitude_1
height
    height
    height_1


In [60]:
dalist = cubelist_to_dalist(cubelist_d)
ds = xr.merge(dalist)
ds

<xarray.Dataset>
Dimensions:                                    (grid_latitude: 219, grid_latitude_1: 218, grid_longitude: 286, grid_longitude_1: 286, pressure: 14, time: 10)
Coordinates:
  * time                                       (time) datetime64[ns] 1851-01-05T12:00:00 ... 1851-01-14T12:00:00
  * grid_latitude                              (grid_latitude) float32 22.88 ... -25.08
  * grid_longitude                             (grid_longitude) float32 323.48 ... 386.18002
    forecast_reference_time                    datetime64[ns] 1849-12-01
    forecast_period                            (time) timedelta64[ns] 400 days 12:00:00 ... 409 days 12:00:00
    height                                     float64 1.5
  * pressure                                   (pressure) float32 10.0 ... 1000.0
  * grid_latitude_1                            (grid_latitude_1) float32 22.77 ... -24.969997
  * grid_longitude_1                           (grid_longitude_1) float32 323.59003 ... 386.29004
  

In [62]:
zarr = '../zarr_append_multidim_2'
crd.ds_to_zarr(ds, zarr)
dsz = xr.open_zarr(zarr)
dsz

Written dataset to ../zarr_append_multidim_2


<xarray.Dataset>
Dimensions:                                    (grid_latitude: 219, grid_latitude_1: 218, grid_longitude: 286, grid_longitude_1: 286, pressure: 14, time: 10)
Coordinates:
    forecast_period                            (time) timedelta64[ns] dask.array<chunksize=(10,), meta=np.ndarray>
    forecast_reference_time                    datetime64[ns] ...
  * grid_latitude                              (grid_latitude) float32 22.88 ... -25.08
  * grid_latitude_1                            (grid_latitude_1) float32 22.77 ... -24.969997
  * grid_longitude                             (grid_longitude) float32 323.48 ... 386.18002
  * grid_longitude_1                           (grid_longitude_1) float32 323.59003 ... 386.29004
    height                                     float64 ...
    height_1                                   float64 ...
  * pressure                                   (pressure) float32 10.0 ... 1000.0
  * time                                       (time) date

In [84]:
dsz.air_temperature_at_2m_mean.to_iris()

Air Temperature (K),time,grid_latitude,grid_longitude
Shape,10,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
height,1.5 m,1.5 m,1.5 m


In [85]:
dsz.y_wind_at_pressure_mean.to_iris()

Y Wind (m s-1),time,pressure,grid_latitude,grid_longitude
Shape,10,14,218,286
Dimension coordinates,,,,
time,x,-,-,-
pressure,-,x,-,-
grid_latitude,-,-,x,-
grid_longitude,-,-,-,x
Auxiliary coordinates,,,,
forecast_period,x,-,-,-
Scalar coordinates,,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00


## Renaming of cubes and coords so that they can be merged into a single `xr.Dataset` was a success!
#### Only one bug with `height` scalar coords being attributed to too many data variables