# Create Zarr of 39 years of monthly CSSP China data [1851-1889]

In [1]:
import iris
import os
import sys
import logging
import xarray as xr
import numpy as np

import crd_utils as crd
import umdates_utils as um

from datetime import datetime, timedelta

## Create a list of all the files we want to process

In [2]:
# monthly data filenames: name-sorted listing of the directory that holds
# the monthly files, followed by a count of the entries found
filepath = '/data/cssp-china/mini-dataset-24-01-19/20CR/monthly'
files = sorted(os.listdir(filepath))
print(len(files))

1920


In [8]:
# generate all possible filenames for the time period
# (despite its name, `decade_filenames` spans 1851-1889, not a single decade)
runid = 'apepd'
startd = datetime(1851, 1, 1)   # 00Z on Jan 01 1851
endd = datetime(1890, 1, 1)     # 00Z on Jan 01 1890
freq = 'pm'  # presumably the monthly-mean file stream; confirm in umdates_utils

decade_filenames = um.UMFileList(runid, startd, endd, freq)
print(len(decade_filenames))

469


In [9]:
# keep only the candidate filenames for the period that actually exist on
# disk, ordered by name
filenames = sorted(set(files) & set(decade_filenames))
print(len(filenames))

469


In [10]:
# prefix every selected filename with the data directory
filepaths = [os.path.join(filepath, name) for name in filenames]
print(len(filepaths))

469


## Set all the Cube, Dataset and Zarr variables we need to process the cubes

In [11]:
%%time
# Load only the first file; the resulting cubes are used below to derive the
# cube names and the coordinate-name mapping
cubelist = iris.load(filepaths[0:1])
# Display the first cube
cubelist[0]

CPU times: user 505 ms, sys: 49.4 ms, total: 554 ms
Wall time: 591 ms


Air Pressure At Sea Level (Pa),grid_latitude,grid_longitude
Shape,219,286
Dimension coordinates,,
grid_latitude,x,-
grid_longitude,-,x
Scalar coordinates,,
forecast_period,"12020.0 hours, bound=(11660.0, 12380.0) hours","12020.0 hours, bound=(11660.0, 12380.0) hours"
forecast_reference_time,1849-12-01 04:00:00,1849-12-01 04:00:00
time,"1851-04-16 00:00:00, bound=(1851-04-01 00:00:00, 1851-05-01 00:00:00)","1851-04-16 00:00:00, bound=(1851-04-01 00:00:00, 1851-05-01 00:00:00)"
Attributes,,
STASH,m01s16i222,m01s16i222


In [12]:
# Print the one-line summary of every cube loaded from the first file
print(cubelist)

0: air_pressure_at_sea_level / (Pa)    (grid_latitude: 219; grid_longitude: 286)
1: air_temperature / (K)               (grid_latitude: 219; grid_longitude: 286)
2: air_temperature / (K)               (grid_latitude: 219; grid_longitude: 286)
3: air_temperature / (K)               (grid_latitude: 219; grid_longitude: 286)
4: air_temperature / (K)               (pressure: 17; grid_latitude: 219; grid_longitude: 286)
5: cloud_area_fraction / (1)           (grid_latitude: 219; grid_longitude: 286)
6: geopotential_height / (m)           (pressure: 17; grid_latitude: 219; grid_longitude: 286)
7: lagrangian_tendency_of_air_pressure / (Pa s-1) (pressure: 17; grid_latitude: 218; grid_longitude: 286)
8: precipitation_flux / (kg m-2 s-1)   (grid_latitude: 219; grid_longitude: 286)
9: relative_humidity / (%)             (grid_latitude: 219; grid_longitude: 286)
10: relative_humidity / (%)             (pressure: 17; grid_latitude: 219; grid_longitude: 286)
11: specific_humidity / (1)             (

In [13]:
# Names of the cubes in the single-file cubelist, in cubelist order
CUBENAMES = [cube.name() for cube in cubelist]
# Coordinates gathered from all cubes by crd.unique_coords_list
# (presumably de-duplicated by coordinate equality; confirm in crd_utils)
UNIQUE_COORDS = crd.unique_coords_list(cubelist)
# New coordinate names derived from UNIQUE_COORDS; passed to crd.rename_cubes
# further down
COORD_NAME_MAPPING = crd.get_new_coord_names(UNIQUE_COORDS)

In [14]:
# Show the name of each entry in UNIQUE_COORDS; a name can appear more than
# once (presumably coordinates that share a name but differ in values)
print([coord.name() for coord in UNIQUE_COORDS])

['grid_latitude', 'grid_longitude', 'forecast_period', 'forecast_reference_time', 'time', 'forecast_period', 'forecast_reference_time', 'height', 'pressure', 'forecast_period', 'forecast_reference_time', 'grid_latitude', 'grid_longitude', 'height']


In [15]:
# Inspect element 1 of the mapping (judging by the output, the new
# suffixed coordinate names)
COORD_NAME_MAPPING[1]

('forecast_period_1',
 'forecast_reference_time_1',
 'forecast_period_2',
 'forecast_reference_time_2',
 'grid_latitude_1',
 'grid_longitude_1',
 'height_1')

In [16]:
# Location of the Zarr store to create and append to
ZARR = '/data/cssp-china/zarr_monthly_1851-1889'
# Chunk size per dimension, passed to crd.ds_to_zarr. The spatial sizes match
# the full grid extents shown in the cube summaries (219 x 286, and 218 x 286
# for the "_1" grid).
CHUNKS = dict(
    time=200,
    grid_latitude=219,
    grid_longitude=286,
    grid_latitude_1=218,
    grid_longitude_1=286,
)

## Initialise logging

In [17]:
# Send log records of level DEBUG and above to LOGFILE, each line prefixed
# with a day/month/year hour:minute:second timestamp
LOGFILE = '/data/cssp-china/zarr_append_monthly.log'
logging.basicConfig(filename=LOGFILE,
                    level=logging.DEBUG,
                    format='%(asctime)s %(message)s',
                    datefmt='%d/%m/%Y %H:%M:%S')

In [18]:
# Create the log file if it is not there yet
if not os.path.isfile(LOGFILE):
    os.mknod(LOGFILE)

# An empty file means the log has not been used before: write a first entry
# and tell the user where the log lives
if os.path.getsize(LOGFILE) == 0:
    logging.info('Initiate log')
    print(f'Log initiated at {LOGFILE}')

Log initiated at /data/cssp-china/zarr_append_monthly.log


## Create a Zarr from the first 3 monthly files, which amounts to 3 time steps (~3 months)

In [19]:
%%time
# Load the first three monthly files; the resulting cubes carry a time
# dimension of length 3
cubelist0 = iris.load(filepaths[0:3])
# Display the second cube in the list
cubelist0[1]

CPU times: user 1.46 s, sys: 119 ms, total: 1.58 s
Wall time: 2.34 s


Air Temperature (K),time,grid_latitude,grid_longitude
Shape,3,219,286
Dimension coordinates,,,
time,x,-,-
grid_latitude,-,x,-
grid_longitude,-,-,x
Auxiliary coordinates,,,
forecast_period,x,-,-
Scalar coordinates,,,
forecast_reference_time,1849-12-01 00:00:00,1849-12-01 00:00:00,1849-12-01 00:00:00
height,1.5 m,1.5 m,1.5 m


In [20]:
# Print the one-line summary of every cube loaded from the first three files
print(cubelist0)

0: air_pressure_at_sea_level / (Pa)    (time: 3; grid_latitude: 219; grid_longitude: 286)
1: air_temperature / (K)               (time: 3; grid_latitude: 219; grid_longitude: 286)
2: air_temperature / (K)               (time: 3; grid_latitude: 219; grid_longitude: 286)
3: air_temperature / (K)               (time: 3; grid_latitude: 219; grid_longitude: 286)
4: air_temperature / (K)               (time: 3; pressure: 17; grid_latitude: 219; grid_longitude: 286)
5: cloud_area_fraction / (1)           (time: 3; grid_latitude: 219; grid_longitude: 286)
6: geopotential_height / (m)           (time: 3; pressure: 17; grid_latitude: 219; grid_longitude: 286)
7: lagrangian_tendency_of_air_pressure / (Pa s-1) (time: 3; pressure: 17; grid_latitude: 218; grid_longitude: 286)
8: precipitation_flux / (kg m-2 s-1)   (time: 3; grid_latitude: 219; grid_longitude: 286)
9: relative_humidity / (%)             (time: 3; grid_latitude: 219; grid_longitude: 286)
10: relative_humidity / (%)             (time: 

In [21]:
%%time
# Rename the cubes and their coordinates using CUBENAMES and
# COORD_NAME_MAPPING; prints a per-cube summary of what was renamed.
# dryrun=False presumably means the renames are applied to cubelist0 in
# place; confirm in crd_utils.
crd.rename_cubes(cubelist0, CUBENAMES, COORD_NAME_MAPPING, dryrun=False)

air_pressure_at_sea_level
  x time
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
air_temperature -> air_temperature_at_2m_mean
  x time
  x grid_latitude
  x grid_longitude
    forecast_reference_time_1
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m
  x time
  x grid_latitude
  x grid_longitude
    forecast_reference_time_1
  x height
  x forecast_period
air_temperature -> air_temperature_at_2m
  x time
  x grid_latitude
  x grid_longitude
    forecast_reference_time_1
  x height
  x forecast_period
air_temperature -> air_temperature_at_pressure_mean
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
cloud_area_fraction
  x time
  x grid_latitude
  x grid_longitude
    forecast_reference_time_2
  x forecast_period
geopotential_height
  x time
  x pressure
  x grid_latitude
  x grid_longitude
  x forecast_reference_time
  x forecast_period
lagrangian_tendency_of_air_pr

In [22]:
%%time
# Convert each cube to a DataArray (via crd.cubelist_to_dalist) and merge them
# into one Dataset.
# NOTE(review): this merge currently fails with
#   MergeError: conflicting values for variable 'forecast_period'
# i.e. the DataArrays carry 'forecast_period' coordinates with different
# values under the same name. Presumably the rename step above left
# 'forecast_period' un-suffixed on cubes where it differs; confirm before
# resorting to compat='override', which would silently keep only one of the
# conflicting coordinates.
dalist0 = crd.cubelist_to_dalist(cubelist0)
ds0 = xr.merge(dalist0)

MergeError: conflicting values for variable 'forecast_period' on objects to be combined. You can skip this check by specifying compat='override'.

## BREAK: Conflicts with merging into a Dataset

--------

In [None]:
# Display the merged Dataset
ds0

In [None]:
%%time
# Log the start of the store creation, then write the initial Dataset to ZARR
# using the configured chunk sizes
logging.info(f'Creating {ZARR}')
crd.ds_to_zarr(ds0, ZARR, chunks=CHUNKS)

In [None]:
# Re-open the store that was just written and display it to check its contents
dsz0 = xr.open_zarr(ZARR)
dsz0

In [None]:
# Size of the Dataset in GB (decimal: nbytes divided by 1e9)
dsz0.nbytes / 1e9

In [None]:
# Record the successful creation of the store, which files went into it and
# the chunking that was used.
logging.info(f'  Successful creation of {ZARR}')
# The store was built from filepaths[0:3]: the end label 3 is exclusive, so
# the last file actually processed is filenames[2].
logging.info(f'  Processed filenames 0:{filenames[0]} - 3:{filenames[2]}')
logging.info(f'  Chunking {CHUNKS}')

## Now loop through the remaining cubes and append to the Zarr we created

In [None]:
def logprint(message):
    """Record `message` in the log at INFO level, then echo it to stdout."""
    for emit in (logging.info, print):
        emit(message)

In [None]:
%%time
# Append the remaining files to the Zarr in batches of `step` files, so as to
# not use too much memory at one time.
# The initial store was created from filepaths[0:3], so appending resumes at
# index 3; starting any later would leave a gap in the time axis.
start = 3
step = 2
stop = len(filepaths)
# For a short trial run, limit the loop to two batches instead:
# stop = start+(2*step)

logprint(f'Appending data to {ZARR}')

for i in range(start, stop, step):
    # The last batch may hold fewer than `step` files
    fnames = filenames[i:i+step]
    logprint(f'  Processing filenames {i}:{fnames[0]} - {min(i+step, stop)}:{fnames[-1]}')
    try:
        cubelist = iris.load(filepaths[i:i+step])
        logprint('  | Loaded files successfully')

        # Apply the same cube/coordinate names as used for the initial store
        crd.rename_cubes(cubelist, CUBENAMES, COORD_NAME_MAPPING, dryrun=False, verbose=False)
        logprint('  | Renamed files successfully')

        dalist = crd.cubelist_to_dalist(cubelist)
        ds = xr.merge(dalist)
        logprint('  | Created dataset successfully')

        # crd.ds_to_zarr is expected to append when the store already exists
        # (confirm in crd_utils)
        crd.ds_to_zarr(ds, ZARR, chunks=CHUNKS)
        logprint(f'  | Appended to Zarr {ZARR} successfully')

    except Exception as e:
        # Log the failure, then stop: continuing would leave a hole in the
        # time axis of the store. A bare `raise` keeps the original traceback.
        logprint(f'  X ERROR: {e}')
        raise

logprint('  Appending data complete')

In [None]:
# Re-open the store after appending and display it
dsz1 = xr.open_zarr(ZARR)
dsz1

In [None]:
# Convert the surface_air_pressure variable back to an iris cube and display
# it; its time coordinate is examined in the following cells
cubez = dsz1.surface_air_pressure.to_iris()
cubez

In [None]:
# Step between each pair of consecutive time points (in the units of the
# time coordinate)
deltas = np.diff(cubez.coord('time').points)
deltas

In [None]:
# Position of the first time step that is exactly 240 (in the time
# coordinate's units); raises ValueError if no such step exists.
# NOTE(review): 240 looks like a gap size carried over from a different
# dataset; confirm it is the value to search for with monthly data.
list(deltas).index(240)