In [1]:
import xarray as xr
import rioxarray as rxr
import shutil
import os
from math import ceil
from tqdm import tqdm

class ChunkWriter():
    """Convert a multi-band GeoTIFF into a Zarr store, one block of rows at a time.

    The raster is opened with ``rioxarray`` and split along ``y`` into blocks of
    ``chunk_size`` rows. Each block is turned into a dataset with three variables
    and appended to the store along ``y``:

    * ``rs``    -- bands 1-4, stored as ``uint16``
    * ``mask``  -- band 5, stored as ``uint8`` (``-9999`` replaced by ``255``)
    * ``label`` -- band 6, stored as ``uint8`` (``-9999`` replaced by ``255``)

    The object is its own iterator: iterating over it removes any existing store
    at ``save_path`` and then writes one chunk per step. ``len()`` gives the
    number of chunks.

    Args:
        tiff_path: Path of the GeoTIFF to read.
        save_path: Path of the Zarr store to (re)create.
        chunk_size: Number of rows (``y``) per written block; also used as the
            ``x``/``y`` chunk length of ``rs`` and ``label``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """

    # Class index -> class name, written into the ``classes`` attribute of ``label``.
    LABEL_CLASSES = {
        0: "ConstructionSite",
        1: "Building",
        2: "BuildingDistortion",
        3: "GreenAreas",
        4: "RoadAsphalt",
        5: "Forest",
        6: "WaterBasin",
        7: "Path",
        8: "MeadowPasture",
        9: "SealedObjects"
    }

    def __init__(self, tiff_path: str, save_path: str, chunk_size: int = 100):
        if chunk_size < 1:
            # A non-positive size would divide by zero in __len__ or never advance.
            raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')

        self.da = rxr.open_rasterio(tiff_path)
        self.save_path = save_path
        self.chunk_size = chunk_size

        self.num_y = len(self.da.y)

        self.current_chunk = 0

    def __iter__(self):
        """Start a fresh run: delete any existing store and rewind to chunk 0."""
        if os.path.exists(self.save_path):
            shutil.rmtree(self.save_path)

        # The store was just removed, so writing has to begin at the first chunk
        # again; otherwise a second iteration would start past the end.
        self.current_chunk = 0

        return self

    def __next__(self) -> int:
        """Write the next chunk and return its index.

        Raises:
            StopIteration: Once all ``len(self)`` chunks have been written.
        """
        # Check *before* writing so that no empty slice beyond the last row is
        # appended to the store.
        if self.current_chunk >= len(self):
            raise StopIteration

        chunk = self.current_chunk
        self.write_chunk(chunk)
        self.current_chunk += 1

        return chunk

    def __len__(self) -> int:
        """Number of chunks needed to cover all ``num_y`` rows."""
        return ceil(self.num_y / self.chunk_size)

    def write_chunk(self, chunk: int):
        """Load rows of chunk number ``chunk`` and write them to the Zarr store.

        Chunk 0 creates the variables; every later chunk is appended along ``y``.

        Args:
            chunk: Zero-based chunk index; covers rows
                ``chunk * chunk_size`` to ``(chunk + 1) * chunk_size`` (exclusive).
        """
        start = chunk * self.chunk_size
        da = self.da.isel(y=slice(start, start + self.chunk_size)).load()

        ds = xr.Dataset(
            {
                'rs': da.sel(band=[1, 2, 3, 4]),
                'mask': da.sel(band=5),
                'label': da.sel(band=6),
            }
        )

        ds = ds.transpose('band', 'x', 'y')

        ds['rs'] = ds.rs.chunk({'band': 4, 'x': self.chunk_size, 'y': self.chunk_size})
        ds['label'] = ds.label.chunk({'x': self.chunk_size, 'y': self.chunk_size})
        ds['mask'] = ds.mask.chunk({'x': -1, 'y': -1})

        # Descriptive metadata per variable. It is attached *after* the
        # attributes carried over from the source raster have been dropped, so it
        # actually ends up in the store.
        variable_attrs = {
            'rs': {'source': 'SwissImage RS'},
            'label': {'classes': ', '.join([f'{k}={v}' for k, v in self.LABEL_CLASSES.items()])},
        }

        for variable in ds.data_vars:
            if variable == 'rs':
                # NOTE(review): no -9999 replacement happens for 'rs' before the
                # unsigned cast -- confirm these bands never contain that value.
                converted = ds[variable].astype('uint16')
            else:
                converted = ds[variable].where(ds[variable] != -9999, 255).astype('uint8')

            # Replace attributes inherited from the raster with our own
            # (presumably they would interfere with writing -- TODO confirm).
            converted.attrs = dict(variable_attrs.get(variable, {}))
            ds[variable] = converted

        # Dataset attributes
        ds.attrs = {'creator': 'Julian Kraft'}

        # The first chunk creates the arrays; later chunks grow them along 'y'.
        # No explicit encoding is passed, so xarray's default compression applies.
        kwargs = {} if chunk == 0 else {'append_dim': 'y'}

        ds.to_zarr(self.save_path, mode='a', **kwargs)

    def write(self, dev_mode: bool = False):
        """Write all chunks to the store, showing a progress bar.

        Args:
            dev_mode: If true, stop after the first three chunks have been
                written (quick smoke test).
        """
        for i, _ in enumerate(tqdm(self, ncols=80, desc='Writing chunks')):
            if dev_mode and (i > 1):
                # Leave the loop normally; raising StopIteration here would
                # propagate to the caller as an error.
                break


In [2]:
# Convert the combined GeoTIFF into a Zarr store, 500 rows per written chunk.
chunkwriter = ChunkWriter(
    '../data/CombinedData_32signed/CombinedData32signed.tif',
    '../data/combined.zarr',
    chunk_size=500,
)
chunkwriter.write()

Writing chunks: 100%|███████████████████████████| 84/84 [02:51<00:00,  2.04s/it]


In [79]:
# Re-open the written store to inspect variable shapes, chunking and dtypes.
xr.open_zarr('../data/combined.zarr')

Unnamed: 0,Array,Chunk
Bytes,45.51 MiB,244.14 kiB
Shape,"(31815, 1500)","(500, 500)"
Dask graph,192 chunks in 2 graph layers,192 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 45.51 MiB 244.14 kiB Shape (31815, 1500) (500, 500) Dask graph 192 chunks in 2 graph layers Data type uint8 numpy.ndarray",1500  31815,

Unnamed: 0,Array,Chunk
Bytes,45.51 MiB,244.14 kiB
Shape,"(31815, 1500)","(500, 500)"
Dask graph,192 chunks in 2 graph layers,192 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,45.51 MiB,15.17 MiB
Shape,"(31815, 1500)","(31815, 500)"
Dask graph,3 chunks in 2 graph layers,3 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 45.51 MiB 15.17 MiB Shape (31815, 1500) (31815, 500) Dask graph 3 chunks in 2 graph layers Data type uint8 numpy.ndarray",1500  31815,

Unnamed: 0,Array,Chunk
Bytes,45.51 MiB,15.17 MiB
Shape,"(31815, 1500)","(31815, 500)"
Dask graph,3 chunks in 2 graph layers,3 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,364.09 MiB,1.91 MiB
Shape,"(4, 31815, 1500)","(4, 500, 500)"
Dask graph,192 chunks in 2 graph layers,192 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray
"Array Chunk Bytes 364.09 MiB 1.91 MiB Shape (4, 31815, 1500) (4, 500, 500) Dask graph 192 chunks in 2 graph layers Data type uint16 numpy.ndarray",1500  31815  4,

Unnamed: 0,Array,Chunk
Bytes,364.09 MiB,1.91 MiB
Shape,"(4, 31815, 1500)","(4, 500, 500)"
Dask graph,192 chunks in 2 graph layers,192 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray


In [25]:
# NOTE(review): no module-level `write_chunk` is defined in the visible cells
# (only the ChunkWriter.write_chunk method) -- this cell relies on hidden kernel
# state and will likely fail after Restart & Run All.
write_chunk(0)



In [26]:
# NOTE(review): no module-level `write_chunk` is defined in the visible cells
# (only the ChunkWriter.write_chunk method) -- this cell relies on hidden kernel
# state and will likely fail after Restart & Run All.
write_chunk(1)



In [27]:
# Inspect the store again. NOTE(review): the output shown below (float64,
# 100x100 chunks) does not match what ChunkWriter writes (uint8/uint16) -- it
# appears to come from an earlier run; re-execute or remove this cell.
xr.open_zarr('../data/combined.zarr')

Unnamed: 0,Array,Chunk
Bytes,48.55 MiB,78.12 kiB
Shape,"(31815, 200)","(100, 100)"
Dask graph,638 chunks in 2 graph layers,638 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 48.55 MiB 78.12 kiB Shape (31815, 200) (100, 100) Dask graph 638 chunks in 2 graph layers Data type float64 numpy.ndarray",200  31815,

Unnamed: 0,Array,Chunk
Bytes,48.55 MiB,78.12 kiB
Shape,"(31815, 200)","(100, 100)"
Dask graph,638 chunks in 2 graph layers,638 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,48.55 MiB,24.27 MiB
Shape,"(31815, 200)","(31815, 100)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 48.55 MiB 24.27 MiB Shape (31815, 200) (31815, 100) Dask graph 2 chunks in 2 graph layers Data type float64 numpy.ndarray",200  31815,

Unnamed: 0,Array,Chunk
Bytes,48.55 MiB,24.27 MiB
Shape,"(31815, 200)","(31815, 100)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,194.18 MiB,312.50 kiB
Shape,"(4, 31815, 200)","(4, 100, 100)"
Dask graph,638 chunks in 2 graph layers,638 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 194.18 MiB 312.50 kiB Shape (4, 31815, 200) (4, 100, 100) Dask graph 638 chunks in 2 graph layers Data type float64 numpy.ndarray",200  31815  4,

Unnamed: 0,Array,Chunk
Bytes,194.18 MiB,312.50 kiB
Shape,"(4, 31815, 200)","(4, 100, 100)"
Dask graph,638 chunks in 2 graph layers,638 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [8]:
# Open the source GeoTIFF directly (same file ChunkWriter reads) for inspection.
da = rxr.open_rasterio('../data/CombinedData_32signed/CombinedData32signed.tif')



In [10]:
# Number of rows in the raster; this is the length ChunkWriter splits into chunks.
len(da.y)

41651

<xarray.backends.zarr.ZarrStore at 0x3106e63c0>

In [29]:
# Drop all attributes before writing.
# NOTE(review): `rs_subset` is not defined in any visible cell -- hidden state.
rs_subset.attrs = {}

In [30]:
# Attempt a region write into the existing store.
# NOTE(review): this cell fails with the ValueError shown below because
# variables without an x/y dimension ('band', 'spatial_ref') are still present;
# the error message suggests .drop_vars([...]) first. Fix or remove the cell.
rs_subset.to_zarr('../data/combined.zarr', mode='a', region={'x': slice(0, 100), 'y': slice(0, 100)})

ValueError: when setting `region` explicitly in to_zarr(), all variables in the dataset to write must have at least one dimension in common with the region's dimensions ['x', 'y'], but that is not the case for some variables here. To drop these variables from this dataset before exporting to zarr, write: .drop_vars(['band', 'spatial_ref'])

In [5]:
# Build the write as a delayed task only (compute=False), overwriting the store.
# NOTE(review): `ds` and `encoding` are not defined at module level in the
# visible cells -- this depends on hidden kernel state.
ds.to_zarr('../data/combined.zarr', encoding=encoding, mode='w', compute=False)

Delayed('_finalize_store-bad6da60-117e-4e58-bc69-c5323d55aaf9')

In [None]:


# Overwrite the store while showing a progress bar.
# NOTE(review): `ProgressBar` is not imported in the visible cells (presumably
# dask.diagnostics.ProgressBar -- confirm), and `ds` / `encoding` are not
# defined at module level either.
with ProgressBar():
    ds.to_zarr('../data/combined.zarr', encoding=encoding, mode='w')



[########################################] | 100% Completed | 425.35 ms


In [None]:
# Re-open the store that was just written and keep it as `ds` for further use.
ds = xr.open_zarr('../data/combined.zarr')

<xarray.backends.zarr.ZarrStore at 0x3076a0ec0>