# String encoding

In [1]:
import xarray as xr
import numpy as np
import warnings

In [2]:
!rm *.nc

## How should xarray serialize bytes/unicode strings across Python/netCDF versions? 
https://github.com/pydata/xarray/issues/2059

Related:

- Round trip converts text attributes to strings https://github.com/Unidata/netcdf4-python/issues/529
- to_netcdf() to automatically switch to fixed-length strings for compressed variables https://github.com/pydata/xarray/issues/2040

In [19]:
# Attach an `_Encoding` entry to an object dtype through NumPy's dtype metadata,
# build a one-element array with it, and show that the metadata is carried along.
encoding_metadata = {"_Encoding": "ascii"}
dt = np.dtype("O", metadata=encoding_metadata)
s = np.array(["abc"], dtype=dt)
s.dtype.metadata

mappingproxy({'_Encoding': 'ascii'})

In [16]:
# Print the name of the dtype of `s` (the array built with `_Encoding` metadata
# in the neighbouring cell); the recorded output below is plain `object`.
print(s.dtype.name)

object


In [5]:
from __future__ import print_function
import xarray as xr
import uuid
import netCDF4
import numpy as np
import sys

print(
    "| Python version | NetCDF version | NumPy datatype | NumPy datatype (xarray) | NetCDF datatype | Numpy datatype (read) |"
)
print("| --- | --- | --- | --- | --- | --- |")
for i, (dtype_name, value) in enumerate(
    [
        ("np.string_ / " + type(b"").__name__, np.array([b"abc"])),
        ("np.unicode_ / " + type("").__name__, np.array(["abc"])),
        ("object bytes/" + type(b"").__name__, np.array([b"abc"], dtype=object)),
        ("object unicode/" + type("").__name__, np.array(["abc"], dtype=object)),
    ]
):
    src_dtype = value.dtype
    src_dtype_kind = src_dtype.kind
    for format in ["NETCDF4_CLASSIC", "NETCDF4"]:
        filename = f"test-2059-{format}-{i}.nc"
        ds = xr.Dataset({"data": xr.DataArray(data=value)})
        ds.to_netcdf(filename, engine="netcdf4", format=format)
        with netCDF4.Dataset(filename) as f:
            var = f.variables["data"]
            disk_dtype = var.dtype
            has_encoding = hasattr(var, "_Encoding")
        with xr.open_dataset(filename) as ds:
            read_dtype = ds["data"].dtype
        disk_dtype_name = ("NC_CHAR" if disk_dtype == "S1" else "NC_STRING") + (
            " with UTF-8 encoding" if has_encoding else ""
        )
        print(
            "|",
            "Python %i" % sys.version_info[0],
            "|",
            format,
            "|",
            dtype_name,
            "|",
            src_dtype,
            src_dtype_kind,
            "|",
            disk_dtype_name,
            "|",
            f"\{read_dtype}",
            "|",
        )

| Python version | NetCDF version | NumPy datatype | NumPy datatype (xarray) | NetCDF datatype | Numpy datatype (read) |
| --- | --- | --- | --- | --- | --- |
| Python 3 | NETCDF4_CLASSIC | np.string_ / bytes | |S3 S | NC_CHAR | \|S3 |
| Python 3 | NETCDF4 | np.string_ / bytes | |S3 S | NC_CHAR | \|S3 |
| Python 3 | NETCDF4_CLASSIC | np.unicode_ / str | <U3 U | NC_CHAR with UTF-8 encoding | \object |
| Python 3 | NETCDF4 | np.unicode_ / str | <U3 U | NC_STRING | \<U3 |
| Python 3 | NETCDF4_CLASSIC | object bytes/bytes | object O | NC_CHAR | \|S3 |
| Python 3 | NETCDF4 | object bytes/bytes | object O | NC_CHAR | \|S3 |
| Python 3 | NETCDF4_CLASSIC | object unicode/str | object O | NC_CHAR with UTF-8 encoding | \object |
| Python 3 | NETCDF4 | object unicode/str | object O | NC_STRING | \<U3 |


| Python version | NetCDF version | NumPy datatype | NumPy datatype (xarray) | NetCDF datatype | Numpy datatype (read) |
| --- | --- | --- | --- | --- | --- |
| Python 3 | NETCDF4_CLASSIC | np.string_ / bytes | \|S3 | NC_CHAR | \|S3 |
| Python 3 | NETCDF4 | np.string_ / bytes | \|S3 | NC_CHAR | \|S3 |
| Python 3 | NETCDF4_CLASSIC | np.unicode_ / str | <U3 | NC_CHAR with UTF-8 encoding | object |
| Python 3 | NETCDF4 | np.unicode_ / str | <U3 | NC_STRING | \<U3 |
| Python 3 | NETCDF4_CLASSIC | object bytes/bytes | object | NC_CHAR | \|S3 |
| Python 3 | NETCDF4 | object bytes/bytes | object | NC_CHAR | \|S3 |
| Python 3 | NETCDF4_CLASSIC | object unicode/str | object | NC_CHAR with UTF-8 encoding | object |
| Python 3 | NETCDF4 | object unicode/str | object | NC_STRING | \<U3 |

In [None]:
def encode_nc3_variable(var):
    """Encode a variable's strings, data and attributes with xarray's netCDF3 helpers.

    The variable is first passed through ``EncodedStringCoder(allows_unicode=False)``
    and then ``CharacterArrayCoder``. The resulting data goes through
    ``coerce_nc3_dtype`` and the resulting attributes through ``encode_nc3_attrs``.

    Parameters
    ----------
    var : xr.Variable
        Variable to encode (anything exposing ``dims``, ``data``, ``attrs`` and
        ``encoding`` that the coders accept).

    Returns
    -------
    xr.Variable
        New variable built from the encoded dims, data, attrs and encoding.
    """
    string_coders = (
        xr.coding.strings.EncodedStringCoder(allows_unicode=False),
        xr.coding.strings.CharacterArrayCoder(),
    )
    encoded = var
    for string_coder in string_coders:
        encoded = string_coder.encode(encoded)

    nc3 = xr.backends.netcdf3
    nc3_data = nc3.coerce_nc3_dtype(encoded.data)
    nc3_attrs = nc3.encode_nc3_attrs(encoded.attrs)
    return xr.Variable(encoded.dims, nc3_data, nc3_attrs, encoded.encoding)

In [None]:
# NOTE(review): `var` is not defined in this cell, so it only works with whatever
# an earlier cell left in the kernel — confirm what this was meant to inspect.
var.values
# NOTE(review): `x` is assigned here but not used anywhere else in this cell.
x = np.array([["a", "b", "c"]], dtype="|S1")
var

In [None]:
value = np.array(["abc"])
ds = xr.Dataset({"data": xr.DataArray(data=value)})
# var = ds["data"].variable
# print("################")
# print(var)
# var = encode_nc3_variable(var)
# print("################")
# var.attrs["_Encoding"] = "utf-8"
# print(var)
# print("################")
# ds["data"] = var
filename = "test_string.nc"
ds.to_netcdf(filename, engine="netcdf4", format="NETCDF4_CLASSIC")
!h5dump test_string.nc

In [None]:
!ncdump test-2059-NETCDF4-0-0.nc

In [None]:
#!h5dump --help

In [None]:
!h5dump test-2059-NETCDF4-0.nc
!h5dump test-2059-NETCDF4-1.nc
!h5dump test-2059-NETCDF4-2.nc
!h5dump test-2059-NETCDF4-3.nc

In [None]:
!h5dump -A 0 -d data test-2059-NETCDF4-1.nc

In [None]:
!ncdump test-2059-NETCDF4-3.nc

## Representing missing values in string arrays on disk

https://github.com/pydata/xarray/issues/1647

## Attributes encoding compatibility between backends

https://github.com/pydata/xarray/issues/5226

In [None]:
import xarray as xr
import zarr


def try_to_netcdf(dataset, path):
    """Write ``dataset`` to ``path``; print the error instead of aborting the cell.

    The failure is what this cell demonstrates, so it is shown rather than
    raised, which lets the remaining statements run as well.
    """
    try:
        dataset.to_netcdf(path)
    except Exception as err:  # deliberately broad: display whatever the backend raises
        print(f"{path}: {type(err).__name__}: {err}")
    else:
        print(f"{path}: written without error")


# In-memory zarr group holding a unicode array with a string fill value.
g = zarr.group()
g.create("arr", shape=3, fill_value="z", dtype="<U1")
g["arr"].attrs["_ARRAY_DIMENSIONS"] = "dim_1"

# -- without masking fill values
ds = xr.open_zarr(g.store, mask_and_scale=False)

# expected: {'_FillValue': 'z'}
display(ds.arr.attrs)

# expected error: netCDF4 does not yet support setting a fill value for variable-length strings
try_to_netcdf(ds, "test-5226-01.nc")

# -- with masking fill values
ds2 = xr.open_zarr(g.store, mask_and_scale=True)

# expected: a dict that includes the item '_FillValue': 'z'
display(ds2.arr.encoding)

# expected: same error as above
try_to_netcdf(ds2, "test-5226-02.nc")

## Make an xarray with an array of strings

In [None]:
# 2x2 array of one-character strings; with dtype=None NumPy picks the dtype itself.
dtype = None
data = np.array([["a", "b"], ["c", "d"]], dtype=dtype)
print(data.dtype)

# Wrap it in a named DataArray with integer coordinates on both axes.
da = xr.DataArray(
    data=data,
    coords={"x": [0, 1], "y": [0, 1]},
    dims=["x", "y"],
    name="strarr",
)
display(da)

### Write to engines with default values

In [None]:
engines = ["scipy", "netcdf4", "h5netcdf"]
# Plain loop instead of a list comprehension: the writes are side effects, and a
# comprehension would leave a meaningless `[None, None, None]` as the cell output.
for engine in engines:
    da.to_netcdf(f"test-{engine}.nc", mode="w", engine=engine)

#### Observe read back

In [None]:
# Each file was written by a different engine; all are read back with netcdf4.
for engine in engines:
    print(f"write engine: {engine}\nread engine: netcdf4")
    written_path = f"test-{engine}.nc"
    with xr.open_dataset(written_path, engine="netcdf4") as ds:
        display(ds.load())

#### Check with ncdump

In [None]:
!ncdump -v strarr test-scipy.nc

In [None]:
!ncdump -v strarr test-netcdf4.nc

In [None]:
!ncdump -v strarr test-h5netcdf.nc

#### Observe HDF5-based files with h5dump

In [None]:
!h5dump -d strarr test-netcdf4.nc

In [None]:
!h5dump -d strarr test-h5netcdf.nc

## Write with netcdf4 with different formats

In [None]:
engine = "netcdf4"
formats = ["NETCDF3_CLASSIC", "NETCDF3_64BIT", "NETCDF4_CLASSIC", "NETCDF4"]
# Plain loop instead of a list comprehension: the writes are side effects, and a
# comprehension would leave a list of `None` as the cell output.
for fmt in formats:
    da.to_netcdf(f"test-{engine}-{fmt}.nc", mode="w", engine=engine, format=fmt)

#### Observe read back

In [None]:
# Re-open each per-format file with xarray's defaults and show the loaded result.
for fmt in formats:
    print(fmt)
    format_path = f"test-netcdf4-{fmt}.nc"
    with xr.open_dataset(format_path) as ds:
        display(ds.load())

#### Check with ncdump

In [None]:
!ncdump -v strarr test-netcdf4-NETCDF3_CLASSIC.nc

In [None]:
!ncdump -v strarr test-netcdf4-NETCDF3_64BIT.nc

In [None]:
!ncdump -v strarr test-netcdf4-NETCDF4_CLASSIC.nc

In [None]:
!ncdump -v strarr test-netcdf4-NETCDF4.nc

#### Check HDF5 based with h5dump

In [None]:
!h5dump -d strarr test-netcdf4-NETCDF4_CLASSIC.nc

In [None]:
!h5dump -d strarr test-netcdf4-NETCDF4.nc

## Concat Characters (True by Default)

https://github.com/pydata/xarray/issues/4452

In [None]:
# Same read-back as before, but with `concat_characters=False` passed explicitly.
for fmt in formats:
    print(fmt)
    format_path = f"test-netcdf4-{fmt}.nc"
    with xr.open_dataset(format_path, concat_characters=False) as ds:
        display(ds.load())

# Char Arrays
## Explicit fixed width 'S1' arrays re-encoded creating extra dimension
https://github.com/pydata/xarray/issues/2899

Related:

- Set/preserve the character array dimension name https://github.com/pydata/xarray/issues/2895
- Handle the character array dim name https://github.com/pydata/xarray/pull/2896
- Save 'S1' array without the char_dim_name dimension https://github.com/pydata/xarray/issues/3407
- to_netcdf from subsetted Dataset with strings loaded from char array netCDF can sometimes fail https://github.com/pydata/xarray/issues/6352

### 2899

unresolved

In [None]:
from collections import OrderedDict
import xarray as xr
import numpy as np

# 12 x 100 array of single-byte ('|S1') elements, used as a character array.
sensor_string_np = np.zeros([12, 100], dtype="|S1")

# One variable, "sensorName", with a blank `_FillValue` attribute.
data_vars = {
    "sensorName": xr.DataArray(
        data=sensor_string_np.copy(),
        dims=("sensor", "string"),
        name="sensorName",
        attrs=OrderedDict([("_FillValue", " ")]),
    )
}
scanfile = xr.Dataset(data_vars=data_vars)

# Put the characters of "test" at the start of the first row.
label = "test"
scanfile.sensorName[0, : len(label)] = np.frombuffer(label.encode(), dtype="|S1")

# NOTE(review): the file name says 2889 while the issue is #2899; it is left as
# is because the following cells read the file under this name.
scanfile.to_netcdf("test-2889.nc")

In [None]:
# Re-open the file written by the previous cell and display it fully loaded.
with xr.open_dataset("test-2889.nc") as ds:
    display(ds.load())

In [None]:
#!ncdump test-2889.nc

### 3407

has workaround

In [None]:
import numpy as np
import xarray as xr

# Timestamp stored as an explicit character array: one 'S1' element per character.
tstr = "2019-07-25_00:00:00"
time_chars = np.array([list(tstr)], dtype="S1")
Times = xr.DataArray(time_chars, dims=["Time", "DateStrLen"])
ds = xr.Dataset({"Times": Times})
display(ds)

# Compressed NETCDF4 output with "Time" passed as an unlimited dimension.
compression = {"zlib": True, "complevel": 5}
ds.to_netcdf(
    "test-3407-01.nc",
    format="NETCDF4",
    encoding={"Times": compression},
    unlimited_dims={"Time": True},
)

In [None]:
# Re-open the character-array file written by the previous cell and display it.
with xr.open_dataset("test-3407-01.nc") as ds:
    display(ds.load())

In [None]:
!ncdump test-3407-01.nc

In [None]:
import numpy as np
import xarray as xr

# Workaround variant: keep the timestamp as ONE fixed-width byte string per
# "Time" entry and name the character dimension via `char_dim_name`.
tstr = "2019-07-25_00:00:00"
# Width is taken from the string itself; a hard-coded width of 16 silently cut
# the 19-character timestamp down to "2019-07-25_00:00".
Times = xr.DataArray(
    np.array([tstr], dtype=np.dtype(("S", len(tstr)))), dims=["Time"]
)
ds = xr.Dataset({"Times": Times})
ds.to_netcdf(
    "test-3407-02.nc",
    format="NETCDF4",
    encoding={"Times": {"zlib": True, "complevel": 5, "char_dim_name": "DateStrLen"}},
    unlimited_dims={"Time": True},
)

In [None]:
# Read back the workaround file written just above (test-3407-02.nc); this cell
# previously re-opened test-3407-01.nc, so the workaround was never inspected.
with xr.open_dataset("test-3407-02.nc") as ds:
    display(ds.load())

In [None]:
!ncdump test-3407-02.nc

### 6352

In [None]:
# setup
import warnings

import numpy as np
import xarray as xr

one_two = xr.DataArray(np.array(["a", "aa"], dtype="object"), dims=["dim0"])
two_two = xr.DataArray(np.array(["aa", "aa"], dtype="object"), dims=["dim0"])
ds = xr.Dataset({"var0": one_two, "var1": two_two})
ds.var0.encoding["dtype"] = "S1"
ds.var1.encoding["dtype"] = "S1"
# need to write out and read back in
ds.to_netcdf("test-6352-01.nc")
display(ds)

# only selecting the shorter string will fail
ds1 = xr.load_dataset("test-6352-01.nc")
ds1[{"dim0": 1}].to_netcdf("test-6352-ok.nc")
try:
    ds1[{"dim0": 0}].to_netcdf("test-6352-error.nc")
except IndexError as e:
    # Report the expected failure as a warning so the rest of the cell still runs.
    # str(e) is used instead of *e.args: unpacking breaks when the exception
    # carries no arguments or more than one.
    warnings.warn(str(e))

# will work if the char dim name is removed from encoding of the now shorter arr
ds1 = xr.load_dataset("test-6352-01.nc")
del ds1.var0.encoding["char_dim_name"]
ds1[{"dim0": 0}].to_netcdf("test-6352-will_work.nc")

In [None]:
!ncdump test-6352-ok.nc

In [None]:
!ncdump test-6352-error.nc

In [None]:
!ncdump test-6352-will_work.nc

In [None]:
!h5dump test-6352-will_work.nc

### concat_characters

- Change default for concat_characters to False in open_* functions https://github.com/pydata/xarray/issues/4452
- open_zarr: concat_characters has no effect when dtype=U1 https://github.com/pydata/xarray/issues/4405


In [None]:
import xarray as xr
import numpy as np

# 3x2 array of single-byte ('S1') characters.
chrs = np.array(
    [
        ["A", "B"],
        ["C", "D"],
        ["E", "F"],
    ],
    dtype="S1",
)
ds = xr.Dataset(dict(x=(("dim0", "dim1"), chrs)))
with xr.set_options(display_style="text"):
    display(ds.x)

# One name for both the write and the read, so the store that was just written
# is the one that gets opened (the read used to point at "test-4405.zarr").
store = "test-4405-01.zarr"
ds.to_zarr(store, mode="w")
# The second dimension is lost and the values end up being concatenated
with xr.set_options(display_style="text"):
    display(xr.open_zarr(store).x.compute())

In [None]:
# Same 3x2 layout as the 'S1' case, but stored as one-character unicode ('U1').
chrs = np.array([["A", "B"], ["C", "D"], ["E", "F"]], dtype="U1")
ds = xr.Dataset({"x": (("dim0", "dim1"), chrs)})
ds.to_zarr("test-4405-01", mode="w")
# No concatenation occurs
with xr.set_options(display_style="text"):
    reopened = xr.open_zarr("test-4405-01", concat_characters=True)
    display(reopened.x.compute())

## Chunksizes

- to_netcdf broken encoding: dtype='S1' + chunksizes https://github.com/pydata/xarray/issues/2219


In [None]:
# 'S1' encoding with zlib and a one-element `chunksizes` tuple.
encoding_1d_chunks = {"x": {"dtype": "S1", "zlib": True, "chunksizes": (2,)}}
xr.Dataset({"x": ["foo", "bar", "baz"]}).to_netcdf(
    "test-2219-01.nc", engine="h5netcdf", encoding=encoding_1d_chunks
)

In [None]:
# Same write, but `chunksizes` now has two elements.
encoding_2d_chunks = {"x": {"dtype": "S1", "zlib": True, "chunksizes": (2, 3)}}
xr.Dataset({"x": ["foo", "bar", "baz"]}).to_netcdf(
    "test-2219-02.nc", engine="h5netcdf", encoding=encoding_2d_chunks
)

## Automatic dtype encoding in to_netcdf
https://github.com/pydata/xarray/issues/2780

In [None]:
import pandas as pd
import xarray as xr
import numpy as np
import os

# 100000 x 5 frame of random single-digit integers (0-9).
column_names = ["a", "b", "c", "d", "e"]
random_digits = np.random.randint(low=0, high=10, size=(100000, 5))
df = pd.DataFrame(random_digits, columns=column_names)

# Column 'e' holds the same digits, but as strings.
df["e"] = df["e"].astype(str)

# Write the frame as CSV, then as netCDF via an xarray Dataset, so the two
# file sizes can be compared.
df.to_csv("test-2780-01.csv")
ds = xr.Dataset.from_dataframe(df)
ds.to_netcdf("test-2780-02.nc")

In [None]:
# Compute stats
stats1 = os.stat("test-2780-01.csv")
stats2 = os.stat("test-2780-02.nc")
print("csv=", str(stats1.st_size))
print("nc =", str(stats2.st_size))
print("nc/csv=", str(stats2.st_size / stats1.st_size))

## Netcdf char array not being decoded to string in compound dtype
https://github.com/pydata/xarray/issues/1977

Related: https://github.com/Unidata/netcdf4-python/pull/778

In [None]:
# Open group "/bolo/sxd" of an external file with the h5netcdf engine and keep a
# deep copy, presumably so the data stays usable after the file is closed.
script_nc_file = "source/bolo_geom_fromscript.nc"
with xr.open_dataset(
    script_nc_file, group="/bolo/sxd", concat_characters=True, engine="h5netcdf"
) as ds:
    ds = ds.copy(deep=True)
da = ds.slits
# Show the "Object_type" field of the `slits` values.
display(da.values["Object_type"])
# assumes `da.dtype` is a structured (compound) dtype, so [0] is its first field — TODO confirm
dtype = da.dtype[0]
# Print assorted properties of that field dtype for inspection.
print(dir(dtype))
print(dtype.type)
# print(dtype.subtype)
print(dtype.kind)
print(dtype.metadata)
print(dtype.shape)
print(dtype.str)
print(dtype.descr)
print(dtype.fields)
print(dtype.char)
arr_dtype = da.values["Object_type"].dtype
# NOTE(review): `dtype.shape` was already printed above; `arr_dtype` may have been intended here.
print(dtype.shape)
# Reinterpret the field as a fixed-width dtype built from the element kind and
# the first entry of `dtype.shape`.
da.values["Object_type"].view(dtype=np.dtype(f"|{arr_dtype.kind}{dtype.shape[0]}"))

In [None]:
import h5netcdf

# Open the same file with h5netcdf directly and view the "Object_type" field
# using `dtype.str`. Relies on `script_nc_file` and `dtype` from the previous cell.
with h5netcdf.File(script_nc_file) as h5:
    print(h5["bolo/sxd"]["slits"]["Object_type"].view(dtype.str))

## MADIS netCDF to Pandas Dataframe: ValueError: iterator is too large

https://github.com/pydata/xarray/issues/838

In [None]:
import xarray as xr

# Open the external MADIS file without CF decoding and convert it to a DataFrame.
# The context manager closes the file handle even if the conversion raises
# (the section title quotes "ValueError: iterator is too large").
with xr.open_dataset("source/20160430_1600.nc", decode_cf=False) as ncf:
    display(ncf)
    df = ncf.to_dataframe()

## netCDF4: support byte strings as attribute values
https://github.com/pydata/xarray/issues/7186


In [None]:
import numpy as np
import xarray as xr

# 12 x 10 array of ones on integer x/y coordinates.
data = np.ones((12, 10))
ds = xr.Dataset(
    data_vars={"data": (["x", "y"], data)},
    coords={"x": np.arange(12), "y": np.arange(10)},
)

# String attributes on "x": plain ASCII, two with non-ASCII characters, and one
# passed as a NumPy array wrapping the UTF-8 encoded bytes.
x_attrs = ds["x"].attrs
x_attrs["first_str"] = "foo"
x_attrs["second_str"] = "bar°"
x_attrs["third_str"] = "hää"
x_attrs["workaround_str"] = np.array("hää".encode("utf-8"))

ds.to_netcdf("test-7186.nc")

In [None]:
!ncdump test-7186.nc

## dataset attrs list of strings to_netcdf() error
https://github.com/pydata/xarray/issues/7608

Related: https://github.com/pydata/xarray/issues/3374

netcdf-3 vs netcdf-4 problem

In [None]:
import xarray as xr
import pandas as pd
import numpy as np

# Random 2 x 2 x 3 fields (temperature is drawn first, then precipitation).
temp = 15 + 8 * np.random.randn(2, 2, 3)
precip = 10 * np.random.rand(2, 2, 3)

lon = [[-99.83, -99.32], [-99.79, -99.23]]
lat = [[42.25, 42.21], [42.63, 42.59]]

# Build the dataset in one call: three data variables on (x, y, time), 2-D
# lat/lon coordinates, a daily time axis and a scalar reference time.
field_dims = ("x", "y", "time")
ds = xr.Dataset(
    data_vars={
        "temperature": (field_dims, temp),
        "temperature_double": (field_dims, temp * 2),
        "precipitation": (field_dims, precip),
    },
    coords={
        "lat": (("x", "y"), lat),
        "lon": (("x", "y"), lon),
        "time": pd.date_range("2014-09-06", periods=3),
        "reference_time": pd.Timestamp("2014-09-05"),
    },
)

# A list of integers as a global attribute.
ds.attrs["test"] = [1, 2, 3]
ds.to_netcdf("test-7608-01.nc")
# no error

# The same attribute as a list of strings.
ds.attrs["test"] = ["11", "12", "13"]
ds.to_netcdf("test-7608-02.nc")

## Volatile error: unsupported dtype for netCDF4 variable: object

https://github.com/pydata/xarray/issues/2404

In [None]:
import pandas as pd
import numpy as np
import xarray as xr  # xr.__version --> 0.13.0
import os
import itertools

# make a multi index (where one level is np.str_ type)
labels = np.array([f"idx_{i}" for i in range(1, 11)], dtype=np.str_)
x = list(labels)
y = list(np.arange(10))
# every (label, number) pair, split back into two parallel tuples
x, y = zip(*itertools.product(x, y))
combo = list(zip(x, y))

# the below is an odd way to construct a DataFrame, but the np.str_ type is preserved if done this way
data_df = np.random.randn(len(x))
df = (
    pd.DataFrame(data=data_df, columns=["test"])
    .assign(x=x, y=y)
    .set_index(["x", "y"])
)
ds = xr.Dataset.from_dataframe(df)
display(ds)

In [None]:
# Show the Python type of the first value of the "x" coordinate.
type(ds.coords["x"].values[0])

In [None]:
# Write the dataset built above; per the section title this is the call that
# may fail with "unsupported dtype for netCDF4 variable: object".
ds.to_netcdf("test-2404.nc")

In [None]:
# Re-open the file written by the previous cell and display it (without loading).
with xr.open_dataset("test-2404.nc") as ds:
    display(ds)

In [None]:
!ncdump test-2404.nc