# s3 range-get requests w/ AWS SDK

In [1]:
"""
Try to manually execute some S3 range-get requests
using lower-level Python code (e.g., the Python AWS SDK,
which is the boto3 library) and decompress and
decode them into numpy arrays using the information in the
Kerchunk reference files (byte order, dimensionality, etc.).
This will help you understand the steps you'll eventually need to
do in C++, and help you make sure what you're doing is actually possible
(e.g., there aren't weird web issues that aren't your fault) in a
friendlier interactive environment.

"""

import zlib
import json
import boto3
import boto
import fsspec
import fsspec.utils
import numpy as np
import xarray as xr
import zarr
import os
import ujson
import s3fs

import kerchunk.combine
from kerchunk.zarr import single_zarr
from kerchunk.combine import MultiZarrToZarr
from kerchunk.hdf import SingleHdf5ToZarr
from pathlib import Path

In [2]:
# Fetch the first bytes of one object with a plain S3 range-GET.
# url pattern: 's3://era5-pds/2020/*/data/air_pressure_at_mean_sea_level.nc'
# replace the '*' path component with 01 etc prefixes to target other objects

s3 = boto3.client('s3')
s3_url = 's3://era5-pds/2020/01/data/air_pressure_at_mean_sea_level.nc'
# HTTP byte ranges are inclusive on both ends, so this asks for 65 bytes.
byte_range = 'bytes=0-64'

# 's3://bucket/key/parts' splits into ['s3:', '', bucket, *key parts].
s3_parts = s3_url.split("/")
bucket_name = s3_parts[2]
object_key = "/".join(s3_parts[3:])

response = s3.get_object(Bucket=bucket_name, Key=object_key, Range=byte_range)

# Read and print the content of the specified byte range
content = response['Body'].read()

print(content[:40])
# Format byte by byte so every byte contributes exactly 8 bits. Converting the
# whole payload to a single integer first would drop the leading zero bits
# whenever the first byte is below 0x80, shifting the printed bit string.
binary_string = "".join(f"{byte:08b}" for byte in content)
print(binary_string[:40])

b'\x89HDF\r\n\x1a\n\x00\x00\x00\x00\x00\x08\x08\x00\x04\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff'
1000100101001000010001000100011000001101


In [30]:
# given string of bytes and the json --> recreate the reading/parsing

# load meta data into python dict
json_path = '/Users/katrinasharonin/Downloads/kerchunkC/code/jupyter/01_air_pressure_at_mean_sea_level.json'
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(json_path) as f:
    meta = json.load(f)

# section out zones
print(meta.keys())
print(meta['refs'].keys())
print(len(meta['refs'].keys()))
print(meta['refs']['air_pressure_at_mean_sea_level/.zarray'])
print(meta['refs']['air_pressure_at_mean_sea_level/0.0.0'])
print(meta['refs']['air_pressure_at_mean_sea_level/0.0.1'])
print(meta['refs']['air_pressure_at_mean_sea_level/0.0.2'])

"""
zarr storage sec:
Multiple arrays can be stored in the same array store by 
associating each array with a different logical path.
A logical path is simply an ASCII string. The logical 
path is used to form a prefix for keys used by the array. 
For example, if an array is stored at logical path “foo/bar” 
then the array metadata will be stored under the key “foo/bar/.zarray”,
the user-defined attributes will be stored under the key “foo/bar/.zattrs”, 
and the chunks will be stored under keys like “foo/bar/0.0”, “foo/bar/0.1”, etc.

The compressed sequence of bytes for each chunk is stored 
under a key formed from the index of the chunk within 
the grid of chunks representing the array. To form a string key 
for a chunk, the indices are converted to strings and concatenated
with the period character (“.”) separating each index. For example, 
given an array with shape (10000, 10000) and chunk shape (1000, 1000) 
there will be 100 chunks laid out in a 10 by 10 grid. 
The chunk with indices (0, 0) provides data for rows 0-999 and columns 0-999 
and is stored under the key “0.0”; the chunk with indices (2, 4) provides 
data for rows 2000-2999 and columns 4000-4999 and is stored under the key “2.4”; etc.
"""


# compare to actual sub sections from zarr engine - first chunk
# The reference entry is indexed as [_, start offset, length in bytes].
chunk_ref = meta['refs']['air_pressure_at_mean_sea_level/0.0.0']
start_byte = chunk_ref[1]
num_bytes = chunk_ref[2]

# The HTTP Range end position is inclusive, so the last byte of a chunk that is
# num_bytes long sits at start_byte + num_bytes - 1. Using start_byte + num_bytes
# would pull one extra byte belonging to whatever follows the chunk.
byte_range = f'bytes={start_byte}-{start_byte + num_bytes - 1}'
response = s3.get_object(Bucket=bucket_name, Key=object_key, Range=byte_range)
content = response['Body'].read()
print(content[:50])

# NOTE(review): zlib and '<f4' are hard-coded here; confirm both against the
# compressor/filters/dtype fields of the .zarray entry printed above.
buf = zlib.decompress(content)
chunk = np.frombuffer(buf, dtype='<f4')

print('\n')
print('chunk 1 using 0.0.0 indexing')
print(chunk)

dict_keys(['version', 'refs'])
dict_keys(['.zgroup', '.zattrs', 'air_pressure_at_mean_sea_level/.zarray', 'air_pressure_at_mean_sea_level/.zattrs', 'air_pressure_at_mean_sea_level/0.0.0', 'air_pressure_at_mean_sea_level/0.0.1', 'air_pressure_at_mean_sea_level/0.0.2', 'air_pressure_at_mean_sea_level/0.0.3', 'air_pressure_at_mean_sea_level/0.0.4', 'air_pressure_at_mean_sea_level/0.0.5', 'air_pressure_at_mean_sea_level/0.0.6', 'air_pressure_at_mean_sea_level/0.0.7', 'air_pressure_at_mean_sea_level/0.0.8', 'air_pressure_at_mean_sea_level/0.0.9', 'air_pressure_at_mean_sea_level/0.0.10', 'air_pressure_at_mean_sea_level/0.0.11', 'air_pressure_at_mean_sea_level/0.0.12', 'air_pressure_at_mean_sea_level/0.0.13', 'air_pressure_at_mean_sea_level/0.0.14', 'air_pressure_at_mean_sea_level/0.1.0', 'air_pressure_at_mean_sea_level/0.1.1', 'air_pressure_at_mean_sea_level/0.1.2', 'air_pressure_at_mean_sea_level/0.1.3', 'air_pressure_at_mean_sea_level/0.1.4', 'air_pressure_at_mean_sea_level/0.1.5', 'air_pr

In [33]:
# find the corresponding chunk in the xarray reading w/ zarr engine

# contrast against: xarray reading of zarr with engine
# chunk sizing: chunks":[24,100,100]

ds = xr.open_dataset("reference://", engine="zarr", backend_kwargs={
                    "consolidated": False,
                    "storage_options": {"fo": json_path, "remote_protocol": "s3","remote_options": {"anon": True}}
                    })

# print(ds)

# Select the index window that a [24, 100, 100] chunk at the origin would cover.
first_chunk = ds.isel(time0=slice(0, 24), lat=slice(0, 100), lon=slice(0, 100))
# Build the DataArray once and reuse it for both prints.
first_chunk_array = first_chunk.to_array()
print(first_chunk_array)
print(first_chunk_array.size)
print('should equal...')
print(len(chunk))

first_flatten = first_chunk['air_pressure_at_mean_sea_level'].values.flatten()
matching_mask = (first_flatten == chunk)

# Check if any True values exist in the mask
# NOTE(review): any() succeeds on a single coincidental equal value; use
# np.array_equal(first_flatten, chunk) if the goal is to assert that the
# manually decoded chunk is exactly this window.
print(matching_mask)
if matching_mask.any():
    print("flatten: the target array exists in the 'first_chunk' dataset.")
else:
    print("flatten: the target array does not exist in the 'first_chunk' dataset.")

# Order-independent membership test, done in one vectorized pass. The previous
# per-element `ex in first_flatten` loop rescanned the whole array for every
# value and printed one line per hit, followed by an unconditional
# "No other possible matches found" message.
shared_count = int(np.isin(chunk, first_flatten).sum())
print(f"{shared_count} of {chunk.size} decoded values also occur in first_chunk")

<xarray.DataArray (variable: 1, time0: 24, lat: 100, lon: 100)>
array([[[[ 99968.06,  99968.06,  99968.06, ...,  99968.06,  99968.06,
           99968.06],
         [ 99949.81,  99950.06,  99950.06, ...,  99965.81,  99965.81,
           99966.06],
         [ 99925.56,  99925.81,  99926.31, ...,  99958.56,  99958.81,
           99959.31],
         ...,
         [ 99567.56,  99568.81,  99571.81, ..., 100111.81, 100103.06,
          100097.56],
         [ 99708.31,  99709.56,  99710.56, ..., 100131.56, 100123.81,
          100118.31],
         [ 99839.56,  99840.81,  99841.56, ..., 100153.56, 100145.56,
          100140.06]],

        [[100021.31, 100021.31, 100021.31, ..., 100021.31, 100021.31,
          100021.31],
         [100008.06, 100008.06, 100008.31, ..., 100021.06, 100021.06,
          100021.31],
         [ 99986.31,  99986.56,  99986.81, ..., 100014.06, 100014.31,
          100014.56],
...
         [ 98630.56,  98661.31,  98692.06, ...,  98561.81,  98563.56,
           98567.5