In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import glob, os
import numpy as np
import zlib

# ---------------------------------------
# 1) Compression pipeline (once, offline)
# ---------------------------------------
def _checked_int16(values):
    """Narrow an integer array to int16, refusing values that do not fit.

    A plain ``astype(np.int16)`` wraps out-of-range values silently, which
    would corrupt the stored deltas without any error.

    Raises:
        ValueError: if any entry lies outside the int16 range.
    """
    limits = np.iinfo(np.int16)
    if np.any((values < limits.min) | (values > limits.max)):
        raise ValueError(
            "quantized deltas exceed the int16 range "
            f"[{limits.min}, {limits.max}]; use a coarser precision or a wider dtype"
        )
    return values.astype(np.int16)


# Toy input: two segments with 2 and 1 points, one (x, y, z) row per point.
coords_list = [
    np.array([[0.0, 1.0, 2.0],
              [3.0, 4.0, 5.0]], dtype=np.float32),
    np.array([[6.0, 7.0, 8.0]], dtype=np.float32),
]
# One uint8 code per point, grouped by segment like coords_list.
elm_lists = [
    np.array([6, 7], dtype=np.uint8),
    np.array([8], dtype=np.uint8),
]
segment_ids = np.array(['33A3', '67B0'], dtype='S4')
segment_lengths = np.array([2, 1], dtype=np.uint16)
per_segment_scalars = np.array([0.5, 1.2], dtype=np.float32)

# Flatten all segments into one point stream.
coords_flat = np.vstack(coords_list)     # (3, 3)
elms_flat = np.concatenate(elm_lists)    # (3,)

# Quantize to integer multiples of `precision`, then delta-encode along the
# point axis. Row 0 keeps its absolute quantized value (delta from zero).
precision = 0.001
quantized = np.round(coords_flat / precision).astype(np.int32)
deltas = quantized.copy()
deltas[1:] -= quantized[:-1]

# Narrow to int16 (with a range check) and serialize both streams.
deltas16 = _checked_int16(deltas)
coord_bytes = deltas16.tobytes()
elm_bytes = elms_flat.tobytes()

# Blob layout: all coordinate deltas first, then all element codes.
blob = coord_bytes + elm_bytes
compressed = zlib.compress(blob, level=9)
# NOTE(review): CompressedCoordsDataset in this file opens un-prefixed names
# (e.g. 'coords_elms.zlib') under its base directory, so get_loader(".") does
# not pick up these 'pseudomini.'-prefixed files as written.
with open('pseudomini.coords_elms.zlib', 'wb') as f:
    f.write(compressed)

# Sidecar arrays needed to split the flat streams back into segments.
np.save('pseudomini.segment_ids.npy', segment_ids)
np.save('pseudomini.segment_lengths.npy', segment_lengths)
np.save('pseudomini.per_segment_scalars.npy', per_segment_scalars)

print("Compressed size:", len(compressed), "bytes")

# ---------------------------------------
# 2) Dataset & DataLoader definitions
# ---------------------------------------
class CompressedCoordsDataset(Dataset):
    """Single-sample dataset decoding delta-encoded coordinates and element codes.

    Every ``__getitem__`` call reads three files from ``base_dir`` (each file
    name is preceded by ``prefix``):

    * ``segment_lengths.npy``      -- number of points in each segment
    * ``per_segment_scalars.npy``  -- one scalar per segment
    * ``coords_elms.zlib``         -- zlib blob holding int16 coordinate deltas
      (3 per point) followed by one uint8 element code per point

    Args:
        base_dir: directory containing the files; ``""`` means the current
            working directory.
        precision: quantization step multiplied back onto the integer
            coordinates when decoding.
        pin_memory: if true, every returned tensor is pinned via
            ``Tensor.pin_memory()``.
        prefix: optional file-name prefix. The offline pipeline in this file
            writes its files with a ``'pseudomini.'`` prefix, so pass
            ``prefix='pseudomini.'`` to read those.
    """

    def __init__(self, base_dir, precision=0.001, pin_memory=False, prefix=""):
        # os.path.join(x, "") appends a trailing separator when needed and
        # keeps "" as "" (current directory); the former rstrip("/") + "/"
        # turned an empty base_dir into the filesystem root.
        self.base = os.path.join(base_dir, "")
        self.prefix = prefix
        self.precision = precision
        self.pin_memory = pin_memory

    def __len__(self):
        # In this example, only one sample
        return 1

    def _path(self, name):
        """Return the on-disk path of the data file called ``name``."""
        return self.base + self.prefix + name

    def _as_tensor(self, data, dtype):
        """Copy ``data`` into a tensor of ``dtype``, pinned when requested."""
        tensor = torch.tensor(data, dtype=dtype)
        return tensor.pin_memory() if self.pin_memory else tensor

    def __getitem__(self, idx):
        """Decode the sample and return ``(zs, xs, ys)``.

        Returns:
            zs: list with one int32 tensor of element codes per segment.
            xs: list with one float32 ``(points, 3)`` coordinate tensor per
                segment.
            ys: list with one 0-d float32 tensor per entry of the scalars
                file.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
            ValueError: if the decompressed blob size does not match the
                point count implied by ``segment_lengths.npy``.
        """
        # Without this, the legacy iteration protocol (e.g. ``list(ds)``)
        # would call __getitem__ forever because nothing ever stops it.
        size = len(self)
        if not -size <= idx < size:
            raise IndexError(f"index {idx} out of range for dataset of length {size}")

        # --- Load metadata ---
        seg_lens = np.load(self._path('segment_lengths.npy'))      # (M,)
        scalars = np.load(self._path('per_segment_scalars.npy'))   # (M,)

        # --- Decompress blob ---
        with open(self._path('coords_elms.zlib'), 'rb') as f:
            blob = zlib.decompress(f.read())

        # --- Split streams ---
        n_points = int(seg_lens.sum())
        coord_len = n_points * 3 * np.dtype(np.int16).itemsize
        elm_len = n_points * np.dtype(np.uint8).itemsize
        # The element stream starts right after the coordinate stream, so a
        # wrong point count would decode coordinate bytes as element codes
        # (or truncate silently). Fail loudly instead.
        if len(blob) != coord_len + elm_len:
            raise ValueError(
                f"blob holds {len(blob)} bytes but segment lengths imply "
                f"{coord_len + elm_len} bytes for {n_points} points"
            )
        coord_bytes = blob[:coord_len]
        elm_bytes = blob[coord_len:]

        # --- Reconstruct coords: undo the delta encoding, then rescale ---
        deltas = np.frombuffer(coord_bytes, dtype=np.int16).reshape(n_points, 3)
        quantized = np.cumsum(deltas.astype(np.int32), axis=0)
        coords_flat = quantized.astype(np.float32) * self.precision  # (N, 3)

        # --- Reconstruct elements ---
        # frombuffer over bytes yields a read-only view; astype makes a
        # writable int32 copy before it is handed to torch.
        elms_flat = np.frombuffer(elm_bytes, dtype=np.uint8).astype(np.int32)  # (N,)

        # --- Split by segment_lengths ---
        stops = np.cumsum(seg_lens, dtype=np.int64)
        starts = stops - seg_lens
        bounds = list(zip(starts, stops))

        # --- To torch tensors ---
        zs = [self._as_tensor(elms_flat[lo:hi], torch.int32) for lo, hi in bounds]
        xs = [self._as_tensor(coords_flat[lo:hi], torch.float32) for lo, hi in bounds]
        ys = [self._as_tensor(value, torch.float32) for value in scalars]

        return zs, xs, ys

# Helper to get a DataLoader
def get_loader(base_dir, batch_size=1, pin_memory=False):
    """Build a non-shuffling DataLoader over the dataset stored in ``base_dir``.

    ``pin_memory`` is forwarded both to the CompressedCoordsDataset and to
    the DataLoader itself.
    """
    dataset = CompressedCoordsDataset(base_dir, pin_memory=pin_memory)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": False,
        "pin_memory": pin_memory,
    }
    return DataLoader(dataset, **loader_options)

# ---------------------------------------
# 3) Example usage
# ---------------------------------------
if __name__ == "__main__":
    # Pull only the first batch from a loader rooted at the current directory
    # and dump its three components.
    demo_loader = get_loader(".", batch_size=1, pin_memory=False)
    first_batch = next(iter(demo_loader), None)
    if first_batch is not None:
        zs, xs, ys = first_batch
        print("data loader:")
        for label, value in (("zs:", zs), ("xs:", xs), ("ys:", ys)):
            print(label, value)


Compressed size: 21 bytes
data loader:
zs: [tensor([[6, 7]], dtype=torch.int32), tensor([[8]], dtype=torch.int32)]
xs: [tensor([[[0.0000, 1.0000, 2.0000],
         [3.0000, 4.0000, 5.0000]]]), tensor([[[6.0000, 7.0000, 8.0000]]])]
ys: [tensor([0.5000]), tensor([1.2000])]


Takes about 0.2 s to execute on an aging 2015 MacBook Pro.