In [17]:
import numpy as np
import zstandard as zstd
import struct

# 1) Quantize your float32 coords to int32 without loss:
#    We’ll reinterpret the 32-bit float bits as a 32-bit int
#    so that decode(encode(f))→exact bits back.
coordsog = np.random.randn(10000,3).astype('f4')   # example data
qi = coords.view(np.int32).reshape(-1)           # flatten to 1D int32

# 2) Delta-encode the int sequence:
#    diffs[i] = qi[i] - qi[i-1], with diffs[0] = qi[0]
diffs = np.empty_like(qi)
diffs[0] = qi[0]
diffs[1:] = qi[1:] - qi[:-1]

# 3) Varint-encode the diffs (LEB128 unsigned for non-negative, zigzag for signed):
def zigzag_encode(n):
    """Map signed int → unsigned for varint (zig-zag)."""
    return (n << 1) ^ (n >> 31)

def varint_encode(values):
    """Encode an array of non-negative ints into varints."""
    buf = bytearray()
    for v in values:
        x = v
        while True:
            to_write = x & 0x7F
            x >>= 7
            if x:
                buf.append(0x80 | to_write)
            else:
                buf.append(to_write)
                break
    return buf

# Apply zigzag to handle negative diffs:
zz = np.vectorize(zigzag_encode, otypes=[np.int64])(diffs)
# Then varint-encode:
coord_stream = varint_encode(zz.tolist())

# 4) Varint-encode your other uint fields (e.g. segment IDs, element codes):
seg_ids = np.random.randint(0, 256, size=1000, dtype=np.uint8)
elm_codes = np.random.randint(0, 64, size=10000, dtype=np.uint8)

# simple concatenation: [seg0, seg1, …] then [elm0, elm1, …]
# since they’re small uint8, varint is overkill—just raw bytes:
seg_stream = bytes(seg_ids.tolist())
elm_stream = bytes(elm_codes.tolist())

# 5) Concatenate into one byte blob:
blob = coord_stream + elm_stream

# 6) Compress with Zstandard:
cctx = zstd.ZstdCompressor(level=10)
compressed = cctx.compress(blob)
np.save('segment_ids.npy', seg_ids)
# Save to disk:
with open('mol.methodB2.zst', 'wb') as f:
    f.write(compressed)

#137

In [33]:
counter=1

In [51]:
ß

Compressed blob size: 27 bytes


In [52]:
for a in train_loader:
    print(a)

[[tensor([[6, 7]], dtype=torch.int32), tensor([[8]], dtype=torch.int32)], [tensor([[[0.0000, 1.0000, 2.0000],
         [3.0000, 4.0000, 5.0000]]]), tensor([[[6.0000, 7.0000, 8.0000]]])], [tensor([0.5000]), tensor([1.2000])]]


In [31]:
import numpy as np

# --- Suppose this is your original nested data: ---
all_pdbs = [
    [   # PDB 0: 2 segments
        np.random.rand(5,3),   # segment 0: 5 atoms
        np.random.rand(3,3)    # segment 1: 3 atoms
    ],
    [   # PDB 1: 1 segment
        np.random.rand(4,3)    # segment 0: 4 atoms
    ],
    # … more PDBs …
]

# 1) Build length arrays
pdb_segment_counts = np.array([len(pdb) for pdb in all_pdbs], dtype=np.uint16)
segment_lengths    = np.array(
    [ seg.shape[0] for pdb in all_pdbs for seg in pdb ],
    dtype=np.uint16
)

# 2) Flatten coords
coords_flat = np.vstack([ np.vstack(pdb) for pdb in all_pdbs ])  # (N_total,3)

# 3) RECONSTRUCT: split into segments
seg_offsets = np.concatenate([[0], np.cumsum(segment_lengths)]).astype(int)
segments = [
    coords_flat[ seg_offsets[k] : seg_offsets[k+1] ]
    for k in range(len(segment_lengths))
]

# 4) RECONSTRUCT: group segments into PDBs
pdb_offsets = np.concatenate([[0], np.cumsum(pdb_segment_counts)]).astype(int)
recovered_pdbs = [
    segments[ pdb_offsets[i] : pdb_offsets[i+1] ]
    for i in range(len(pdb_segment_counts))
]

# Verify round-trip
for orig_pdb, rec_pdb in zip(all_pdbs, recovered_pdbs):
    for orig_seg, rec_seg in zip(orig_pdb, rec_pdb):
        assert np.allclose(orig_seg, rec_seg)
print("Round-trip successful!")


Round-trip successful!


In [32]:
recovered_pdbs

[[array([[0.59556396, 0.09375599, 0.89481776],
         [0.9041849 , 0.79032441, 0.71217083],
         [0.25738574, 0.4375951 , 0.80362118],
         [0.06316097, 0.34866813, 0.76180718],
         [0.80731694, 0.2019121 , 0.70084665]]),
  array([[0.13122803, 0.157987  , 0.9748602 ],
         [0.68347895, 0.30285273, 0.40699109],
         [0.24661408, 0.91203201, 0.72042678]])],
 [array([[0.87792898, 0.66301325, 0.91495486],
         [0.00524254, 0.62356855, 0.7082905 ],
         [0.80699906, 0.3634751 , 0.93321321],
         [0.63441355, 0.61855422, 0.18708782]])]]

In [18]:
#82
import numpy as np, zstandard as zstd

# coords: N×3 float32
precision = 0.001                   # 1 unit = 0.001 Å
qi = np.round(coordsog / precision).astype(np.int32)

# delta‐encode in place (so small successive diffs)
diffs = qi.copy()
diffs[1:] -= qi[:-1]

# pack raw int32 bytes
buf = diffs.tobytes() + bytes(elm_codes.tolist())

# compress with zstd at high level
cctx = zstd.ZstdCompressor(level=19)
with open('mol_fixed_delta2.zst','wb') as f:
    f.write(cctx.compress(buf))

np.save('segment_ids2.npy', seg_ids)


In [7]:
np.savez_compressed("coordstest2",coords=diffs.tobytes(), ids=seg_ids.tobytes(),zs=bytes(elm_codes.tolist()))

In [11]:
np.load("./coordstest2.npz")["coords"]

array(b'\xbd\x00\x00\x00\x14\xfc\xff\xffV\xff\xff\xffl\xf9\xff\xff\x10\x01\x00\x00]\xfd\xff\xff\xd6\x02\x00\x00\x12\t\x00\x00Y\x00\x00\x00\xac\x04\x00\x00n\xfc\xff\xff)\xfc\xff\xff\xa4\xff\xff\xff\x1f\x03\x00\x00\x1f\x01\x00\x00\xcc\x05\x00\x00\xe0\xfb\xff\xffF\x0c\x00\x00\x99\xfa\xff\xff\\\xfc\xff\xff\xcc\xf1\xff\xff\xc0\xf6\xff\xffA\x01\x00\x002\x05\x00\x00\xa1\x06\x00\x00\xf5\xfd\xff\xff\r\x00\x00\x00\xd1\x00\x00\x00\xa7\xff\xff\xff&\x02\x00\x00\x82\x01\x00\x00\xbf\xfe\xff\xff\x13\x02\x00\x00H\x04\x00\x00\xc7\x07\x00\x00\x1b\xf8\xff\xffQ\xf5\xff\xff\xb5\xfc\xff\xff\xe0\n\x00\x00\xfd\t\x00\x00A\x01\x00\x00\xef\xfb\xff\xff\xff\xfc\xff\xff\x84\xfb\xff\xff\x86\xfd\xff\xff\x18\xfb\xff\xff9\x01\x00\x00\x07\xfd\xff\xff\x01\x03\x00\x00B\x03\x00\x009\xfd\xff\xff,\xff\xff\xff\xe6\xfb\xff\xff\xdc\x06\x00\x00O\xfa\xff\xff"\xff\xff\xff\xf7\x03\x00\x00\x1c\n\x00\x00\xf3\x00\x00\x00e\xf9\xff\xff\xae\xf8\xff\xff\xc7\x03\x00\x00j\x08\x00\x00\x0b\x02\x00\x00\xfe\xff\xff\xff\xd8\xfc\xff\xff\xf0\xff\xf

In [19]:
import numpy as np
import zlib

# --- User parameters (adjust as needed) ---
N_atoms        = 10000                   # total number of atoms
precision      = 0.001                   # fixed-point scale (1 unit = 0.001 Å)
coord_dtype    = np.int32                # type used for quantized deltas
seg_dtype      = np.uint8                # type used for segment indices
elm_dtype      = np.uint8                # type used for element codes

with open('mol_fixed_delta2.zst', 'rb') as f:
    compressed = f.read()

dctx = zstd.ZstdDecompressor()
decompressed = dctx.decompress(compressed)

# --- 2) Split streams by known byte-lengths ---
coord_bytes_len = N_atoms * 3 * np.dtype(coord_dtype).itemsize
coord_bytes = decompressed[:coord_bytes_len]
start = coord_bytes_len
seg_bytes = decompressed[start:start + N_atoms * np.dtype(seg_dtype).itemsize]
elm_bytes = decompressed[start + N_atoms * np.dtype(seg_dtype).itemsize:
                         start + N_atoms * (np.dtype(seg_dtype).itemsize + np.dtype(elm_dtype).itemsize)]

# --- 3) Reconstruct coordinates ---
deltas = np.frombuffer(coord_bytes, dtype=coord_dtype).reshape(N_atoms, 3)
qi     = np.cumsum(deltas, axis=0)               # quantized ints
coords = qi.astype(np.float64) * precision       # back to Å

# --- 4) Recover segment & element arrays ---
seg_idxs    = np.frombuffer(seg_bytes, dtype=seg_dtype)
elm_codes   = np.frombuffer(elm_bytes, dtype=elm_dtype)

# --- 5) Load your segment ID table & map back ---
segment_ids     = np.load('segment_ids.npy', allow_pickle=True).astype(str)
atom_segment_ids = segment_ids[seg_idxs]

# Now you have:
# - coords            : (N_atoms, 3) float64 array of XYZ in Å
# - atom_segment_ids  : (N_atoms,) array of '33A3', '67B0', …
# - elm_codes         : (N_atoms,) uint8 element codes

print(f"Loaded {coords.shape[0]} atoms with full fidelity.")

Loaded 10000 atoms with full fidelity.


In [20]:
coords

array([[ 0.94 ,  2.014, -0.405],
       [ 0.213, -0.271, -0.647],
       [ 0.25 , -1.895,  0.456],
       ...,
       [ 0.981,  0.342, -0.015],
       [-2.443,  0.021, -0.226],
       [ 0.858,  0.477, -0.046]])

In [22]:
coordsog

array([[ 0.94017744,  2.0140045 , -0.4054155 ],
       [ 0.2126579 , -0.27116695, -0.6470783 ],
       [ 0.25036946, -1.8948692 ,  0.45640743],
       ...,
       [ 0.9812382 ,  0.3418294 , -0.01498962],
       [-2.443291  ,  0.02130226, -0.22580786],
       [ 0.8579062 ,  0.47687083, -0.04556439]], dtype=float32)