
## Inspecting .datx / HDF5 Files
### This notebook will:
1. Open an HDF5-based `.datx` file.
2. Recursively traverse all groups and datasets.
3. Collect and display for each object:
    - Full path within the file (also used as the dict key)
    - Type (Group or Dataset)
    - Shape & dtype (for datasets)
    - Attributes (as key→value dict)
4. Present the results in Python dict and tabular form.

In [1]:
# Dependencies
import h5py
import json
from pprint import pprint
import pandas as pd
import numpy as np

## 1. Define a recursive inspection function
Walk the file and build a dictionary of metadata descriptors.

In [2]:
def inspect_hdf5(file_path, include_root=False):
    """
    Inspect an HDF5 file (.h5, .datx) and return a dict describing
    all groups and datasets, including attributes and dataset shapes/dtypes.

    Parameters
    ----------
    file_path : str or path-like
        Location of the HDF5 file. It is opened read-only and closed
        again before this function returns.
    include_root : bool, optional
        ``visititems`` only reports the members below the root group, so
        attributes attached to the file root are left out by default.
        Pass ``True`` to add a descriptor for the root under the key ``'/'``.

    Returns
    -------
    dict
        Maps each object's path to a descriptor dict with the keys
        ``'path'``, ``'type'`` and ``'attrs'``. Dataset descriptors also
        carry ``'shape'`` (tuple) and ``'dtype'`` (str). ``'type'`` is
        ``'Group'`` or ``'Dataset'``; any other kind of object (e.g. a
        committed datatype) is labelled with its class name instead of
        being reported as a dataset.
    """
    def _describe(name, obj):
        # Classify explicitly so non-dataset objects are not mislabelled.
        if isinstance(obj, h5py.Group):
            kind = 'Group'
        elif isinstance(obj, h5py.Dataset):
            kind = 'Dataset'
        else:
            kind = type(obj).__name__

        entry = {
            'path': name,
            'type': kind,
            # Copy the attributes now; the file is closed once we return.
            'attrs': dict(obj.attrs)
        }
        if isinstance(obj, h5py.Dataset):
            entry.update({
                'shape': obj.shape,
                'dtype': str(obj.dtype)
            })
        return entry

    descriptors = {}

    def _collect(name, obj):
        # Must return None: any other return value stops the traversal.
        descriptors[name] = _describe(name, obj)

    with h5py.File(file_path, 'r') as f:
        if include_root:
            descriptors['/'] = _describe('/', f)
        f.visititems(_collect)
    return descriptors

## 2. Load and inspect your `.datx` or `.h5` file
Replace the `file_name` path in the cell below with the path to your own `.datx` or `.h5` file.

In [3]:
# Example usage
# NOTE: machine-specific absolute path; it must point at an existing file
# before this cell is run.
file_name = '/Users/elbert/prysm_play/FS cuts/Middle.datx'  # ← update this path
# Build the path -> descriptor dict; `metadata` is read again when the
# DataFrame rows are built further down.
metadata = inspect_hdf5(file_name)

# Print a heading line, then pretty-print the complete descriptor dict.
print("### Full Metadata Descriptor Dict:")
pprint(metadata)


### Full Metadata Descriptor Dict:
{'Attributes': {'attrs': {'File Layout Version': array([1], dtype=int32)},
                'path': 'Attributes',
                'type': 'Group'},
 'Attributes/System': {'attrs': {},
                       'path': 'Attributes/System',
                       'type': 'Group'},
 'Attributes/{5CB51FA7-9361-4A66-AAB3-EE9EE1D96588}': {'attrs': {'Data Context.Data Attributes.AGC': array([0], dtype=uint8),
                                                                 'Data Context.Data Attributes.AStopID': array(['Open'], dtype=object),
                                                                 'Data Context.Data Attributes.AStopSize': array([((b'LinearCat', b'MilliMeters', array([0.])), 1.86)],
      dtype=[('Converter', {'names': ['Category', 'BaseUnit', 'Parameters'], 'formats': ['O', 'O', 'O'], 'offsets': [0, 8, 16], 'itemsize': 32}), ('Value', '<f8')]),
                                                                 'Data Context.Data Attribute

## 3. Convert to Pandas DataFrame for Tabular View
Create a table summarizing each object in the file.

In [4]:
# Helper to make attribute values JSON-serializable
def serialize_attr(value):
    """
    Convert an attribute value into plain Python objects for JSON encoding.

    The conversion is recursive:

    - numpy arrays are converted with ``tolist()`` and processed again,
    - lists and tuples become lists of converted elements,
    - numpy scalars are unwrapped with ``item()`` and processed again, so a
      ``np.bytes_`` scalar is decoded and a structured (``np.void``) scalar
      has each of its fields converted,
    - bytes are decoded to ``str``; undecodable bytes are dropped,
    - anything else is returned unchanged.

    Parameters
    ----------
    value : object
        The attribute value to convert.

    Returns
    -------
    object
        A structure of lists and Python scalars/strings wherever the input
        was built from the types listed above.
    """
    if isinstance(value, np.ndarray):
        return serialize_attr(value.tolist())
    if isinstance(value, (list, tuple)):
        return [serialize_attr(v) for v in value]
    # Check numpy scalars before bytes: np.bytes_ is both, and must be
    # unwrapped first so the plain bytes it yields are decoded below.
    if isinstance(value, np.generic):
        item = value.item()
        if isinstance(item, np.generic):
            # Some scalar types have no native Python equivalent and come
            # back as numpy scalars; stop here to avoid endless recursion.
            return item
        # The unwrapped item can itself be bytes or a tuple of fields.
        return serialize_attr(item)
    if isinstance(value, bytes):
        return value.decode(errors='ignore')
    return value

# Build DataFrame rows: one summary record per object found in the file
rows = []
for path, info in metadata.items():
    attrs_raw = info.get('attrs', {})
    attrs_serial = {k: serialize_attr(v) for k, v in attrs_raw.items()}
    rows.append({
        'Path': path,
        'Type': info['type'],
        # Only dataset descriptors carry shape/dtype; other rows stay blank.
        'Shape': info.get('shape', ''),
        'Dtype': info.get('dtype', ''),
        # default=str: values json cannot encode fall back to their string
        # form instead of raising TypeError and aborting the whole table.
        'Attrs': json.dumps(attrs_serial, ensure_ascii=False, default=str)
    })

# Create DataFrame; explicit columns keep the layout stable even when the
# file contains no objects and `rows` is empty.
df = pd.DataFrame(rows, columns=['Path', 'Type', 'Shape', 'Dtype', 'Attrs'])

df

Unnamed: 0,Path,Type,Shape,Dtype,Attrs
0,Attributes,Group,,,"{""File Layout Version"": [1]}"
1,Attributes/System,Group,,,{}
2,Attributes/{5CB51FA7-9361-4A66-AAB3-EE9EE1D96588},Group,,,"{""Data Context.Data Attributes.AGC"": [0], ""Dat..."
3,Data,Group,,,{}
4,Data/Intensity,Group,,,{}
5,Data/Intensity/{26EB0B6C-1F64-4AB9-BE85-C00707...,Dataset,"(1000, 1000)",int32,"{""Coordinates"": [[0, 0, 1000, 1000]], ""Group N..."
6,Data/Quality,Group,,,{}
7,Data/Quality/{D7DB3063-CC29-45D6-AE08-38703E18...,Dataset,"(1000, 1000)",float64,"{""Coordinates"": [[0, 0, 1000, 1000]], ""Group N..."
8,Data/Saturation Counts,Group,,,{}
9,Data/Saturation Counts/{0F6F4648-2693-46FA-8BB...,Dataset,"(1000, 1000)",int32,"{""Coordinates"": [[0, 0, 1000, 1000]], ""Group N..."
