In [1]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import ipywidgets as widgets
from ipywidgets import interact, IntSlider, Dropdown

# Set up plotting
plt.style.use('default')
%matplotlib inline

# Path of the input file; every function below that opens the data reads this
# module-level name via h5py.File(filename, 'r').
# NOTE(review): absolute, machine-specific path - update this before running elsewhere.
filename = "/home/kronberger/Downloads/LineDataNewNew128.mat"

In [2]:
def explore_structure_safe(filename):
    """Print and return per-dataset metadata of an HDF5 file without reading array data.

    Every object in the file is visited with ``visititems``; for each
    ``h5py.Dataset`` only ``shape`` and ``dtype`` are inspected, so no array
    contents are loaded.

    Args:
        filename: Path of the HDF5 file, opened read-only.

    Returns:
        dict mapping each dataset's path inside the file to a dict with the
        keys ``'shape'`` (tuple), ``'dtype'`` (str) and ``'size_mb'`` (float,
        shape product times item size, in MiB).

        No dataset handle is stored in the result: the file is closed before
        this function returns, so any handle kept from inside the ``with``
        block would be unusable. Re-open the file and index it by name instead.
    """
    print("=" * 50)
    print(f"EXPLORING: {filename}")
    print("=" * 50)

    datasets_info = {}

    def collect_info_safe(name, obj):
        # Groups and other non-dataset objects carry no array metadata.
        # Returning None (implicitly) tells visititems to keep walking.
        if not isinstance(obj, h5py.Dataset):
            return

        shape = obj.shape
        dtype = obj.dtype
        # Force a 64-bit accumulator so the element count cannot overflow on
        # platforms whose default NumPy integer is 32 bits wide.
        n_elements = int(np.prod(shape, dtype=np.int64))
        size_mb = n_elements * dtype.itemsize / (1024**2)

        datasets_info[name] = {
            'shape': shape,
            'dtype': str(dtype),
            'size_mb': size_mb,
        }

        print(f"\n📊 {name}")
        print(f"   Shape: {shape}")
        print(f"   Type: {dtype}")
        print(f"   Size: {size_mb:.1f} MB")

        # Interpret shapes for STM/AFM data
        ndim = len(shape)
        if ndim == 2 and shape[0] == shape[1]:
            print(f"   🖼️  Single {shape[0]}×{shape[1]} image/map")
        elif ndim == 3:
            if shape[0] == shape[1]:  # spatial dimensions equal
                print(f"   📚 Stack of {shape[2]} images, each {shape[0]}×{shape[1]} pixels")
            else:
                print(f"   📊 3D data: {shape}")
        elif ndim == 4:
            print(f"   🗂️  4D dataset (possibly multiple channels/conditions): {shape}")

        # Warn about large datasets
        if size_mb > 100:
            print("   ⚠️  Large dataset - use slicing for exploration")

    with h5py.File(filename, 'r') as f:
        f.visititems(collect_info_safe)

    total_size = sum(info['size_mb'] for info in datasets_info.values())
    print("\n📈 SUMMARY:")
    print(f"   Total datasets: {len(datasets_info)}")
    print(f"   Total size: {total_size:.1f} MB")

    return datasets_info

# Run safe exploration. The returned metadata dict is the input of
# quick_preview, create_slice_explorer and export_selected_slices below.
datasets_info = explore_structure_safe(filename)

EXPLORING: /home/kronberger/Downloads/LineDataNewNew128.mat

📊 x_raw
   Shape: (128, 2423296)
   Type: float64
   Size: 2366.5 MB
   ⚠️  Large dataset - use slicing for exploration

📊 y
   Shape: (6, 2423296)
   Type: float64
   Size: 110.9 MB
   ⚠️  Large dataset - use slicing for exploration

📈 SUMMARY:
   Total datasets: 2
   Total size: 2477.4 MB


In [None]:
def _preview_steps(shape, max_elements):
    """Return (row_step, col_step) that shrink a 2D ``shape`` to at most ``max_elements`` cells."""
    steps = [1, 1]
    short_axis, long_axis = (0, 1) if shape[0] <= shape[1] else (1, 0)
    # -(-a // b) is ceiling division on integers.
    # The short axis is only thinned when it alone already exceeds the budget.
    steps[short_axis] = max(1, -(-shape[short_axis] // max_elements))
    kept_short = -(-shape[short_axis] // steps[short_axis])
    # Whatever budget is left per short-axis entry goes to the long axis.
    long_budget = max(1, max_elements // max(1, kept_short))
    steps[long_axis] = max(1, -(-shape[long_axis] // long_budget))
    return tuple(steps)


def sample_large_dataset(dataset_obj, max_samples=5, max_elements=1_000_000):
    """Read a small, representative part of a dataset for previewing.

    Args:
        dataset_obj: Array-like with a ``shape`` attribute that supports
            basic slicing (an ``h5py.Dataset`` or a NumPy array).
        max_samples: For 3D/4D input, the maximum number of slices taken
            along the last axis.
        max_elements: For 2D input, the maximum number of elements read.
            Larger datasets are subsampled with a regular stride instead of
            being loaded whole. Pass ``None`` to always read everything.

    Returns:
        * 2D input: a 2D array - the full data if it has at most
          ``max_elements`` elements, otherwise ``dataset_obj[::r, ::c]`` with
          strides chosen so the result fits the budget.
        * 3D/4D input: a dict ``{"slice_<idx>": array}`` holding up to
          ``max_samples`` evenly spaced slices along the last axis.
        * Any other rank: ``None`` (a message is printed).

    Raises:
        ValueError: If ``max_elements`` is given and smaller than 1.
    """
    shape = dataset_obj.shape
    ndim = len(shape)

    if ndim == 2:
        if max_elements is None:
            return dataset_obj[:]
        if max_elements < 1:
            raise ValueError("max_elements must be at least 1 or None")
        if shape[0] * shape[1] <= max_elements:
            return dataset_obj[:]
        # Too large to load whole: a strided read bounds memory use while
        # still covering the full extent of both axes.
        row_step, col_step = _preview_steps(shape, max_elements)
        return dataset_obj[::row_step, ::col_step]

    if ndim in (3, 4):
        n_slices = shape[-1]
        if n_slices <= max_samples:
            indices = range(n_slices)
        else:
            # Sample evenly distributed indices
            indices = np.linspace(0, n_slices - 1, max_samples, dtype=int)

        # Keep every leading axis whole and pick one index on the last axis.
        leading = (slice(None),) * (ndim - 1)
        return {
            f"slice_{int(idx)}": dataset_obj[leading + (int(idx),)]
            for idx in indices
        }

    print(f"Unsupported shape: {shape}")
    return None

def quick_preview(datasets_info):
    """Plot a quick image preview for every dataset named in ``datasets_info``.

    The file at the module-level ``filename`` is opened read-only and each
    dataset is passed to ``sample_large_dataset`` with ``max_samples=3``.
    A dict result is drawn as one row of panels (one per slice); any other
    non-``None`` result is drawn as a single image. Datasets for which the
    sampler returns ``None`` are skipped.

    Args:
        datasets_info: Mapping whose keys are dataset paths inside the file.
    """
    with h5py.File(filename, 'r') as h5_file:
        for name, _ in datasets_info.items():
            print(f"\n🔍 Previewing: {name}")

            # Sample data safely
            preview = sample_large_dataset(h5_file[name], max_samples=3)
            if preview is None:
                continue

            if not isinstance(preview, dict):
                # Single sample
                fig, ax = plt.subplots(1, 1, figsize=(6, 6))
                image = ax.imshow(preview, cmap='viridis', aspect='auto')
                ax.set_title(f"{name}")
                plt.colorbar(image, ax=ax)
            else:
                # Multiple samples: one panel per slice
                count = len(preview)
                fig, axes = plt.subplots(1, count, figsize=(4*count, 4))
                # With a single column, subplots returns a bare Axes.
                panels = [axes] if count == 1 else axes

                for panel, (slice_name, data) in zip(panels, preview.items()):
                    if len(data.shape) == 2:
                        frame = data
                        label = slice_name
                    else:
                        # If still 3D, take middle slice
                        frame = data[:, :, data.shape[2] // 2]
                        label = f"{slice_name}_mid"

                    image = panel.imshow(frame, cmap='viridis', aspect='auto')
                    panel.set_title(f"{name}\n{label}")
                    plt.colorbar(image, ax=panel, shrink=0.8)

            plt.tight_layout()
            plt.show()

# Preview datasets: re-opens `filename` and plots a sample of each entry
# in datasets_info.
quick_preview(datasets_info)


🔍 Previewing: x_raw


In [None]:
def create_slice_explorer(datasets_info):
    """Create an interactive explorer for image stacks stored in ``filename``.

    Only datasets whose recorded shape has three or more dimensions are
    offered in the dropdown. For a 3D dataset a slider walks the last axis;
    for a 4D dataset the slider walks the last axis at the middle index of
    the third axis.

    The HDF5 file is opened inside each slider callback rather than once
    around the widget definition: ``interact`` keeps calling the callback
    on later slider moves, long after an enclosing ``with`` block would have
    closed the file and invalidated any dataset handle captured from it.

    Args:
        datasets_info: Mapping of dataset path -> metadata dict containing at
            least ``'shape'`` and ``'size_mb'``.

    Returns:
        None. Widgets and figures are displayed as a side effect.
    """
    dataset_names = [name for name, info in datasets_info.items()
                     if len(info['shape']) >= 3]

    if not dataset_names:
        print("No multi-dimensional datasets found for slicing")
        return

    @interact(
        dataset=Dropdown(options=dataset_names, description='Dataset:'),
        colormap=Dropdown(options=['viridis', 'hot', 'cool', 'plasma', 'gray'],
                          value='viridis', description='Colormap:')
    )
    def explore_slices(dataset, colormap):
        info = datasets_info[dataset]
        print(f"📊 Dataset: {dataset}")
        print(f"   Shape: {info['shape']}")
        print(f"   Size: {info['size_mb']:.1f} MB")

        # Only the shape is needed up front; the handle is not kept.
        with h5py.File(filename, 'r') as f:
            shape = f[dataset].shape

        if len(shape) not in (3, 4):
            print(f"   Slicing is only implemented for 3D and 4D datasets, got shape {shape}")
            return
        if shape[-1] == 0:
            # A slider with max < min cannot be constructed.
            print("   Dataset has no slices along its last axis")
            return

        max_slice = shape[-1] - 1

        if len(shape) == 3:

            @interact(slice_idx=IntSlider(min=0, max=max_slice, step=1, value=0))
            def show_slice(slice_idx):
                # Load only one slice at a time, from a freshly opened file
                with h5py.File(filename, 'r') as f:
                    data_slice = f[dataset][:, :, slice_idx]

                plt.figure(figsize=(8, 8))
                im = plt.imshow(data_slice, cmap=colormap, aspect='auto')
                plt.colorbar(im)
                plt.title(f"{dataset} - Slice {slice_idx}/{max_slice}")

                # Add statistics
                plt.figtext(0.02, 0.02,
                            f"Min: {np.min(data_slice):.3e}, Max: {np.max(data_slice):.3e}, Mean: {np.mean(data_slice):.3e}",
                            fontsize=8)
                plt.show()

        else:
            # For 4D, show middle of third dimension
            mid_3d = shape[2] // 2

            @interact(slice_idx=IntSlider(min=0, max=max_slice, step=1, value=0))
            def show_4d_slice(slice_idx):
                with h5py.File(filename, 'r') as f:
                    data_slice = f[dataset][:, :, mid_3d, slice_idx]

                plt.figure(figsize=(8, 8))
                im = plt.imshow(data_slice, cmap=colormap, aspect='auto')
                plt.colorbar(im)
                plt.title(f"{dataset} - 4D Slice {slice_idx}/{max_slice}")
                plt.show()

# Build the interactive explorer; only datasets with 3 or more dimensions
# are offered, otherwise a message is printed and nothing is shown.
create_slice_explorer(datasets_info)

In [None]:
def export_selected_slices(datasets_info, export_dir="./exported_data"):
    """Export 2D datasets and a few representative slices of larger ones as ``.npy``.

    The file at the module-level ``filename`` is opened read-only. For every
    dataset named in ``datasets_info``:

    * 2D: the whole dataset is written to ``<safe_name>.npy``. Note that this
      reads the complete dataset into memory first.
    * 3D and higher: the first, middle and last index of the last axis are
      written to ``<safe_name>_slice_<idx>.npy`` (duplicates are dropped when
      the axis has fewer than three entries). The first two axes are kept
      whole and any axes in between are fixed at index 0.
    * Fewer than two dimensions, or an empty last axis: skipped with a message.

    A ``metadata.json`` describing every written file is saved alongside.

    Args:
        datasets_info: Mapping whose keys are dataset paths inside the file.
        export_dir: Output directory; created if it does not exist.

    Returns:
        None.
    """
    import os
    import json

    os.makedirs(export_dir, exist_ok=True)

    metadata = {}

    with h5py.File(filename, 'r') as h5_file:
        for name in datasets_info:
            dataset_obj = h5_file[name]
            shape = dataset_obj.shape
            ndim = len(shape)

            # Dataset paths may contain '/' (groups) or spaces; flatten them
            # so the result is a single file name.
            safe_name = name.replace('/', '_').replace(' ', '_')

            if ndim < 2:
                print(f"⚠️  Skipped {name}: shape {shape} is not an image or image stack")
                continue

            if ndim == 2:
                # Single image - export directly
                np.save(os.path.join(export_dir, f"{safe_name}.npy"), dataset_obj[:])
                metadata[name] = {
                    'filename': f"{safe_name}.npy",
                    'type': 'single_image',
                    'shape': shape
                }
                print(f"✅ Exported single image: {name}")
                continue

            n_slices = shape[-1]
            if n_slices == 0:
                print(f"⚠️  Skipped {name}: no slices along the last axis")
                continue

            # First, middle and last slice; the set removes duplicates that
            # arise when the last axis has fewer than three entries.
            export_indices = sorted({0, n_slices // 2, n_slices - 1})
            # Keep the two leading axes whole and pin every axis between
            # them and the last one to index 0.
            index_prefix = (slice(None), slice(None)) + (0,) * (ndim - 3)

            for slice_idx in export_indices:
                slice_data = dataset_obj[index_prefix + (slice_idx,)]

                filename_slice = f"{safe_name}_slice_{slice_idx}.npy"
                np.save(os.path.join(export_dir, filename_slice), slice_data)

                metadata[f"{name}_slice_{slice_idx}"] = {
                    'filename': filename_slice,
                    'type': 'image_slice',
                    'original_shape': shape,
                    'slice_index': slice_idx,
                    'slice_shape': slice_data.shape
                }

            print(f"✅ Exported {len(export_indices)} slices from: {name}")

    # Save metadata
    with open(os.path.join(export_dir, 'metadata.json'), 'w') as meta_file:
        json.dump(metadata, meta_file, indent=2)

    print(f"\n📁 Selected data exported to: {export_dir}")

# Uncomment to export sample slices (writes .npy files and metadata.json
# into the default export_dir, "./exported_data")
# export_selected_slices(datasets_info)