
# Convert .mat to .npz

This notebook demonstrates how to convert MATLAB `.mat` files into compressed NumPy `.npz` files for easier use in Python training pipelines.

---

## Overview

- Load `.mat` files (supports v7 and v7.3/HDF5 formats).
- Extract the arrays needed for training: the `data` array and the sampling frequency `fs` (a default of 50 Hz is used when `fs` is absent).
- Optionally split into shards for large datasets.
- Save as `.npz` for efficient, portable storage.


In [1]:
import os
import numpy as np
import h5py
from scipy.io import loadmat
from pathlib import Path


In [2]:
def load_mat_file(path):
    """Load the variables stored in a MATLAB .mat file into a dict.

    ``scipy.io.loadmat`` is tried first. If it raises ``NotImplementedError``
    the file is treated as a v7.3 (HDF5) file and is read with h5py instead.

    Parameters
    ----------
    path : str or path-like
        Location of the .mat file.

    Returns
    -------
    dict
        Maps variable name to its value. For files read by ``loadmat`` the
        metadata keys starting with ``'__'`` are removed. For v7.3 files only
        top-level HDF5 datasets are returned, each converted with ``np.array``.

    Notes
    -----
    NOTE(review): arrays read through h5py usually come back with their axes
    in the reverse order of what ``loadmat`` returns for the same MATLAB
    variable -- confirm what downstream code expects before mixing formats.
    """
    try:
        raw = loadmat(path)
    except NotImplementedError:
        # v7.3 (HDF5) format
        variables = {}
        with h5py.File(path, 'r') as f:
            for name, item in f.items():
                if not isinstance(item, h5py.Dataset):
                    # Groups are containers, not arrays (presumably MATLAB
                    # structs/cells or the internal '#refs#' area); feeding
                    # them to np.array does not yield usable numeric data.
                    print(f"Skipping non-dataset HDF5 entry: {name!r}")
                    continue
                # Materialise the dataset while the file is still open.
                variables[name] = np.array(item)
        return variables

    # Remove meta keys such as '__header__' added by loadmat.
    return {k: v for k, v in raw.items() if not k.startswith('__')}


In [3]:
def convert_mat_to_npz(mat_path, out_dir, shard_size=None, shard_axis=1):
    """Convert a .mat file into one or more compressed .npz files.

    The file is read with ``load_mat_file``; its ``'data'`` variable and its
    ``'fs'`` variable are written with ``np.savez_compressed``. Other
    variables in the .mat file are not written.

    Parameters
    ----------
    mat_path : str or path-like
        Input .mat file. Its stem is used for the output file name(s).
    out_dir : str or path-like
        Output directory; created (with parents) if it does not exist.
    shard_size : int, optional
        If ``None`` (default) a single ``<stem>.npz`` is written. Otherwise
        ``data`` is split along ``shard_axis`` into chunks of at most
        ``shard_size`` entries, written as ``<stem>_shardNNN.npz``. ``fs`` is
        stored unchanged in every shard.
    shard_axis : int, optional
        Axis of ``data`` to split along. Defaults to 1, the axis the sharded
        path has always used.

    Raises
    ------
    KeyError
        If the .mat file has no ``'data'`` variable.
    ValueError
        If ``shard_size`` is not positive, or ``shard_axis`` is not a valid
        axis of ``data``.
    """
    mat_data = load_mat_file(mat_path)
    print("Keys in .mat file:", mat_data.keys())

    # Fail loudly instead of silently saving None (or crashing on None.shape).
    if 'data' not in mat_data:
        raise KeyError(
            f"'data' variable not found in {mat_path}; "
            f"available keys: {sorted(mat_data.keys())}"
        )
    data = mat_data['data']
    fs = mat_data.get('fs', np.array([50.0]))  # default 50 Hz if not provided

    # Validate sharding arguments before touching the filesystem.
    if shard_size is not None:
        if shard_size <= 0:
            raise ValueError(f"shard_size must be positive, got {shard_size}")
        if not -data.ndim <= shard_axis < data.ndim:
            raise ValueError(
                f"shard_axis {shard_axis} is out of range for data with "
                f"{data.ndim} dimension(s)"
            )

    stem = Path(mat_path).stem
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if shard_size is None:
        out_path = out_dir / (stem + ".npz")
        np.savez_compressed(out_path, data=data, fs=fs)
        print("Saved:", out_path)
        return

    n = data.shape[shard_axis]
    n_shards = int(np.ceil(n / shard_size))
    for i in range(n_shards):
        s, e = i * shard_size, min((i + 1) * shard_size, n)
        # Slice only along shard_axis so data of any dimensionality works,
        # not just 3-D arrays.
        index = [slice(None)] * data.ndim
        index[shard_axis] = slice(s, e)
        shard_path = out_dir / f"{stem}_shard{i:03d}.npz"
        np.savez_compressed(shard_path, data=data[tuple(index)], fs=fs)
        print(f"Saved shard {i+1}/{n_shards} -> {shard_path}")


In [5]:
# Input dataset. Alternative input, kept for reference:
# mat_file = "./example_dataset_for_training.mat"
mat_file = "./dataset_small_no_colored_noise_training.mat"
out_dir = "./"  # write the output into the current working directory

# Single-file conversion (no sharding)
convert_mat_to_npz(mat_path=mat_file, out_dir=out_dir)

# Sharded alternative (e.g., 1000 samples per shard):
# convert_mat_to_npz(mat_file, out_dir, shard_size=1000)


Keys in .mat file: dict_keys(['data', 'fs', 'time'])
Saved: dataset_small_no_colored_noise_training.npz
