Generates a large HDF5 dataset with `h5py` and NumPy, then runs out-of-core experiments on it using `dask`.

In [None]:
%pip install h5py dask

In [None]:
# generate dataset ~100GB
import h5py
import numpy as np
from tqdm import tqdm

# Dataset geometry: a (dim1, dim2, dim3) boolean array.
dim1 = 10
dim2 = 1_000_000
dim3 = 10_000
# Number of dim2 rows assembled in memory and written per HDF5 write.
batch_size = 10_000

with h5py.File("mytestfile.hdf5", "w") as f:
    dset = f.create_dataset("mydataset", (dim1, dim2, dim3), dtype=bool)

    # Outer progress bar: one tick per dim1 slice.
    slice_bar = tqdm(range(dim1), desc="Processing dim1", unit="slice")
    for slice_idx in slice_bar:
        # One random dim3 position per dim2 row, drawn once for the whole slice.
        hot_columns = np.random.randint(0, dim3, size=dim2)

        # Inner progress bar: one tick per batch of rows; removed when finished.
        batch_bar = tqdm(
            range(0, dim2, batch_size),
            desc=f"dim1={slice_idx}",
            unit="batch",
            leave=False,
        )
        for row_start in batch_bar:
            # Clamp so a trailing partial batch never runs past dim2.
            row_stop = min(row_start + batch_size, dim2)
            n_rows = row_stop - row_start

            # Each row of the batch gets exactly one True, at its drawn position.
            one_hot = np.zeros((n_rows, dim3), dtype=bool)
            one_hot[np.arange(n_rows), hot_columns[row_start:row_stop]] = True

            # Persist this batch of rows into the current slice.
            dset[slice_idx, row_start:row_stop] = one_hot

Processing dim1: 100%|██████████| 10/10 [02:17<00:00, 13.72s/slice]


In [5]:
import dask.array as da
import h5py
# Reduce the on-disk array over axis 1, then normalise each row of the result.
# The file must stay open until .compute() has run, because dask reads lazily.
with h5py.File("mytestfile.hdf5", "r") as f:
    dset = f["mydataset"]

    # Wrap the HDF5 dataset lazily; nothing is read until .compute().
    test_crowd_matrix = da.from_array(dset, chunks=(10, 10_000, 10_000))

    # Count the True entries along axis 1.
    counts = test_crowd_matrix.sum(axis=1)

    # Per-row totals of those counts, kept as a column so they broadcast.
    tdim = counts.sum(1, keepdims=True)

    # Divide each row by its total; rows whose total is zero become 0.
    normalized = da.where(tdim > 0, counts / tdim, 0)
    T = normalized.compute()


In [None]:
# test_crowd_matrix sparsity
import h5py
import dask.array as da

# Report the fraction of entries in the stored array that are zero/False.
with h5py.File("mytestfile.hdf5", "r") as f:
    dset = f["mydataset"]

    # Lazy dask view over the HDF5 dataset.
    test_crowd_matrix = da.from_array(dset, chunks=(10, 1_000, 1_000))

    # Fraction of non-zero entries, still lazy at this point.
    fill_fraction = da.count_nonzero(test_crowd_matrix) / test_crowd_matrix.size

    # Sparsity is the complement of the fill fraction; .compute() triggers the read.
    sparsity = 1 - fill_fraction.compute()
    print(sparsity)