The size of the dataset was one issue I ran into while analyzing the competition data. Due to a memory shortage, it was difficult to compute several of the dataset's important properties. This notebook's objective is to compute a few features chunkwise and store those results as its output. These properties can then be included into models or used for EDA.

In [1]:
# sys imports
import os
from pathlib import Path
import gc

# data imports
import h5py
import hdf5plugin
import numpy as np
import pandas as pd

We will use the GPU to speed up calculations by a factor of almost 5.

In [2]:
#create input file list
f_list = list(Path('../input/mmscel-data-transposed/').glob('./*.h5'))
f_list

[PosixPath('../input/mmscel-data-transposed/test_multi_inputs.h5'),
 PosixPath('../input/mmscel-data-transposed/train_cite_targets.h5'),
 PosixPath('../input/mmscel-data-transposed/train_multi_inputs.h5'),
 PosixPath('../input/mmscel-data-transposed/train_cite_inputs.h5'),
 PosixPath('../input/mmscel-data-transposed/train_multi_targets.h5'),
 PosixPath('../input/mmscel-data-transposed/test_cite_inputs.h5')]

In [3]:
# define output directory
out_dir = Path('../output/')
out_dir

PosixPath('../output')

In [4]:
for f_name in f_list:
    print(f"### OPERATING ON: {f_name} ###")
    
    # Get properties of dataset
    with h5py.File(f_name, 'r') as f:
        cell_names = [a.decode('utf-8') for a in f['cells']]
        feature_names = [a.decode('utf-8') for a in f['features']]
        
    num_features = len(feature_names)
    num_cells = len(cell_names)
    
    batchsize = int(200_000_000 / (num_cells))
    iterations = int(np.ceil(num_features/batchsize))
    
    # the features we want to calculate for every cell
    target_features = {
        'count_non_zero': np.zeros(num_features, dtype=np.int32),
        'max_value': np.zeros(num_features, dtype=np.float64),
        'min_value': np.zeros(num_features, dtype=np.float64),
        'sum_values': np.zeros(num_features, dtype=np.float64),
        'mean_non_zero': np.zeros(num_features, dtype=np.float64),
        'std_dev_non_zero': np.zeros(num_features, dtype=np.float64),  
    }
    
    # iterate over dataset calculating features
    for i in range(iterations):
        ###### SPECIFICATION WHAT DATA TO LOAD AND LOADING OF DATA ONTO DEVICE (CPU OR CUDA) #########
        S_INDEX = i * batchsize
        E_INDEX = (i+1) * batchsize

        # load data and send to torch device
        with h5py.File(f_name, 'r') as f:
            data = np.array(f['values'][S_INDEX : E_INDEX])

        ##### CALCULATION OF FEATURES #####
        target_features['count_non_zero'][S_INDEX:E_INDEX] = (data > 0).sum(axis=1)
        target_features['max_value'][S_INDEX:E_INDEX] = data.max(axis=1)
        target_features['min_value'][S_INDEX:E_INDEX] = data.min(axis=1)
        target_features['sum_values'][S_INDEX:E_INDEX] = data.sum(axis=1)
        target_features['mean_non_zero'][S_INDEX:E_INDEX] = data.mean(axis=1, where=(data>0))
        target_features['std_dev_non_zero'][S_INDEX:E_INDEX] = data.std(axis=1, where=(data>0))
        # target_features['feature'][S_INDEX:E_INDEX] = 
    
    # calculations done, define index, build dataframe and safe as csv
    df = pd.DataFrame(data=target_features, index=feature_names)
    df.to_csv(out_dir / (f_name.stem + '_features_feature.csv'))

### OPERATING ON: ../input/mmscel-data-transposed/test_multi_inputs.h5 ###


  target_features['mean_non_zero'][S_INDEX:E_INDEX] = data.mean(axis=1, where=(data>0))
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = um.true_divide(


### OPERATING ON: ../input/mmscel-data-transposed/train_cite_targets.h5 ###
### OPERATING ON: ../input/mmscel-data-transposed/train_multi_inputs.h5 ###


  target_features['mean_non_zero'][S_INDEX:E_INDEX] = data.mean(axis=1, where=(data>0))
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = um.true_divide(


### OPERATING ON: ../input/mmscel-data-transposed/train_cite_inputs.h5 ###


  target_features['mean_non_zero'][S_INDEX:E_INDEX] = data.mean(axis=1, where=(data>0))
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = um.true_divide(


### OPERATING ON: ../input/mmscel-data-transposed/train_multi_targets.h5 ###


  target_features['mean_non_zero'][S_INDEX:E_INDEX] = data.mean(axis=1, where=(data>0))
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = um.true_divide(


### OPERATING ON: ../input/mmscel-data-transposed/test_cite_inputs.h5 ###


  target_features['mean_non_zero'][S_INDEX:E_INDEX] = data.mean(axis=1, where=(data>0))
  ret = um.true_divide(
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = um.true_divide(
