In [1]:
import os
import gc
from pathlib import Path

import h5py
import hdf5plugin

import numpy as np
import torch
import pandas as pd

In [2]:
# Compute on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Files to analyse: every .h5 file in the competition folder except those
# whose name has "T" or "8" as the last character before the extension
# (that is what the `[!T8]` character class in the glob pattern excludes).
f_list = [*Path('../input/open-problems-multimodal/').glob('./*[!T8].h5')]
f_list

[PosixPath('../input/open-problems-multimodal/test_multi_inputs.h5'),
 PosixPath('../input/open-problems-multimodal/train_cite_targets.h5'),
 PosixPath('../input/open-problems-multimodal/train_multi_inputs.h5'),
 PosixPath('../input/open-problems-multimodal/train_cite_inputs.h5'),
 PosixPath('../input/open-problems-multimodal/train_multi_targets.h5'),
 PosixPath('../input/open-problems-multimodal/test_cite_inputs.h5')]

In [3]:
# Output directory: the processing loop further down writes one
# `<file stem>_cells_feature.csv` per input file into it.
# The path is relative to the notebook's working directory and is not created
# here — NOTE(review): confirm it exists before starting the long-running loop.
out_dir = Path('../working/')
# Bare expression so the notebook echoes the path as the cell output.
out_dir

PosixPath('../working')

In [4]:
# Helper Functions
def get_naive_n_tuples(field, max_depth):
    """Count, per row, the fully-set windows of every length 1 .. max_depth + 1.

    Entry ``k`` of the result holds, for each row of ``field``, the number of
    column positions at which ``k + 1`` consecutive entries are all set.
    The counts are "naive" because overlapping windows are counted separately:
    a run of three set entries adds 3 to entry 0, 2 to entry 1 and 1 to
    entry 2.

    Parameters
    ----------
    field : torch.Tensor
        2-D tensor (rows x columns) that contains only zeros and ones (or
        booleans). It must be a torch tensor, not an ``np.ndarray``: the
        ``dim=`` keyword and ``detach()`` used below are torch-only.
        The tensor is not modified.
    max_depth : int
        How many window lengths beyond length 1 are evaluated. Values below
        one yield only the length-1 counts.

    Returns
    -------
    list of torch.Tensor
        ``max_depth + 1`` one-dimensional tensors with one count per row.
        The list always has that length, even when ``field`` has fewer
        columns than the window (those counts are simply zero).
    """
    # windows of length one are just the set entries of each row
    naive_num_tuples = [field.sum(dim=1)]

    # window_starts[r, c] is set when the window of the current length that
    # starts at column c of row r is fully set. The name is only ever rebound
    # to a new product below, never written in place, so a detached view of
    # `field` is sufficient and no copy of the whole mask is needed.
    window_starts = field.detach()
    for shift in range(1, max_depth + 1):
        # a window of length shift + 1 is fully set when the window of length
        # `shift` with the same start is fully set AND the entry `shift`
        # columns to the right of the start is set as well
        window_starts = window_starts[:, :-1] * field[:, shift:]
        naive_num_tuples.append(window_starts.sum(dim=1))

    return naive_num_tuples

# The naive counts see every longer run several times: a 3-tuple also shows up
# as two 2-tuples and three 1-tuples, and so on. This function removes those
# extra contributions again.
def clean_naive_n_tuples(n_tuples):
    """Turn naive window counts into counts of runs of each exact length.

    ``n_tuples[k]`` is expected to hold, per row, the number of fully-set
    windows of length ``k + 1`` (the output of ``get_naive_n_tuples``).
    The result has the same layout, but entry ``k`` counts only the runs that
    are exactly ``k + 1`` long.

    The correction works from the longest length down: a run that is ``d``
    entries longer than the current length contains ``d + 1`` windows of the
    current length, so that many counts are subtracted per such run.

    The input tensors are cloned and left untouched.

    NOTE: the result is exact only while no run is longer than
    ``len(n_tuples)``. For longer runs the last entry counts windows instead
    of runs, and the subtraction can push shorter lengths below zero.
    """
    longest_first = list(reversed(n_tuples))

    exact_longest_first = []
    for pos, window_count in enumerate(longest_first):
        # private copy, so the in-place subtraction never touches the input
        remaining = window_count.clone().detach()

        # remove the windows that belong to the already-resolved longer runs
        for earlier, longer_runs in enumerate(exact_longest_first):
            remaining -= (pos + 1 - earlier) * longer_runs

        exact_longest_first.append(remaining)

    return list(reversed(exact_longest_first))

In [5]:
# The features we want to calculate for every cell: one zero-initialised
# buffer per feature, all with 105942 entries.
# NOTE(review): 105942 is a fixed row count, presumably the number of cells of
# one particular dataset; the processing loop further down rebinds
# `target_features` for each file with that file's own cell count.
target_features = {
    feature_name: torch.zeros(105942, dtype=feature_dtype, device=device)
    for feature_dtype, feature_names in (
        (torch.int32, ('count_non_zero',)),
        (torch.float64, ('max_value', 'min_value', 'sum_values', 'mean_non_zero')),
        # 'std_dev_non_zero' (float64) is intentionally left out for now
        # number of consecutive non-zero elements / n_tuples
        (torch.int32, (
            '1_tups', '2_tups', '3_tups', '4_tups',
            '5_tups', '6_tups', '7_tups', '8_tups',
            '9_tups', '10_tups', '11_tups', '12_tups',
            '13_tups', '14_tups', '15_tups', '16_tups',
        )),
    )
    for feature_name in feature_names
}

In [6]:
# ---- Per-cell summary features for every input file -------------------------
# For each .h5 file in `f_list`:
#   1. read the row names (`<stem>/axis1`) and the column count (`<stem>/axis0`),
#   2. walk over the value matrix (`<stem>/block0_values`) in row chunks and
#      fill one pre-allocated buffer per feature,
#   3. write all features to `<out_dir>/<stem>_cells_feature.csv`, indexed by
#      the row (cell) names.

# Longest run length that gets its own '<n>_tups' column.
MAX_TUPLE_LEN = 16
# Upper bound for the number of matrix elements loaded per chunk.
ELEMENTS_PER_CHUNK = 200_000_000

for f_name in f_list:
    print(f"### OPERATING ON: {f_name} ###")
    group = f_name.stem

    # The file is opened once per dataset and stays open while its chunks are
    # read, instead of being re-opened for every single chunk.
    with h5py.File(f_name, 'r') as f:
        # Get properties of dataset. `[:]` reads the whole name axis in one
        # go; iterating the h5py dataset itself fetches one element at a time.
        cell_names = [a.decode('utf-8') for a in f[group + '/axis1'][:]]
        NUM_FEATURES = len(f[group + '/axis0'])
        NUM_CELLS = len(cell_names)
        # Rows per chunk; at least one so that the chunk loop always advances,
        # even for matrices wider than ELEMENTS_PER_CHUNK.
        CHUNK_SIZE = max(1, ELEMENTS_PER_CHUNK // max(1, NUM_FEATURES))

        # the features we want to calculate for every cell
        target_features = {
            'count_non_zero': torch.zeros(NUM_CELLS, dtype=torch.int32, device=device),
            'max_value': torch.zeros(NUM_CELLS, dtype=torch.float64, device=device),
            'min_value': torch.zeros(NUM_CELLS, dtype=torch.float64, device=device),
            'sum_values': torch.zeros(NUM_CELLS, dtype=torch.float64, device=device),
            'mean_non_zero': torch.zeros(NUM_CELLS, dtype=torch.float64, device=device),
            # TODO: 'std_dev_non_zero' - torch offers no nanstd yet, so the
            # standard deviation of the non-zero values is not computed.
        }
        # number of runs of consecutive non-zero elements, one column per
        # run length: '1_tups' ... '<MAX_TUPLE_LEN>_tups'
        for n in range(1, MAX_TUPLE_LEN + 1):
            target_features[f'{n}_tups'] = torch.zeros(NUM_CELLS, dtype=torch.int32, device=device)

        # iterate over dataset calculating features
        for S_INDEX in range(0, NUM_CELLS, CHUNK_SIZE):
            E_INDEX = min(S_INDEX + CHUNK_SIZE, NUM_CELLS)

            # load the chunk and send it to the torch device (CPU or CUDA)
            data = torch.tensor(f[group + '/block0_values'][S_INDEX:E_INDEX], device=device)

            ##### CALCULATION OF FEATURES #####
            # "Non-zero" is taken literally (!= 0) so that negative values are
            # counted too; this keeps count_non_zero and the run counts
            # consistent with mean_non_zero, which only excludes exact zeros.
            # The mask must be taken BEFORE zeros are replaced by NaN below,
            # because NaN != 0 evaluates to True.
            non_zero = data.ne(0)

            target_features['count_non_zero'][S_INDEX:E_INDEX] = non_zero.sum(dim=1)
            target_features['max_value'][S_INDEX:E_INDEX] = data.max(dim=1)[0]
            target_features['min_value'][S_INDEX:E_INDEX] = data.min(dim=1)[0]
            target_features['sum_values'][S_INDEX:E_INDEX] = data.sum(dim=1)

            # Set zero values to NaN so that nanmean averages the non-zero
            # entries only (requires a floating-point matrix). Rows without
            # any non-zero entry end up with NaN as their mean.
            data[~non_zero] = torch.nan
            target_features['mean_non_zero'][S_INDEX:E_INDEX] = data.nanmean(dim=1)

            # Runs of consecutive non-zero entries. Counts are exact for runs
            # of up to MAX_TUPLE_LEN entries; see clean_naive_n_tuples.
            naive_n_tuples = get_naive_n_tuples(non_zero, MAX_TUPLE_LEN - 1)
            clean_tuples = clean_naive_n_tuples(naive_n_tuples)
            for run_length, run_counts in enumerate(clean_tuples, start=1):
                target_features[f'{run_length}_tups'][S_INDEX:E_INDEX] = run_counts

            # release this chunk before the next one is loaded to keep the
            # peak memory at roughly one chunk
            del data, non_zero
            gc.collect()

    # calculations done: move the buffers to host memory as numpy arrays (so
    # pandas gets plain numeric columns), build the dataframe and save as csv
    target_features_cpu = {key: value.cpu().numpy() for key, value in target_features.items()}
    df = pd.DataFrame(data=target_features_cpu, index=cell_names)
    df.to_csv(out_dir / (f_name.stem + '_cells_feature.csv'))

### OPERATING ON: ../input/open-problems-multimodal/test_multi_inputs.h5 ###
### OPERATING ON: ../input/open-problems-multimodal/train_cite_targets.h5 ###
### OPERATING ON: ../input/open-problems-multimodal/train_multi_inputs.h5 ###
### OPERATING ON: ../input/open-problems-multimodal/train_cite_inputs.h5 ###
### OPERATING ON: ../input/open-problems-multimodal/train_multi_targets.h5 ###
### OPERATING ON: ../input/open-problems-multimodal/test_cite_inputs.h5 ###
