# Data stages
01_raw <- Aligned and labeled TPA+RGB. Not handled by the project. 
02_intermediate <- Aligned and labeled TPA+RGB that was cropped. Crop parameters can vary. Not handled by the project.
03_procesed <- Intermediate cropped data after normalization. Input to the models.

# Processed data
I'm using dataset_npz_f100_fs0 consisting of three positive samples and one negative sample.

In [96]:
import numpy as np
import glob
import os

VERBOSE = True

DEST = os.path.join("..", "..", "data", "03_processed")
DATASET_NAME = "/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0"
pos_fp_list = glob.glob(os.path.join("..", "..", "data", "02_intermediate", DATASET_NAME, "*", "1", "*.npz"))
neg_fp_list = glob.glob(os.path.join("..", "..",  "data", "02_intermediate", DATASET_NAME, "*", "0", "*.npz"))

print("{} positive samples:".format(len(pos_fp_list)))
print(pos_fp_list)
print()
print("{} negative samples:".format(len(neg_fp_list)))
print(neg_fp_list)
del(pos_fp_list, neg_fp_list)

10 positive samples:
['/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1547_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1552_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1553_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1556_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1554_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1525_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1527_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1521_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1523_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1522_.npz']

18 nega

In form of the function:

In [45]:
def get_pos_neg_fp_lists(dataset_name):
    pos_fp_list = glob.glob(os.path.join("..", "..",  "data", "02_intermediate", dataset_name, "*", "1", "*.npz"))
    neg_fp_list = glob.glob(os.path.join("..",  "..", "data", "02_intermediate", dataset_name, "*", "0", "*.npz"))
    return pos_fp_list, neg_fp_list

Get all positive and negative samples (in lists).

In [46]:
pos_fp_list, neg_fp_list = get_pos_neg_fp_lists(DATASET_NAME)
print("{} positive samples:".format(len(pos_fp_list)))
print(pos_fp_list)
print()
print("{} negative samples:".format(len(neg_fp_list)))
print(neg_fp_list)

10 positive samples:
['/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1547_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1552_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1553_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1556_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1554_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1525_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1527_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1521_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1523_.npz', '/media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject1/1/20200508_1522_.npz']

18 nega

Now we need to read TPA and RGB samples. Let's read the keys in NumPy archives that contain our samples.

In [47]:
tmp_f = pos_fp_list[0]
print(list(np.load(tmp_f)))

['one_hot', 'frames', 'frame_shift', 'tpa_avg_timestamps', 'tpa_rgb_avg_timestamps', 'pad_first', 'pad_last', 'repeating_frames', 'array_ID121', 'array_ID122', 'array_ID123', 'timestamps_ID121', 'timestamps_ID122', 'timestamps_ID123', 'array_IDRGB', 'timestamps_IDRGB']


A sample in the project will have the following keys:

* `'one_hot'` < pos/neg one-hot-encoded label.
* `'frames'` < as in array\[frame_shift:frames+frame_shift\] that was used to extract patches.
* `'frame_shift'` < as in array\[frame_shift:frames+frame_shift\] that was used to extract patches.
* `'tpa_avg_timestamps'` < TPA timestamps averaged (per timestep).
* `'tpa_rgb_avg_timestamps'` < TPA+RGB timestamps averaged (per timestep).
* `'pad_first'` < Number of frames repeated at the beginning of the sequence (still frames).
* `'pad_last'` < Number of frames repeated at the end of the sequence (still frames).
* `'repeating_frames'` < True if repeating frames enabled.
* `'array_ID121'` < TPA sequence from view with ID 121.
* `'array_ID122'` < As above.
* `'array_ID123'` < As above.
* `'timestamps_ID121'` < Timestamps of TPA sequence from view with ID 121.
* `'timestamps_ID122'` < As above.
* `'timestamps_ID123'` < As above.
* `'array_IDRGB'` < RGB sequence.
* `'timestamps_IDRGB'` < Timestamps of RGB sequence.


Now let's load in some samples.

In [48]:
pos_samples = [np.load(f) for f in pos_fp_list]
neg_samples = [np.load(f) for f in neg_fp_list]
print(pos_samples[0])
del(pos_samples, neg_samples, pos_fp_list, neg_fp_list)

<numpy.lib.npyio.NpzFile object at 0x7f04cae1b898>


In [49]:
def samples_from_fp_list(fp_list):
    return [np.load(f) for f in fp_list]

In [50]:
pos_fp_list, neg_fp_list = get_pos_neg_fp_lists(DATASET_NAME)
pos_samples = samples_from_fp_list(pos_fp_list)
neg_samples = samples_from_fp_list(neg_fp_list)
print(pos_samples[0])

<numpy.lib.npyio.NpzFile object at 0x7f04cae06588>


In [51]:
sample = pos_samples[0]
tpa1 = sample['array_ID121']
print(tpa1.shape, tpa1.dtype)
tpa2 = sample['array_ID122']
print(tpa2.shape, tpa2.dtype)
tpa3 = sample['array_ID123']
print(tpa3.shape, tpa3.dtype)
rgb = sample['array_IDRGB']
print(rgb.shape, rgb.dtype)

(100, 32, 32) float16
(100, 32, 32) float16
(100, 32, 32) float16
(100, 299, 299, 3) uint8


TPA arrays in the intermediate state need expanding dimensions by one so that the last dimension is a channel instead of spatial dimension.

In [52]:
sample = pos_samples[0]
tpa1 = np.expand_dims(sample['array_ID121'],axis=-1)
print(tpa1.shape, tpa1.dtype)
tpa2 = np.expand_dims(sample['array_ID122'],axis=-1)
print(tpa2.shape, tpa2.dtype)
tpa3 = np.expand_dims(sample['array_ID123'],axis=-1)
print(tpa3.shape, tpa3.dtype)
rgb = sample['array_IDRGB']
print(rgb.shape, rgb.dtype)
del(tpa1, tpa2, tpa3, rgb)

(100, 32, 32, 1) float16
(100, 32, 32, 1) float16
(100, 32, 32, 1) float16
(100, 299, 299, 3) uint8


We will need to convert data to appropriate float-type as well as normalize it. Now, we will also handle reading the label and timestamps.

In [53]:
def read_tpa123_from_npz(npz_sample, dtype=np.float32):
    ids = ('121', '122', '123')
    return [np.expand_dims(npz_sample['array_ID{}'.format(id)],axis=-1).astype(dtype) for id in ids]

def read_rgb_from_npz(npz_sample):
    return npz_sample['array_IDRGB']

In [54]:
sample = pos_samples[0]
print('Label ',sample['one_hot'])
tpa1, tpa2, tpa3, rgb = *read_tpa123_from_npz(sample), read_rgb_from_npz(sample)
print(tpa1.shape, tpa1.dtype)
print(tpa2.shape, tpa2.dtype)
print(tpa3.shape, tpa3.dtype)
print(rgb.shape, rgb.dtype)
print('{} averaged TPA timestamps'.format(len(sample['tpa_avg_timestamps'])))
print('{} averaged TPA+RGB timestamps'.format(len(sample['tpa_rgb_avg_timestamps'])))

Label  [0 1]
(100, 32, 32, 1) float32
(100, 32, 32, 1) float32
(100, 32, 32, 1) float32
(100, 299, 299, 3) uint8
100 averaged TPA timestamps
100 averaged TPA+RGB timestamps


# Normalization
Normalize as in [20200507-im-normalization-tanh.ipynb](20200507-im-normalization-tanh.ipynb)

In [55]:
a = 1e-1
b = 30
scale = 50

def normalize_TPA(array, a, b, scale=1):
    return scale*np.tanh(a*(array-b))

E.g.: 

In [56]:
tmp = np.arange(18, 36, 2)
print('Raw')
print(tmp)
print('Normalized')
tmp = normalize_TPA(tmp, a, b, scale)
print(np.round(tmp, 2))

Raw
[18 20 22 24 26 28 30 32 34]
Normalized
[-41.68 -38.08 -33.2  -26.85 -19.    -9.87   0.     9.87  19.  ]


In [57]:
print("Mean:")
print(np.round(tpa1.mean(), 2), '>>', np.round(normalize_TPA(tpa1, a, b, scale).mean(), 2))
print(np.round(tpa2.mean(), 2), '>>', np.round(normalize_TPA(tpa2, a, b, scale).mean(), 2))
print(np.round(tpa3.mean(), 2), '>>', np.round(normalize_TPA(tpa3, a, b, scale).mean(), 2))
print("Deviation:")
print(np.round(tpa1.std(), 2), '>>', np.round(normalize_TPA(tpa1, a, b, scale).std(), 2))
print(np.round(tpa2.std(), 2), '>>', np.round(normalize_TPA(tpa2, a, b, scale).std(), 2))
print(np.round(tpa3.std(), 2), '>>', np.round(normalize_TPA(tpa3, a, b, scale).std(), 2))

Mean:
29.77 >> -1.17
29.77 >> -1.15
29.8 >> -1.01
Deviation:
0.19 >> 0.96
0.16 >> 0.81
0.18 >> 0.89


## It will be probably safer to perform standarization. :)

In [58]:
def get_TPA_mean_and_std(sample_list):
    data = read_tpa123_from_npz(sample_list[0], dtype=np.float16)[0][0][0][0][0]
    for sample in sample_list:
        tpa1, tpa2, tpa3 = read_tpa123_from_npz(sample, dtype=np.float16)
        tpas = np.concatenate([tpa1, tpa2, tpa3]).flatten()
        data = np.append(data, tpas)
    data = data.astype(np.float32)
    return data.mean(), data.std()

def standarize_TPA(array, mean, std):
    return (array - mean) / std

TPA_mean, TPA_std = get_TPA_mean_and_std(pos_samples + neg_samples)

print("Mean:")
print(np.round(tpa1.mean(), 2), '>>', np.round(standarize_TPA(tpa1, TPA_mean, TPA_std).mean(), 2))
print(np.round(tpa2.mean(), 2), '>>', np.round(standarize_TPA(tpa2, TPA_mean, TPA_std).mean(), 2))
print(np.round(tpa3.mean(), 2), '>>', np.round(standarize_TPA(tpa3, TPA_mean, TPA_std).mean(), 2))
print("Deviation:")
print(np.round(tpa1.std(), 2), '>>', np.round(standarize_TPA(tpa1, TPA_mean, TPA_std).std(), 2))
print(np.round(tpa2.std(), 2), '>>', np.round(standarize_TPA(tpa2, TPA_mean, TPA_std).std(), 2))
print(np.round(tpa3.std(), 2), '>>', np.round(standarize_TPA(tpa3, TPA_mean, TPA_std).std(), 2))

Mean:
29.77 >> 0.02
29.77 >> 0.04
29.8 >> 0.2
Deviation:
0.19 >> 1.05
0.16 >> 0.89
0.18 >> 0.97


We also want to perform normalization on RGB:

In [59]:
def standarize_RGB(rgb_sequence):
    return (rgb_sequence - rgb_sequence.mean()) / rgb_sequence.std()

In [60]:
print("Mean:")
print(np.round(rgb.mean(), 2), '>>', np.round(standarize_RGB(rgb).mean(), 2))
print("Deviation:")
print(np.round(rgb.std(), 2), '>>', np.round(standarize_RGB(rgb).std(), 2))

Mean:
125.59 >> -0.0
Deviation:
66.21 >> 1.0


To reduce the number of computations performed we probably want to perform these on RGB sequence resized to desired input shape (we assume that it's cropped to center when converting raw >> intermediate)

# Batch maker

Now we are ready to move from 02_intermediate to 03_processed that will be ready for training.

In [127]:
### BATCH MAKER START ###
# 1. Inputs
development_subjects =  ["subject1", "subject2"]
pos_fp_list, neg_fp_list = get_pos_neg_fp_lists(DATASET_NAME)
_, name = os.path.split(DATASET_NAME)
batch_maker_dest = os.path.join(DEST, name, "development")
ensure_path_exists(batch_maker_dest)
# 2. Leave only samples from the development subjects list!
pos_fp_list = filter_subjects_from_fp_list(pos_fp_list, development_subjects)
neg_fp_list = filter_subjects_from_fp_list(neg_fp_list, development_subjects)
# 3. Load in npz
pos_samples = samples_from_fp_list(pos_fp_list)
neg_samples = samples_from_fp_list(neg_fp_list)
# 4. Get mean and std for TPA normalization, RGB will be normalized per sequence.
print_if_verbose('Getting mean and std of TPA samples...')
TPA_mean, TPA_std = get_TPA_mean_and_std(pos_samples + neg_samples)
print_if_verbose('TPA mean: {:.2f} \nTPA std: {:.2f}'.format(TPA_mean, TPA_std))
# 5. Main loop
print_if_verbose('Processing and saving {} samples...'.format(len(pos_fp_list + neg_fp_list)))
for fp in pos_fp_list + neg_fp_list:
    print_if_verbose("Reading {}".format(fp))
    sample = np.load(fp)
    head, tail = os.path.split(fp)
    tpa1, tpa2, tpa3 = read_tpa123_from_npz(sample)
    rgb = standarize_RGB(read_rgb_from_npz(sample))
    #5A. TPA standarization according to pt. 4
    tpa1, tpa2, tpa3 = [standarize_TPA(t, TPA_mean, TPA_std) for t in [tpa1, tpa2, tpa3]]
    sample_keys = list(sample.keys())
    dict2save = {'array_ID121' : tpa1, 'array_ID122': tpa2, 'array_ID123' : tpa3, 'array_IDRGB' : rgb}
    for key in sample_keys: 
        if key not in dict2save.keys():
            dict2save[key] = sample[key]
    output = os.path.join(batch_maker_dest, str(int(np.argmax(sample['one_hot']))), tail)
    ensure_parent_exists(output)
    print_if_verbose("Writing {}".format(output))
    np.savez_compressed(output, dict2save)
print_if_verbose('Processed and saved {} samples.'.format(len(pos_fp_list + neg_fp_list)))
### BATCH MAKER END ###

Getting mean and std of TPA samples...
TPA mean: 29.76 
TPA std: 0.18
Processing and saving 28 samples...
Reading /media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1547_.npz
Writing ../../data/03_processed/20200508_aligned_labeled_npz_f100_fs0/development/1/20200508_1547_.npz
Reading /media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1552_.npz
Writing ../../data/03_processed/20200508_aligned_labeled_npz_f100_fs0/development/1/20200508_1552_.npz
Reading /media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1553_.npz
Writing ../../data/03_processed/20200508_aligned_labeled_npz_f100_fs0/development/1/20200508_1553_.npz
Reading /media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/subject2/1/20200508_1556_.npz
Writing ../../data/03_processed/20200508_aligned_labeled_npz_f100_fs0/development/1/20200508_1556_.npz
Reading /media/igor/DATA/PIPELINE/20200508_aligned_labeled_npz_f100_fs0/su

# Conclusion

In [126]:
def get_pos_neg_fp_lists(dataset_name):
    pos_fp_list = glob.glob(os.path.join("..",  "..",  "data", "02_intermediate", dataset_name, "*", "1", "*.npz"))
    neg_fp_list = glob.glob(os.path.join("..", "..",  "data", "02_intermediate", dataset_name, "*", "0", "*.npz"))
    return pos_fp_list, neg_fp_list

def samples_from_fp_list(fp_list):
    return [np.load(f) for f in fp_list]

def read_tpa123_from_npz(npz_sample, dtype=np.float32):
    ids = ('121', '122', '123')
    return [np.expand_dims(npz_sample['array_ID{}'.format(id)],axis=-1).astype(dtype) for id in ids]

def read_rgb_from_npz(npz_sample):
    return npz_sample['array_IDRGB']

a = 1e-1
b = 30

def normalize_TPA(array, a, b, scale=1):
    return scale*np.tanh(a*(array-b))

def standarize_RGB(rgb_sequence):
    return (rgb_sequence - rgb_sequence.mean()) / rgb_sequence.std()

def _split_path_into_components(fp):
    path = os.path.normpath(fp)
    return path.split(os.sep)

def filter_subjects_from_fp_list(fp_list, target_subjects):
    result = []
    for fp in fp_list:
        fp_s = _split_path_into_components(fp)
        subj_in = []
        for subj in target_subjects:
            if subj in fp_s:
                subj_in.append(True)
            else:
                subj_in.append(False)
        if any(subj_in):
            result.append(fp)
    return result

def print_if_verbose(msg):
    if VERBOSE:
        print(msg)
        
def ensure_path_exists(path):
    if not os.path.exists(path):
        os.makedirs(path)
        
def ensure_parent_exists(path):
    ensure_path_exists(os.path.dirname(path))
