In [1]:
import os
import numpy as np
import random
from sklearn.model_selection import train_test_split
from pathlib import PurePath
from shutil import copy

In [2]:
def Data2IQ(filepath):
    """Load interleaved I/Q samples from a ``.data``/``.bbs`` file and de-glitch them.

    Ref: https://github.com/dhruboroy29/MATLAB_Scripts/blob/6ded2d8d434239e3488ee79e02232dd7afff908c/Scripts/Data2IQ.m

    The file is read as a flat stream of unsigned 16-bit integers; values at
    even positions form the I stream and values at odd positions form the Q
    stream.

    Parameters
    ----------
    filepath : str
        Path of the file to read. Its extension must be ``.data`` or ``.bbs``.

    Returns
    -------
    tuple
        ``(I, Q, n)`` where ``I`` and ``Q`` are the cleaned uint16 streams
        (strided views of the same buffer) and ``n`` is ``len(I)``.

    Raises
    ------
    AssertionError
        If the extension is not accepted, or if the I and Q streams differ in
        length (odd number of values); in the latter case ``filepath`` and
        ``42`` are appended to the exception args.
    """
    extension = os.path.splitext(filepath)[1]
    assert extension in ('.data', '.bbs')

    raw = np.fromfile(filepath, dtype=np.uint16)
    I, Q = raw[0::2], raw[1::2]
    n_samples = len(I)
    try:
        assert n_samples == len(Q)
    except AssertionError as err:
        err.args += (filepath, 42)
        raise

    def _is_glitch(stream, idx):
        # A sample is suspicious when it exceeds 4096, or when it jumps by
        # more than 2000 from the previous sample AND by more than 1500 from
        # the next one. Cast to int first: uint16 subtraction would wrap.
        cur = int(stream[idx])
        if cur > 4096:
            return True
        jump_from_prev = abs(cur - int(stream[idx - 1]))
        jump_to_next = abs(cur - int(stream[idx + 1]))
        return jump_from_prev > 2000 and jump_to_next > 1500

    # The first and last samples are never examined. The pass is sequential on
    # purpose: a repaired sample becomes the "previous" value for the next one.
    for idx in range(1, n_samples - 1):
        for stream in (I, Q):
            if _is_glitch(stream, idx):
                stream[idx] = stream[idx - 1]

    return I, Q, n_samples

In [5]:
def get_time_series_data(fileloc, data_dir, win_len, fraction=None):
    """Load every recording under the given directories and cut it into windows.

    Parameters
    ----------
    fileloc : str
        Root directory that the entries of ``data_dir`` are relative to.
    data_dir : iterable of str
        Sub-directories of ``fileloc``; every entry found in each of them is
        treated as a recording and passed to ``Data2IQ``.
    win_len : int
        Number of I/Q sample pairs per window. Must be positive.
    fraction : float, optional
        If truthy, only a random subset of ``int(fraction * n_files)`` files
        (drawn with ``random.sample``) is loaded and returned. ``None`` or
        ``0`` keeps every file.

    Returns
    -------
    filenames : list of str
        Full paths of the recordings that were loaded.
    data : list of numpy.ndarray
        One uint16 array per recording with shape ``(n_windows, 2 * win_len)``;
        even columns hold I samples and odd columns hold Q samples.
        Trailing samples that do not fill a whole window are dropped.
    labels : list of int
        For each file, the integer that precedes the first ``'p'`` in the
        second-to-last ``'_'``-separated token of its basename.
    seqs : list of int
        Number of windows obtained from each recording.

    Raises
    ------
    ValueError
        If ``win_len`` is not positive, or if a filename token cannot be
        parsed as an integer label.
    """
    if win_len <= 0:
        raise ValueError("win_len must be a positive integer, got %r" % (win_len,))

    filenames = []
    for filestr in data_dir:
        class_dir = os.path.join(fileloc, filestr)
        # os.listdir returns entries in arbitrary order; sorting keeps the file
        # order (and therefore any seeded split performed by the caller)
        # reproducible across machines and runs.
        filenames.extend(os.path.join(class_dir, name)
                         for name in sorted(os.listdir(class_dir)))

    if fraction:
        # Sub-sample before loading so that discarded files are never read.
        subset_indices = random.sample(range(len(filenames)),
                                       k=int(fraction * len(filenames)))
        filenames = [filenames[i] for i in subset_indices]

    labels = [int(os.path.basename(fname).split('_')[-2].split('p')[0])
              for fname in filenames]

    data = []
    seqs = []
    for fname in filenames:
        I, Q, L = Data2IQ(fname)

        # Non-overlapping windows: floor(L / win_len) of them fit.
        n_windows = L // win_len
        used = n_windows * win_len

        data_cut = np.empty((n_windows, 2 * win_len), dtype=np.uint16)
        data_cut[:, ::2] = I[:used].reshape(n_windows, win_len)
        data_cut[:, 1::2] = Q[:used].reshape(n_windows, win_len)

        seqs.append(n_windows)
        data.append(data_cut)

    return filenames, data, labels, seqs

In [6]:
# --- Configuration -----------------------------------------------------------
fileloc = '/scratch/sk7898/pedbike'
window = 256  # I/Q sample pairs per window

classes = ['Human', 'Bike']
data_labels = [0, 1]  # positions into data_dirs, one per class

# Directories (relative to fileloc) that hold the recordings of each class.
humans_path = ['downstream/final_human_radial_full_cuts']
bikes_path = ['downstream/final_bike_radial_full_cuts']
data_dirs = [humans_path, bikes_path]

# Fractions passed as test_size to the two train_test_split calls further down.
val_split = 0.1
test_split = 0.1

# --- Load every class and concatenate the per-class results ------------------
# NOTE(review): `labels` holds the integer parsed from each filename by
# get_time_series_data, not the class position from data_labels - confirm
# that this is the intended target.
filenames, data, labels, seqs = [], [], [], []

for class_idx in data_labels:
    class_files, class_data, class_labels, class_seqs = get_time_series_data(
        fileloc, data_dirs[class_idx], window
    )
    filenames.extend(class_files)
    data.extend(class_data)
    labels.extend(class_labels)
    seqs.extend(class_seqs)

In [7]:
outdir = fileloc
path_prefix = 'window_256'

# Positions into `filenames`; split alongside the data so that the files
# belonging to each subset can be recovered afterwards.
indices = np.arange(len(filenames))

# First split: hold out `val_split` of all recordings for validation.
(X_train, X_val,
 y_train, y_val,
 indices_train, indices_val,
 seqs_train, seqs_val) = train_test_split(
    data, labels, indices, seqs,
    test_size=val_split,
    random_state=42,
)

# Second split: hold out `test_split` of what is left after the validation
# split (so the test share is relative to the remainder, not the full set).
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test,
 seqs_train, seqs_test) = train_test_split(
    X_train, y_train, indices_train, seqs_train,
    test_size=test_split,
    random_state=42,
)

# Map the split positions back to file paths.
files_train = [filenames[idx] for idx in indices_train]
files_val = [filenames[idx] for idx in indices_val]
files_test = [filenames[idx] for idx in indices_test]

In [12]:
# Each subset must carry exactly one label and one window count per recording.
assert len(y_train) == len(X_train) and len(seqs_train) == len(X_train)
assert len(y_test) == len(X_test) and len(seqs_test) == len(X_test)
assert len(y_val) == len(X_val) and len(seqs_val) == len(X_val)

In [8]:
# Copy the original recordings into per-split, per-class directories.
# `path_prefix` is read from the split cell ('window_256'); the save cell
# further down rebinds it, so this cell must run before that one.
outdir = '/scratch/sk7898/pedbike/downstream'
prefix = 'downstream'

train_dir = os.path.join(outdir, path_prefix, prefix + "_train")
val_dir = os.path.join(outdir, path_prefix, prefix + "_val")
test_dir = os.path.join(outdir, path_prefix, prefix + "_test")

# (destination root, files that belong there), in train / val / test order.
split_targets = (
    (train_dir, files_train),
    (val_dir, files_val),
    (test_dir, files_test),
)

# Create <split>/<class> for every combination up front.
for cls in classes:
    for split_root, _ in split_targets:
        os.makedirs(os.path.join(split_root, cls), exist_ok=True)

# The class directory is taken from the 4th '_'-separated token of the name.
for split_root, split_files in split_targets:
    for src_path in split_files:
        cur_class = PurePath(src_path).name.split('_')[3]
        copy(src_path, os.path.join(split_root, cur_class))

In [9]:
# Save the time series data for train, val and test.
# For each split three files are written under <outdir>/<path_prefix>/:
#   <split>.npy       windowed I/Q data
#   <split>_seqs.npy  number of windows per recording
#   <split>_lbls.npy  labels
outdir = '/scratch/sk7898/pedbike/window_256/'
path_prefix = 'downstream_time'

save_dir = os.path.join(outdir, path_prefix)
os.makedirs(save_dir, exist_ok=True)

# A single loop over (name, data, seqs, labels) guarantees that each file is
# written from the split it is named after; in particular the test files are
# written from X_test / seqs_test / y_test.
for split_name, split_X, split_seqs, split_y in (
    ("train", X_train, seqs_train, y_train),
    ("val", X_val, seqs_val, y_val),
    ("test", X_test, seqs_test, y_test),
):
    np.save(os.path.join(save_dir, split_name + ".npy"), split_X)
    np.save(os.path.join(save_dir, split_name + "_seqs.npy"), split_seqs)
    np.save(os.path.join(save_dir, split_name + "_lbls.npy"), split_y)

In [10]:
# Report how many recordings ended up in each subset.
for caption, members in (
    ('Total Dataset Size:', filenames),
    ('Train Dataset Size:', files_train),
    ('Validation Dataset Size:', files_val),
    ('Test Dataset Size:', files_test),
):
    print(caption, len(members))

Total Dataset Size: 955
Train Dataset Size: 773
Validation Dataset Size: 96
Test Dataset Size: 86


**Train**

*Humans: 374*
*Bikes: 398*
    
**Validation**

*Humans: 45*
*Bikes: 51*
    
**Test**

*Humans: 45*
*Bikes: 41*

In [8]:
#Rename the files
# fileloc = '/scratch/sk7898/pedbike/window_256/upstream'
# filestrs = ['Human/human_cuts_stft', 'Nonhuman/non_human_cuts_stft']
# classes = ['human', 'non_human']
# new_classes = ['Human', 'Nonhuman']

# for filestr, cur_class, new_class in zip(filestrs, classes, new_classes):
#     for f in os.listdir(os.path.join(fileloc, filestr)):
#         old_file = os.path.join(fileloc, filestr, f)
#         new_file = os.path.join(fileloc, filestr, os.path.basename(f).replace(cur_class, new_class))
#         os.rename(old_file, new_file)