In [1]:
import os
import numpy as np
import random
from sklearn.model_selection import train_test_split
from pathlib import PurePath
from shutil import copy, rmtree

In [2]:
def Data2IQ(filepath):
    """Load an interleaved I/Q recording and patch obviously bad samples.

    Ref: https://github.com/dhruboroy29/MATLAB_Scripts/blob/6ded2d8d434239e3488ee79e02232dd7afff908c/Scripts/Data2IQ.m

    The file is read as a flat stream of unsigned 16-bit values; even
    positions form the I stream and odd positions form the Q stream.

    Parameters
    ----------
    filepath : str
        Path to a file whose extension is '.data' or '.bbs'.

    Returns
    -------
    tuple
        ``(I, Q, n)`` where ``I`` and ``Q`` are uint16 arrays (views into the
        same buffer read from disk) and ``n`` is ``len(I)``.

    Raises
    ------
    AssertionError
        If the extension is not accepted, or if the I and Q streams differ in
        length; in the latter case ``filepath`` and ``42`` are appended to the
        exception args.
    """
    extension = os.path.splitext(filepath)[1]
    assert extension in ('.data', '.bbs')

    raw = np.fromfile(filepath, dtype=np.uint16)
    I, Q = raw[0::2], raw[1::2]
    try:
        assert len(I) == len(Q)
    except AssertionError as err:
        # Attach the offending path so the failure can be traced to a file.
        err.args += (filepath, 42)
        raise

    def _is_glitch(stream, k):
        # A sample is bad when it exceeds 4096, or when it jumps by more than
        # 2000 from its predecessor AND by more than 1500 from its successor.
        # Values are widened to Python ints so the differences cannot wrap.
        here = int(stream[k])
        if here > 4096:
            return True
        jump_back = abs(here - int(stream[k - 1])) > 2000
        jump_ahead = abs(here - int(stream[k + 1])) > 1500
        return jump_back and jump_ahead

    # The first and last samples are never touched. The scan must stay
    # sequential: a repaired sample is what the next position compares against.
    for stream in (I, Q):
        for k in range(1, len(I) - 1):
            if _is_glitch(stream, k):
                stream[k] = stream[k - 1]

    return I, Q, len(I)

In [3]:
def get_time_series_data(fileloc, data_dir, label, win_len, fraction=None):
    """Load every recording under the given directories and cut it into windows.

    Parameters
    ----------
    fileloc : str
        Root directory that the entries of ``data_dir`` are joined onto.
    data_dir : iterable of str
        Sub-directories (relative to ``fileloc``) whose entries are all loaded
        with ``Data2IQ``.
    label : int
        Class label recorded for every file found.
    win_len : int
        Number of I/Q sample pairs per window. Trailing samples that do not
        fill a whole window are dropped.
    fraction : float, optional
        When truthy, only a random subset of ``int(fraction * n_files)`` files
        is kept, drawn with ``random.sample`` (seed the ``random`` module
        beforehand for a reproducible subset). ``None`` or ``0`` keeps all
        files.

    Returns
    -------
    tuple of lists
        ``(filenames, data, labels, seqs)``, index-aligned. ``data[i]`` is a
        uint16 array of shape ``(n_windows, 2 * win_len)`` with I samples in
        the even columns and Q samples in the odd columns; ``seqs[i]`` is
        ``n_windows`` for that file.
    """
    # Collect paths in directory order (os.listdir order within a directory).
    filenames = [
        os.path.join(fileloc, subdir, entry)
        for subdir in data_dir
        for entry in os.listdir(os.path.join(fileloc, subdir))
    ]

    # Draw the subset BEFORE touching the disk so discarded files are never
    # read or windowed. The kept files come back in the sampled order.
    if fraction:
        chosen = random.sample(list(range(len(filenames))),
                               k=int(fraction * len(filenames)))
        filenames = [filenames[i] for i in chosen]

    data = []
    labels = []
    seqs = []

    for path in filenames:
        I, Q, L = Data2IQ(path)

        # Non-overlapping windows starting at 0, win_len, 2*win_len, ...
        n_windows = len(range(0, L - win_len + 1, win_len))
        usable = n_windows * win_len

        data_cut = np.zeros((n_windows, 2 * win_len), dtype=np.uint16)
        # Re-interleave per window: even columns <- I, odd columns <- Q.
        data_cut[:, ::2] = I[:usable].reshape(n_windows, win_len)
        data_cut[:, 1::2] = Q[:usable].reshape(n_windows, win_len)

        data.append(data_cut)
        labels.append(label)
        seqs.append(n_windows)

    return filenames, data, labels, seqs

In [4]:
# --- Configuration -----------------------------------------------------------
window = 256                      # I/Q sample pairs per window
fileloc = '/scratch/sk7898/pedbike/'

add_humans = None
add_non_humans = ['downstream/final_bike_radial_full_cuts']
classes = ['Human', 'Nonhuman']
data_labels = [0, 1]              # index into `classes` / `data_dirs`
add_fraction = 0.1                # share of the extra non-human files to keep
random_seed = 42                  # makes the random subset below repeatable

#Example filenames in stft:
#Human_fft_8_win_14_label_0.data
#Nonhuman_fft_96_win_24_label_1.data
filenames = []
data = []
labels = []
seqs = []
outdir = fileloc

humans_path = ['upstream/Targets/arc_1 (Humans_Gym balls)/Human/',
               'upstream/Targets/bv_4 (Humans_Cars)/Human/',
               'upstream/Targets/ceiling_238_10 (Humans_Gym balls)/Human/',
               'upstream/Targets/combined_5 (Humans_Dogs)/11-30-2011/Human/',
               'upstream/Targets/combined_5 (Humans_Dogs)/Human/',
               'upstream/Targets/kh_3 (Humans_Gym balls)/Human/',
               'upstream/Targets/prb_2 (Humans_Gym balls)/Human/',
               'upstream/Targets/Parking garage orthogonal (Humans)/',
               'upstream/Targets/Parking garage radial (Humans)/']

non_humans_path = ['upstream/Targets/arc_1 (Humans_Gym balls)/Ball/',
                   'upstream/Targets/bv_4 (Humans_Cars)/Car/',
                   'upstream/Targets/ceiling_238_10 (Humans_Gym balls)/Ball/',
                   'upstream/Targets/combined_5 (Humans_Dogs)/Dog/',
                   'upstream/Targets/kh_3 (Humans_Gym balls)/Dog/',
                   'upstream/Targets/prb_2 (Humans_Gym balls)/Dog/',
                   'upstream/Targets/osu_farm_meadow_may24-28_2016_subset_113 (Cattle)/',
                   'upstream/Targets/Radar_site1_hilltop (Cattle)/',
                   'upstream/Targets/Radar_site2_creamery_subset_113 (Cattle)/']

# Position in this list is the class label (0 = Human, 1 = Nonhuman).
data_dirs = [humans_path, non_humans_path]

# get_time_series_data draws its subset with random.sample; without a seed the
# chosen files (and therefore the train/val split) change on every run.
random.seed(random_seed)

# (directories, label, fraction): the full upstream set for each class first,
# then a random fraction of the additional non-human recordings.
load_plan = [(data_dirs[label], label, None) for label in data_labels]
load_plan.append((add_non_humans, 1, add_fraction))

for dirs, label, keep_fraction in load_plan:
    f, d, l, s = get_time_series_data(fileloc, dirs, label, window, fraction=keep_fraction)
    filenames += f
    data += d
    labels += l
    seqs += s

In [5]:
# Hold out a fixed share of the files for validation.
val_split = 0.2

# Split the positional indices alongside the data so that the file names
# belonging to each side can be recovered afterwards.
indices = np.arange(len(filenames))
split_parts = train_test_split(
    data, labels, indices, seqs,
    test_size=val_split,
    random_state=42,
)
(X_train, X_val,
 y_train, y_val,
 indices_train, indices_val,
 seqs_train, seqs_val) = split_parts

files_train = [filenames[idx] for idx in indices_train]
files_val = [filenames[idx] for idx in indices_val]

In [6]:
#Split the original data into train and val
outdir = '/scratch/sk7898/pedbike/upstream'
prefix = 'upstream'

train_dir = os.path.join(outdir, prefix + "_train")
val_dir = os.path.join(outdir, prefix + "_val")

# One sub-directory per class under each split directory.
for cls in classes:
    os.makedirs(os.path.join(train_dir, cls), exist_ok=True)
    os.makedirs(os.path.join(val_dir, cls), exist_ok=True)


def _copy_split(files, split_labels, dest_dir):
    """Copy each source file into ``dest_dir/<class>/`` under a numbered name.

    ``files`` and ``split_labels`` must be index-aligned. Label 1 maps to the
    'Nonhuman' directory, anything else to 'Human'. The copy is named
    ``<class>_time_<n>_label_<label>.data`` with ``n`` counting from 1 within
    the split.
    """
    for count, (src, lbl) in enumerate(zip(files, split_labels), start=1):
        cur_class = 'Nonhuman' if lbl == 1 else 'Human'
        fname = cur_class + '_time_' + str(count) + '_label_' + str(lbl) + '.data'
        copy(src, os.path.join(dest_dir, cur_class, fname))


# Each split is numbered and labelled from its OWN label list (y_train / y_val),
# which train_test_split returned index-aligned with files_train / files_val.
_copy_split(files_train, y_train, train_dir)
_copy_split(files_val, y_val, val_dir)

In [7]:
#Save the time series data for train and val
outdir = '/scratch/sk7898/pedbike/window_256/'
path_prefix = 'upstream_time'
save_dir = os.path.join(outdir, path_prefix)
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): X_train / X_val are lists of per-file window arrays. If files
# produce different window counts, np.save has to build a ragged array, which
# newer NumPy releases may refuse -- confirm against the installed version.
outputs = [
    # Train data
    ("train.npy", X_train),
    ("train_seqs.npy", seqs_train),
    ("train_lbls.npy", y_train),
    # Validation data
    ("val.npy", X_val),
    ("val_seqs.npy", seqs_val),
    ("val_lbls.npy", y_val),
]

for out_name, payload in outputs:
    np.save(os.path.join(save_dir, out_name), payload)