In [1]:
import os
import random
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras import utils
from sklearn.model_selection import train_test_split
from keras.models import Model, Sequential
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers.normalization import BatchNormalization
from keras.layers import *

Using TensorFlow backend.


In [None]:
def get_data(fileloc):
    """Load the train/validation arrays stored as ``.npy`` files in ``fileloc``.

    Args:
        fileloc: Directory containing ``train.npy``, ``val.npy``,
            ``train_lbls.npy``, ``val_lbls.npy``, ``train_seqs.npy`` and
            ``val_seqs.npy``.

    Returns:
        The tuple ``(x_train, x_val, y_train, y_val, seqs_train, seqs_val)``,
        each element being whatever ``np.load`` returns for the matching file.
    """
    def _load(file_name):
        return np.load(os.path.join(fileloc, file_name))

    # Order matters: it is the order in which callers unpack the result.
    file_names = (
        "train.npy", "val.npy",
        "train_lbls.npy", "val_lbls.npy",
        "train_seqs.npy", "val_seqs.npy",
    )
    return tuple(_load(file_name) for file_name in file_names)

In [None]:
def histedges_equalN(seq_lengths, n_bins):
    """Compute ``n_bins + 1`` bin edges from the ranks of ``seq_lengths``.

    The sorted lengths are linearly interpolated at ``n_bins + 1`` evenly
    spaced rank positions between 0 and ``len(seq_lengths)``. Positions past
    the last rank are clamped by ``np.interp`` to the largest length.

    Args:
        seq_lengths: Sequence of numeric lengths.
        n_bins: Number of bins the edges should delimit.

    Returns:
        1-D float array holding the ``n_bins + 1`` edges.
    """
    count = len(seq_lengths)
    ordered_lengths = np.sort(seq_lengths)
    rank_positions = np.arange(count)
    edge_positions = np.linspace(0, count, n_bins + 1)
    return np.interp(edge_positions, rank_positions, ordered_lengths)

def element_to_bucket_id(x, buckets_min, buckets_max):
    """Return the index of the first bucket whose range contains ``len(x)``.

    A bucket ``i`` matches when
    ``buckets_min[i] <= x.shape[0] < buckets_max[i]``.

    Args:
        x: Array whose first-axis size is the sequence length.
        buckets_min: Inclusive lower edge of every bucket.
        buckets_max: Exclusive upper edge of every bucket.

    Returns:
        The smallest matching bucket index (a NumPy integer).

    Raises:
        ValueError: From the ``min`` reduction when no bucket matches.
    """
    n_steps = x.shape[0]
    at_or_above_lower = np.asarray(buckets_min) <= n_steps
    below_upper = n_steps < np.asarray(buckets_max)
    matching = np.nonzero(at_or_above_lower & below_upper)
    return np.min(matching)

def pad_sequence(x, max_len=None, padding_value=0):
    """Pad ``x`` along its first axis up to ``max_len`` rows.

    Fixes over the previous version: ``padding_value`` is now honoured (it
    was silently ignored and zeros were always used), the trailing feature
    shape is taken from ``x`` instead of being hard-coded to 512, and the
    default ``max_len=None`` no longer crashes.

    Args:
        x: Array-like whose first axis is the sequence axis; all trailing
            axes are preserved unchanged.
        max_len: Number of rows in the result. ``None`` means "no padding":
            the result has as many rows as ``x``.
        padding_value: Scalar written into the rows appended after the
            original data. ``None`` is treated as 0.

    Returns:
        A new ``float64`` array of shape ``(max_len,) + x.shape[1:]`` whose
        first ``x.shape[0]`` rows are a copy of ``x``.

    Raises:
        ValueError: If ``x`` has more rows than ``max_len``.
    """
    x = np.asarray(x)
    orig_length = x.shape[0]
    if max_len is None:
        max_len = orig_length
    if orig_length > max_len:
        raise ValueError(
            "sequence of length %d does not fit into max_len=%d" % (orig_length, max_len)
        )

    # Callers may forward an unset (None) padding value; fall back to zeros.
    fill_value = 0 if padding_value is None else padding_value
    new_x = np.full((max_len,) + x.shape[1:], fill_value, dtype=np.float64)
    new_x[:orig_length] = x
    return new_x
    
class BucketedBatch(keras.utils.Sequence):
    """Keras ``Sequence`` that serves one length-bucket of samples per batch.

    Samples are grouped by sequence length (``x.shape[0]``) into buckets whose
    edges come from ``histedges_equalN(seq_lengths, n_bins)``. Each call to
    ``__getitem__`` returns every sample of one bucket. Bucket order and the
    order of samples inside each bucket are reshuffled at construction and at
    the end of every epoch.

    Attributes:
        n_bins: Number of buckets that was requested.
        buckets: Dict mapping bucket id to ``{'x': ..., 'y': ...}``. Only
            buckets that received at least one sample are present.
        b_ids: Current shuffled order of the populated bucket ids.
    """

    def __init__(self, n_bins, data, labels, seq_lengths, padding=None, padding_value=None):
        """Assign every sample to a length bucket and shuffle.

        Args:
            n_bins: Number of length buckets to create.
            data: Iterable of arrays; the first axis of each is its length.
            labels: Iterable of labels, aligned with ``data``.
            seq_lengths: Lengths used to derive the bucket edges.
                # assumes these match ``x.shape[0]`` of the samples - TODO confirm
            padding: If truthy, pad every sample to the longest integer
                length its bucket can hold.
            padding_value: Forwarded to ``pad_sequence`` as the fill value.
        """
        boundaries = list(histedges_equalN(seq_lengths, n_bins))
        buckets_min = boundaries[:-1]
        buckets_max = boundaries[1:]
        # Upper edges are exclusive; widen the last one so the longest
        # sequence still falls inside a bucket.
        buckets_max[-1] += 1

        data_buckets = dict()
        for x, y in zip(data, labels):
            b_id = element_to_bucket_id(x, buckets_min, buckets_max)
            if padding:
                # Edges are interpolated and may be fractional. The longest
                # integer length strictly below the upper edge is ceil(edge) - 1;
                # truncating (edge - 1) would be one short for e.g. edge 20.3.
                max_len = int(np.ceil(buckets_max[b_id])) - 1
                x = pad_sequence(x, max_len=max_len, padding_value=padding_value)

            bucket = data_buckets.setdefault(b_id, {'x': [], 'y': []})
            bucket['x'].append(x)
            bucket['y'].append(y)

        self.n_bins = n_bins
        self.buckets = data_buckets
        self._permute()

    def _permute(self):
        """Shuffle the bucket order and the samples inside every bucket."""
        # Shuffle only the buckets that exist: duplicate edges leave some
        # bucket ids without any samples, and indexing those raised KeyError.
        self.b_ids = np.random.permutation(list(self.buckets.keys()))

        for key in self.b_ids:
            xbin = np.array(self.buckets[key]['x'])
            ybin = np.array(self.buckets[key]['y'])
            # The same permutation is applied to x and y to keep them aligned.
            index_array = np.random.permutation(len(xbin))
            self.buckets[key]['x'] = xbin[index_array]
            self.buckets[key]['y'] = ybin[index_array]

    def on_epoch_end(self):
        """Reshuffle buckets and their contents after each epoch."""
        self._permute()

    def __len__(self):
        """Denotes the number of batches per epoch (one per populated bucket)."""
        return len(self.buckets)

    def __getitem__(self, idx):
        """Return ``(x, y)`` arrays of the bucket at shuffled position ``idx``."""
        key = self.b_ids[idx]
        return self.buckets[key]['x'], self.buckets[key]['y']

In [None]:
def val_generator(x_val, y_val):
    """Endlessly yield validation samples one at a time as batches of size 1.

    The previous version used ``return`` (so it was not a generator at all),
    skipped the last sample, and never reset its counter. This version cycles
    over every ``(sample, label)`` pair forever.

    Args:
        x_val: Sized iterable of 2-D arrays (each reshaped to
            ``(1, rows, cols)``).
        y_val: Sized iterable of scalar labels aligned with ``x_val``.

    Returns:
        A generator yielding ``(x, label)`` tuples where ``x`` has shape
        ``(1, rows, cols)`` and ``label`` is an ``int16`` array of shape
        ``(1, 1)``.

    Raises:
        ValueError: If either input is empty (an endless loop over nothing
            would otherwise hang the consumer).
    """
    if len(x_val) == 0 or len(y_val) == 0:
        raise ValueError("val_generator needs at least one (sample, label) pair")

    def _cycle():
        while True:
            for x, y in zip(x_val, y_val):
                batch_x = x.reshape(1, x.shape[0], x.shape[1])
                label = np.zeros((1, 1), dtype=np.int16)
                label[0] = y
                yield batch_x, label

    return _cycle()

In [None]:
window = 256
split = 0.3
data_prefix = '/scratch/sk7898/pedbike/window_256'
# Which feature representation to load: 'stft' selects the *_stft
# directories, any other value selects the *_time directories.
# NOTE(review): `data_type` was never assigned in this notebook, so this cell
# raised NameError on a fresh kernel; 'stft' is an assumed default - confirm.
data_type = 'stft'

if data_type == 'stft':
    upstream_data_dir = 'upstream_stft'
    downstream_data_dir = 'downstream_stft'
else:
    upstream_data_dir = 'upstream_time'
    downstream_data_dir = 'downstream_time'

# Resolve the dataset directories under `data_prefix`; it was defined but
# unused, which made the loads depend on the kernel's working directory.
# NOTE(review): presumably the data lives under data_prefix - confirm.
u_train, u_val, u_y_train, u_y_val, u_seqs_train, u_seqs_val = get_data(
    os.path.join(data_prefix, upstream_data_dir))

d_train, d_val, d_y_train, d_y_val, d_seqs_train, d_seqs_val = get_data(
    os.path.join(data_prefix, downstream_data_dir))

# The generator construction that used to follow referenced names that are
# never defined (n_bins, x_train, y_train, seqs_train, x_val, y_val); the
# per-stream generators are built in the following cell instead.

In [None]:
batch_size = 64
# Aim for roughly `batch_size` sequences per bucket; keep at least one bucket
# so that small datasets do not end up with zero bins.
u_n_bins = max(1, len(u_seqs_train) // batch_size)
d_n_bins = max(1, len(d_seqs_train) // batch_size)

# `train_generator` is not defined anywhere in this notebook; BucketedBatch
# has exactly the signature these calls use.
u_train_gen = BucketedBatch(u_n_bins, u_train, u_y_train, seq_lengths=u_seqs_train, padding=True, padding_value=0.0)
# Validation labels are u_y_val / d_y_val (the data arrays were passed twice).
u_val_gen = val_generator(u_val, u_y_val)

d_train_gen = BucketedBatch(d_n_bins, d_train, d_y_train, seq_lengths=d_seqs_train, padding=True, padding_value=0.0)
d_val_gen = val_generator(d_val, d_y_val)