In [None]:
def process_data(
    DATASET_FILE_PATH,
    DATASET_SHEET_TITLE,
    GRANULARITY,
    SMOOTHING,
    ATTEMPT_NAME,
    VAL_PERCENT,
    TEST_PERCENT,
    PAST_HISTORY,
    FUTURE_TARGET,
    STEP_SIZE_SLIDING_WINDOW,
    BATCH_SPLITS,
    EPOCHS,
    SHUFFLE_BUFFER_SIZE,
    MEAN,
):
    """Run the full data pipeline: load, smooth, plot, window and batch.

    The stages are executed in this order:
      1. ``load_dataset``  - read the requested sheet from the xlsx file.
      2. ``split_data``    - subsample/smooth and separate features from ground truth.
      3. ``plot_dataset``  - show the prepared series (side effect only).
      4. ``slice_data``    - build train/validation/test sliding windows.
      5. ``batch_data``    - wrap the windows in batched datasets.

    ``ATTEMPT_NAME`` is accepted but not used by this function.

    Returns:
        A 7-tuple ``(indexes, ground_truth, x_train, x_val,
        batched_train_data, batched_val_data, batched_test_data)``.
        The label arrays and the test windows are only passed on to
        ``batch_data`` and are not returned directly.
    """
    sheet = load_dataset(DATASET_FILE_PATH, DATASET_SHEET_TITLE, GRANULARITY)
    indexes, features, ground_truth = split_data(sheet, GRANULARITY, SMOOTHING, MEAN)

    # Purely visual; the return value (if any) is ignored.
    plot_dataset(features, ground_truth, indexes)

    x_train, y_train, x_val, y_val, x_test, y_test = slice_data(
        indexes,
        features,
        ground_truth,
        VAL_PERCENT,
        TEST_PERCENT,
        PAST_HISTORY,
        FUTURE_TARGET,
        STEP_SIZE_SLIDING_WINDOW,
        GRANULARITY,
    )

    train_batches, val_batches, test_batches = batch_data(
        x_train,
        y_train,
        x_val,
        y_val,
        x_test,
        y_test,
        BATCH_SPLITS,
        EPOCHS,
        SHUFFLE_BUFFER_SIZE,
    )

    return indexes, ground_truth, x_train, x_val, train_batches, val_batches, test_batches

In [None]:
# Definitions

# Column names assigned to every data sheet read from the xlsx files,
# in sheet order.
columns_data = [
    '1', '2', '3', '4', '5', '6', '7', '8',  # positions 0-7: the feature columns
    'N/A_1', 'N/A_2',                        # positions 8-9
    'angle',                                 # position 10: the ground truth
    'time', 'session',                       # positions 11-12
]
# The first eight columns are the model inputs.
columns_features_considered = columns_data[:8]
# The 'angle' column is the prediction target.
column_ground_truth = columns_data[10]
# NOTE: the 'time' column is deliberately ignored. This makes the data slightly
# imprecise, because the real recordings have tiny differences between time
# intervals, but they are too small to be worth modeling. Each timestep is
# treated as exactly 1 millisecond (0.001 second).

In [None]:
def load_dataset(DATASET_FILE_PATH, DATASET_SHEET_TITLE, GRANULARITY):
    """Load one raw-data sheet from the workbook and name its columns.

    The first sheet of the workbook is read as a table of contents. The row
    whose first cell equals ``"<DATASET_SHEET_TITLE>_raw_data"`` is located,
    and the sheet at position ``row label + 1`` is then loaded (the offset
    skips the table-of-contents sheet itself).

    Args:
        DATASET_FILE_PATH: Path of the xlsx workbook.
        DATASET_SHEET_TITLE: Title prefix of the wanted table-of-contents entry.
        GRANULARITY: Not used by this function.

    Returns:
        The sheet as a DataFrame whose columns are renamed to ``columns_data``.

    Raises:
        IndexError: If no table-of-contents entry matches the title.
    """
    toc = pd.read_excel(DATASET_FILE_PATH, sheet_name=0, header=None)

    wanted_entry = f"{DATASET_SHEET_TITLE}_raw_data"
    is_wanted = toc[0] == wanted_entry
    matching_titles = toc[is_wanted][0]
    # Row label of the first match; fails with IndexError when nothing matched.
    toc_position = matching_titles.index[0]

    sheet_data = pd.read_excel(DATASET_FILE_PATH, sheet_name=toc_position + 1, header=None)
    sheet_data.columns = columns_data
    return sheet_data

In [None]:
def mean_observations(features):
    """Collapse every row of ``features`` to the mean of its columns.

    Vectorised replacement for a per-row Python loop. The observable result
    is kept the same: a single-column DataFrame (column label ``0``) with a
    fresh 0..n-1 index, so the index of ``features`` is not carried over.

    Args:
        features: DataFrame of numeric observations, one row per timestep.

    Returns:
        DataFrame with one row per input row, holding that row's mean.
        An input without rows yields a result without rows instead of
        raising.
    """
    # skipna=False keeps the semantics of a plain sum/len: a row that
    # contains a NaN produces NaN rather than the mean of the remaining cells.
    row_means = features.mean(axis=1, skipna=False)
    # Going through a bare ndarray drops the original index on purpose.
    return pd.DataFrame(row_means.to_numpy())

def split_data(raw_data, GRANULARITY, SMOOTHING, MEAN):
    """Subsample, smooth and separate the raw sheet into features and target.

    Every ``GRANULARITY``-th row is kept, and both the feature columns and the
    ground-truth column are smoothed with an exponentially weighted mean of
    span ``SMOOTHING``.

    Args:
        raw_data: DataFrame containing the columns named in ``columns_data``.
        GRANULARITY: Keep one row out of this many.
        SMOOTHING: ``span`` passed to the exponentially weighted mean.
        MEAN: When truthy, the feature columns are additionally collapsed to
            their per-row mean via ``mean_observations``.

    Returns:
        ``(indexes, features, ground_truth)`` where ``indexes`` is the range
        of kept row positions and the other two are DataFrames.
    """
    # Positions of the rows that survive subsampling; each timestep is a millisecond.
    indexes = range(len(raw_data))[::GRANULARITY]

    sampled_features = raw_data[columns_features_considered][::GRANULARITY]
    features = sampled_features.ewm(span=SMOOTHING).mean()
    if MEAN:
        features = mean_observations(features)

    sampled_truth = pd.DataFrame(raw_data[column_ground_truth][::GRANULARITY])
    ground_truth = sampled_truth.ewm(span=SMOOTHING).mean()

    return indexes, features, ground_truth

In [None]:
def plot_dataset(features, ground_truth, indexes):
    """Display three figures describing the prepared dataset.

    Shown in order: the features with one subplot per column, the feature
    values drawn together against ``indexes``, and the ground truth.
    Nothing is returned; the function only draws.
    """
    # Figure 1: each feature column in its own subplot.
    features.plot(subplots=True)
    plt.show()

    # Figure 2: all feature columns on shared axes, x-axis = sample indexes.
    feature_values = features.values
    plt.plot(indexes, feature_values)
    plt.show()

    # Figure 3: the ground-truth series.
    ground_truth.plot()
    plt.show()

In [None]:
# Create array of all sliding windows of the data
def multivariate_data(dataset_features, dataset_ground_truth, start_index, end_index, history_size,
                      target_size, step, granularity, single_step=False, print_index=False):
    """Build sliding input windows and their labels from aligned arrays.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    the input window is ``dataset_features[i - history_size : i : step]``.
    The label is ``dataset_ground_truth[i + target_size]`` when
    ``single_step`` is true, otherwise the slice
    ``dataset_ground_truth[i : i + target_size]``.

    Args:
        dataset_features: Array of observations, indexed by timestep.
        dataset_ground_truth: Array of targets, indexed like the features.
        start_index: First timestep that may appear inside a window.
        end_index: Exclusive upper bound for ``i``. ``None`` means "as far as
            the data allows". Values that would leave fewer than
            ``target_size`` timesteps after ``i`` are clamped, so every label
            slice has full length (previously this produced ragged labels, or
            an IndexError in single-step mode).
        history_size: Number of timesteps spanned by each input window.
        target_size: Label horizon, in timesteps.
        step: Stride between the timesteps sampled inside a window.
        granularity: Not used by this function.
        single_step: Select a single future value instead of a label slice.
        print_index: Print a trace of the positions visited (debug aid).

    Returns:
        ``(data, labels)`` as numpy arrays with one entry per window.
    """
    data, labels = [], []
    start_index = start_index + history_size

    # Last exclusive bound for which a complete label window still exists.
    last_valid_end = len(dataset_features) - target_size
    if end_index is None or end_index > last_valid_end:
        end_index = last_valid_end

    if print_index: print("start")
    for i in range(start_index, end_index):
        if print_index: print("A", i,)
        # The sliding window: history_size timesteps ending just before i.
        indices = range(i - history_size, i, step)
        data.append(dataset_features[indices])
        if single_step:
            labels.append(dataset_ground_truth[i + target_size])
        else:
            labels.append(dataset_ground_truth[i:i + target_size])
    return np.array(data), np.array(labels)


def slice_data(indexes, features, ground_truth, VAL_PERCENT, TEST_PERCENT, PAST_HISTORY, FUTURE_TARGET, 
               STEP_SIZE_SLIDING_WINDOW, GRANULARITY):
    """Cut the series chronologically into train, validation and test windows.

    The observations are divided, in order, into a training part, a
    validation part of ``VAL_PERCENT`` and a trailing test part of
    ``TEST_PERCENT``. Each part is turned into sliding windows by
    ``multivariate_data`` (multi-step labels, no index tracing).

    Fix: the validation/test boundary used to be computed from
    ``VAL_PERCENT``, which gave the validation part ``TEST_PERCENT`` of the
    data and the test part ``VAL_PERCENT``. The boundary is now placed so that
    each part has the size its parameter names.

    Args:
        indexes: Not used by this function.
        features: DataFrame of model inputs; only ``.values`` is read.
        ground_truth: DataFrame of targets; only ``.values`` is read.
        VAL_PERCENT: Fraction (0-1) of observations used for validation.
        TEST_PERCENT: Fraction (0-1) of observations used for testing.
        PAST_HISTORY: Window length handed to ``multivariate_data``.
        FUTURE_TARGET: Label horizon handed to ``multivariate_data``.
        STEP_SIZE_SLIDING_WINDOW: In-window stride handed to ``multivariate_data``.
        GRANULARITY: Forwarded unchanged to ``multivariate_data``.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)`` as numpy arrays.
    """
    dataset = features.values
    targets = ground_truth.values
    observations = len(dataset)

    # Boundaries, in observation positions:
    #   [0, train_split)         -> training
    #   [train_split, val_split) -> validation (VAL_PERCENT of the data)
    #   [val_split, end)         -> test       (TEST_PERCENT of the data)
    train_split = int(observations * (1 - VAL_PERCENT - TEST_PERCENT))
    val_split = int(observations * (1 - TEST_PERCENT))

    def _windows(start, end):
        # Same windowing settings for all three parts; only the bounds differ.
        return multivariate_data(dataset, targets, start, end, PAST_HISTORY, FUTURE_TARGET,
                                 STEP_SIZE_SLIDING_WINDOW, GRANULARITY, single_step=False,
                                 print_index=False)

    x_train, y_train = _windows(0, train_split)
    x_val, y_val = _windows(train_split, val_split)
    # None lets multivariate_data run to the last position with a full label window.
    x_test, y_test = _windows(val_split, None)

    return x_train, y_train, x_val, y_val, x_test, y_test

In [None]:
def batch_data(x_train, y_train, x_val, y_val, x_test, y_test, BATCH_SPLITS, EPOCHS, SHUFFLE_BUFFER_SIZE):
    """Wrap the window arrays in batched ``tf.data`` datasets.

    Args:
        x_train, y_train: Training windows and labels.
        x_val, y_val: Validation windows and labels.
        x_test, y_test: Test windows and labels.
        BATCH_SPLITS: Value passed to ``batch()`` for training and validation.
        EPOCHS: Value passed to ``repeat()`` for training and validation.
        SHUFFLE_BUFFER_SIZE: Buffer size for shuffling the training data;
            ``0`` disables shuffling.

    Returns:
        ``(batched_train_data, batched_val_data, batched_test_data)``.
    """
    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    # Shuffle before batching, and only when a buffer size was requested.
    if SHUFFLE_BUFFER_SIZE != 0:
        train_ds = train_ds.shuffle(SHUFFLE_BUFFER_SIZE)
    batched_train_data = train_ds.batch(BATCH_SPLITS).repeat(EPOCHS)

    val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    batched_val_data = val_ds.batch(BATCH_SPLITS).repeat(EPOCHS)

    # Test data: batch size 1 and a single pass (no repeat).
    test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    batched_test_data = test_ds.batch(1)

    return batched_train_data, batched_val_data, batched_test_data