In [3]:
import os
import numpy as np
import pandas as pd
import pickle

In [4]:
def save_pickle(data, file_name):
    """
    Serialize ``data`` with pickle and write it to ``file_name``.

    The target file is opened in binary write mode, so an existing file at
    that path is overwritten.

    Parameters
    ----------
    data : object
        Any picklable Python object.
    file_name : str or path-like
        Destination path of the pickle file.

    Returns
    -------
    None
    """
    with open(file_name, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

In [5]:
# Resolve the project root. When the notebook is launched from the
# `preprocessing` directory the project root is its parent; otherwise the
# current working directory is used unchanged.
cwd = os.getcwd()
# Strip only a trailing `preprocessing` path component. A blanket
# str.replace would also rewrite unrelated components (e.g. a directory
# called `preprocessing_v2`, or an ancestor directory named `preprocessing`).
parent_wd = os.path.dirname(cwd) if os.path.basename(cwd) == 'preprocessing' else cwd

# Raw per-hold feature table.
hold_feature_path = os.path.join(parent_wd, 'raw_data', 'HoldFeature2016.csv')
# Pickled data splits read by the following cells.
training_set_path = os.path.join(parent_wd, 'preprocessing', 'training_set')
dev_set_path = os.path.join(parent_wd, 'preprocessing', 'dev_set')
test_set_path = os.path.join(parent_wd, 'preprocessing', 'test_set')

In [6]:
# Load the three pickled data splits in a fixed order: training, dev, test.
# NOTE: pickle.load can execute arbitrary code, so these files must come from
# a trusted source.
loaded_sets = []
for set_path in (training_set_path, dev_set_path, test_set_path):
    with open(set_path, 'rb') as pickle_file:
        loaded_sets.append(pickle.load(pickle_file))
training_set, dev_set, test_set = loaded_sets

In [7]:
# Read the hold feature table with every column kept as text, so the
# 'Difficulties' field can be split character by character below.
features = pd.read_csv(hold_feature_path, dtype=str)

# Build a lookup from (X_coord, Y_coord) to an integer vector whose entries
# are the individual characters of that row's 'Difficulties' string.
feature_dict = {}
for x_coord, y_coord, difficulties in zip(
        features['X_coord'], features['Y_coord'], features['Difficulties']):
    position = (int(x_coord), int(y_coord))
    feature_dict[position] = np.array(list(difficulties)).astype(int)

In [23]:
# Dense (11, 18, 6) grid indexed by [x, y, feature]. Positions that have no
# entry in feature_dict keep an all-zero feature vector.
# The values are still raw at this point, despite the variable name.
feature_matrix_normalized = np.zeros((11, 18, 6))
for (x_coord, y_coord), difficulties in feature_dict.items():
    feature_matrix_normalized[x_coord, y_coord] = difficulties

In [24]:
# Standardize each feature channel in place to zero mean and unit variance.
# The statistics are taken over every cell of the grid for that channel.
for i in range(feature_matrix_normalized.shape[-1]):
    # Compute both statistics before modifying the channel.
    mu = np.mean(feature_matrix_normalized[:, :, i])
    std = np.std(feature_matrix_normalized[:, :, i])
    feature_matrix_normalized[:, :, i] -= mu
    # A constant channel has std == 0; dividing would turn the whole channel
    # into NaN. It is already centered at zero, so leave it as is.
    if std > 0:
        feature_matrix_normalized[:, :, i] /= std

In [87]:
def prep_for_cnn(input_set, feature_matrix_normalized):
    """
    Convert a data split into an image-like tensor for a CNN.

    Each sample becomes a grid with ``3 + n_features`` channels:

    * channel 0: 1 at every grid position listed in the sample,
    * channel 1: 1 at positions whose column 8 equals 1
      (presumably the start holds; confirm against the upstream encoding),
    * channel 2: 1 at positions whose column 9 equals 1
      (presumably the end holds; confirm against the upstream encoding),
    * channels 3 and up: ``feature_matrix_normalized``, identical for every sample.

    Parameters
    ----------
    input_set : dict
        Mapping with keys ``'X'``, ``'Y'`` and ``'tmax'``. ``input_set['X'][i]``
        must be a 2-D array with at least 10 columns, where columns 6 and 7
        hold the grid coordinates and columns 8 and 9 are flags. Only the first
        ``int(input_set['tmax'][i])`` rows of each sample are used.
        Assumed to be a plain dict (it must provide ``.copy()``).
    feature_matrix_normalized : array-like, shape (n_rows, n_cols, n_features)
        Per-position features appended to every sample. The grid size and the
        number of channels of the result are derived from this shape.

    Returns
    -------
    dict
        Shallow copy of ``input_set`` whose ``'X'`` entry is replaced by an
        array of shape ``(n_sample, n_rows, n_cols, 3 + n_features)``. The
        caller's ``input_set`` is left untouched, so the function can be
        re-run on the same raw split.
    """
    feature_grid = np.asarray(feature_matrix_normalized)
    n_rows, n_cols, n_features = feature_grid.shape
    n_sample = len(input_set['Y'])

    X_cnn = np.zeros((n_sample, n_rows, n_cols, 3 + n_features))
    # The feature channels are the same for every sample: broadcast once.
    X_cnn[..., 3:] = feature_grid

    for i in range(n_sample):
        # Rows past tmax are ignored.
        holds = input_set['X'][i][0:int(input_set['tmax'][i])]
        rows = holds[:, 6].astype(int)
        cols = holds[:, 7].astype(int)
        is_start = holds[:, 8] == 1
        is_end = holds[:, 9] == 1

        X_cnn[i, rows, cols, 0] = 1
        X_cnn[i, rows[is_start], cols[is_start], 1] = 1
        X_cnn[i, rows[is_end], cols[is_end], 2] = 1

    # Copy before replacing 'X' so the caller's raw split is not overwritten.
    output_set = input_set.copy()
    output_set['X'] = X_cnn
    return output_set

In [88]:
# Convert the training, dev and test splits (in that order) to CNN inputs,
# all sharing the same feature matrix.
training_set_cnn, dev_set_cnn, test_set_cnn = (
    prep_for_cnn(split, feature_matrix_normalized)
    for split in (training_set, dev_set, test_set)
)

In [90]:
# Output locations of the CNN-ready splits.
training_cnn_set_path = parent_wd + '/preprocessing/training_set_cnn'
dev_cnn_set_path = parent_wd + '/preprocessing/dev_set_cnn'
test_cnn_set_path = parent_wd + '/preprocessing/test_set_cnn'

# Persist each split to its own pickle file (training, dev, test order).
for cnn_set, cnn_set_path in (
        (training_set_cnn, training_cnn_set_path),
        (dev_set_cnn, dev_cnn_set_path),
        (test_set_cnn, test_cnn_set_path)):
    save_pickle(cnn_set, cnn_set_path)