# Load Phoneme Dataset and process it
Data is found here: https://www.timeseriesclassification.com/description.php?Dataset=PhonemeSpectra

Download the data, unzip it, and move it into a folder under '../Data/' (the cells below read and write files in '../Data/Phoneme/').

In [1]:
import scipy
import numpy as np

In [2]:
def convert_single_to_numpy(example):
    """Turn one raw record into a 2-D float64 array.

    `example` is a single entry of the raw data (e.g. ``train[0][n]``); its
    first element holds the values. They are reinterpreted as float64 and
    laid out as an array of shape (11, 217).
    """
    values = example[0]
    as_floats = values.view(dtype=np.float64)
    return as_floats.reshape((11, 217))

def convert_label(example):
    """Return the label of one raw record as a native `str`.

    `example` is a single entry of the raw data (e.g. ``train[0][n]``); its
    second element is the label as UTF-8 encoded bytes.
    """
    raw_label = example[1]
    return raw_label.decode(encoding="utf-8")

In [13]:
# Read the saved test examples and their labels back from disk
test, test_labels = (
    np.load('../Data/Phoneme/test_X.npy'),
    np.load('../Data/Phoneme/test_y.npy'),
)

In [14]:
# Split the loaded test set into two random halves; the second half is the
# one written out as the validation set by the cell that follows.
SPLIT_SEED = 0  # fixed seed so the split is identical on every run
num_elements = test.shape[0]
assert test_labels.shape[0] == num_elements, \
    "test and test_labels must contain the same number of examples"

# Seeded generator instead of the global np.random state -> reproducible split
permuted_indices = np.random.default_rng(SPLIT_SEED).permutation(num_elements)
first_half = permuted_indices[:num_elements // 2]
second_half = permuted_indices[num_elements // 2:]

# Index data and labels with the same indices so they stay aligned
split_arr1, split_arr2 = test[first_half], test[second_half]
split_lab1, split_lab2 = test_labels[first_half], test_labels[second_half]

# Display the shapes of the split arrays
print("Shape of split_arr1:", split_arr1.shape)
print("Shape of split_arr2:", split_arr2.shape)

# Display the shapes of the split labels
print("Shape of split_lab1:", split_lab1.shape)
print("Shape of split_lab2:", split_lab2.shape)

Shape of split_arr1: (1676, 217, 11)
Shape of split_arr2: (1677, 217, 11)
Shape of split_arr1: (1676,)
Shape of split_arr2: (1677,)


In [18]:
# Persist the second half of the split as the validation set.
# Written into '../Data/Phoneme/' because that is where the later cells load
# valid_X.npy from, and where the test_X.npy / test_y.npy files live.
# NOTE(review): split_arr1 / split_lab1 are not saved by this cell — confirm
# where the reduced test split is written.
with open('../Data/Phoneme/valid_X.npy', 'wb') as f:
    np.save(f, split_arr2)

with open('../Data/Phoneme/valid_y.npy', 'wb') as f:
    np.save(f, split_lab2)

In [56]:
# Convert every raw record into a (value array, decoded label) pair.
# NOTE(review): `train` is not assigned in any earlier cell visible here —
# confirm which cell loads the raw data before this one runs.
raw_records = train[0]
data_arrays = [convert_single_to_numpy(record) for record in raw_records]
labels = [convert_label(record) for record in raw_records]

In [57]:
# Stack the per-example (11, 217) arrays along a new leading axis, then swap
# the last two axes so each example becomes (217, 11).
training_dataset = np.stack(data_arrays).transpose(0, 2, 1)
training_labels = np.array(labels)

In [58]:
# Sanity check: after the stack + transpose the shape should be (num_examples, 217, 11)
training_dataset.shape

(3353, 217, 11)

In [59]:
# Boolean mask over examples (axis 0): True where none of the example's values is NaN
no_nan_rows = ~np.isnan(training_dataset).any(axis=(1, 2))
num_rows_no_nan = int(no_nan_rows.sum())
print("Number of rows without NaN values:", num_rows_no_nan)

Number of rows without NaN values: 3353


In [61]:
# Write the converted array to disk.
# NOTE(review): the array is named `training_dataset` but is saved as test_X.npy — confirm this is intended.
np.save('../Data/Phoneme/test_X.npy', training_dataset)

# Get min and max values

In [4]:
# Read the train, valid, and test arrays back from ../Data/Phoneme/
train = np.load('../Data/Phoneme/train_X.npy')
valid = np.load('../Data/Phoneme/valid_X.npy')
test = np.load('../Data/Phoneme/test_X.npy')

# Largest value in each split
max_train, max_valid, max_test = train.max(), valid.max(), test.max()

# Smallest value in each split
min_train, min_valid, min_test = train.min(), valid.min(), test.min()

# Extremes across all three splits
global_max = max(max_train, max_valid, max_test)
global_min = min(min_train, min_valid, min_test)

# Report the per-split extremes: all maxima first, then all minima
for caption, value in (
    ("max_train:", max_train),
    ("max_valid:", max_valid),
    ("max_test:", max_test),
    ("min_train:", min_train),
    ("min_valid:", min_valid),
    ("min_test:", min_test),
):
    print(caption, value)

max_train: 126.76
max_valid: 172.96
max_test: 140.57
min_train: 0.0
min_valid: 0.0
min_test: 0.0


In [7]:
# Report the shape of each split, preceded by a legend for the three axes
print("(Num examples, num time steps, num channels)")
for caption, split in (
    ("Shape of train dataset:", train),
    ("Shape of valid dataset:", valid),
    ("Shape of test dataset:", test),
):
    print(caption, split.shape)

(Num examples, num time steps, num channels)
Shape of train dataset: (3315, 217, 11)
Shape of valid dataset: (1677, 217, 11)
Shape of test dataset: (1676, 217, 11)


# Reshape data and resave it

In [8]:
# Load the train, valid, and test arrays from ../Data/Phoneme/
train = np.load('../Data/Phoneme/train_X.npy')
valid = np.load('../Data/Phoneme/valid_X.npy')
test = np.load('../Data/Phoneme/test_X.npy')

# Convert each split from (N, T, C) to (N, C, T), where N is the number of
# examples, C the number of channels, and T the number of time steps.
# The axes have to be permuted with a transpose: np.reshape keeps the flat
# element order and only reinterprets the shape, which would mix values from
# different time steps and channels instead of swapping the two axes.
# ascontiguousarray turns the transposed view into a C-ordered copy.
train = np.ascontiguousarray(np.transpose(train, (0, 2, 1)))
valid = np.ascontiguousarray(np.transpose(valid, (0, 2, 1)))
test = np.ascontiguousarray(np.transpose(test, (0, 2, 1)))

print("Shape of train dataset:", train.shape)
print("Shape of valid dataset:", valid.shape)
print("Shape of test dataset:", test.shape)

# Save
# NOTE: this overwrites the input files in place, so running the cell a second
# time swaps the axes back to (N, T, C).
np.save('../Data/Phoneme/train_X.npy', train)
np.save('../Data/Phoneme/valid_X.npy', valid)
np.save('../Data/Phoneme/test_X.npy', test)

Shape of train dataset: (3315, 11, 217)
Shape of valid dataset: (1677, 11, 217)
Shape of test dataset: (1676, 11, 217)
