In [1]:
# Constants
INPUT_SIZE = 4096 # [512, 1024, 2048, 4096]

SPLITS = (0, 1, 2, 3, 4, 5, 6, 7)
NUMBER_OF_NEW_SPLITS = 4
SHIFTS = [0, 512, 1024] # Default is 0
NUMBER_OF_TRAIN_SAMPLES = 10_000
PER_SPLIT = 100

In [2]:
# Load datasets
import pandas as pd
import numpy as np
from random import sample

train = []
tests = []
for split in SPLITS:
    train += pd.read_pickle("data/split_%s_train.pickle.gzip" % split).values.tolist()
    tests += pd.read_pickle("data/split_%s_test.pickle.gzip" % split).values.tolist()
    print("Loaded %s" % split)

# FIND CLASS DIST
train = sample([(t[6::], (1, 0)) for t in train], NUMBER_OF_NEW_SPLITS * PER_SPLIT)
test = sample([(t[0], t[6::], (0, 1) if t[3] == 1 else (1, 0)) for t in tests if (t[3] == 1 and abs(t[5]) in SHIFTS and abs(t[5]) <= INPUT_SIZE) or t[3] == 0], NUMBER_OF_NEW_SPLITS * PER_SPLIT)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Loaded 0
Loaded 1
Loaded 2
Loaded 3
Loaded 4
Loaded 5
Loaded 6
Loaded 7


### Setup Tests

In [3]:
# Balance tests
from random import shuffle

new_test_pos = [t for t in test if np.argmax(t[2]) == 1]
new_test_neg = [(t[1], t[2]) for t in test if np.argmax(t[2]) == 0]
shuffle(new_test_neg)

waves = list(set([t[0] for t in new_test_pos]))
shuffle(waves)
new_waves = []
split = len(waves)//NUMBER_OF_NEW_SPLITS
print(len(waves))

for _ in range(NUMBER_OF_NEW_SPLITS):
    new_waves.append(waves[:split])
    del waves[:split]

tests = []
neg_split = len(new_test_neg)//NUMBER_OF_NEW_SPLITS
for i, waves in enumerate(new_waves):
    tests.append([(t[1], t[2]) for t in new_test_pos if t[0] in waves])
    neg_samples_len = min([len(tests[i]), neg_split])
    tests[i] += new_test_neg[:neg_samples_len]
    del new_test_neg[:neg_samples_len]


31


In [4]:
# Separate only the x values
x_tests = [[t[0] for t in test] for test in tests]
y_tests = [[t[1] for t in test] for test in tests]

print(len(y_tests[0]))

24


In [5]:
# Select only the center of the test segments
new_xtests = []

if INPUT_SIZE == 4096:
    new_xtests = [x_test for x_test in x_tests]
else:
    for x_test in x_tests:
        center_point = len(x_test[0])//2
        starting_point = center_point - (INPUT_SIZE//2)
        ending_point = center_point + (INPUT_SIZE//2)

        x_test = [t[starting_point:ending_point] for t in x_test]

        for t in x_test:
            assert len(t) == INPUT_SIZE

        new_xtests.append(x_test)

x_tests = new_xtests

In [6]:
print(len(x_tests))
print([len(x) for x in x_tests])
print(len(x_tests[0][0]))

print(len(y_tests))
print([len(y) for y in y_tests])
print(len(y_tests[0][0]))

4
[24, 16, 24, 22]
4096
4
[24, 16, 24, 22]
2


### Setup Train

In [7]:
# Separate xs
x_trains = [t[0] for t in train]
x_trains = [train[split::NUMBER_OF_NEW_SPLITS][:100] for split in range(NUMBER_OF_NEW_SPLITS)]

In [8]:
# Select only the center of the train segments
new_xtrains = []

if INPUT_SIZE == 4096:
    new_xtrains = [x_train for x_train in x_trains]
else:
    for x_train in x_trains:
        center_point = len(x_train[0])//2
        starting_point = center_point - (INPUT_SIZE//2)
        ending_point = center_point + (INPUT_SIZE//2)

        x_train = [t[starting_point:ending_point] for t in x_train]

        for t in x_train:
            assert len(t) == INPUT_SIZE

        new_xtrains.append(x_train)

x_trains = new_xtrains

In [9]:
print(len(x_trains))
print([len(x) for x in x_trains])
print(len(x_trains[0][0]))

4
[100, 100, 100, 100]
2


In [10]:
def min_max(lst):
    lst = np.array(lst)
    strain_min, strain_max = np.min(lst), np.max(lst)
    min_max_diff = strain_max - strain_min
    lst -= strain_min
    lst /= min_max_diff
    return lst.tolist()

x_tests = [[min_max(x) for x in split] for split in x_tests]
x_train = [[min_max(x) for x in split] for split in x_trains]

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [None]:
# Save to big pickle file
from pickle import dump

dump((x_tests, y_tests), open("data/test_%s.pickle" % INPUT_SIZE, "wb+"))
dump(x_trains, open("data/train_%s.pickle" % INPUT_SIZE, "wb+"))