# Data Production

In this notebook we generate the training set and the validation sets. Each example is a sequence of 0s and 1s, and its label is either 1 (odd) or 0 (even), representing the parity of the number of 1s in the sequence. The entire point of the research paper is for models to learn algorithms that extrapolate outside of the training domain; thus the training data (as in the paper) consists of all possible sequences of 0s and 1s of length 1 to 10, and the validation data consists of 100 random sequences for each of the lengths 25, 50, 100, 250, 1000, 2500, and 10000.

### 1. Imports

In [None]:
import numpy as np
import os

### 2. Generate Training Data


In [None]:
# Generate all binary sequences of length 1 to L_train and their final parity labels

def generate_training_data(L_train):
    """Enumerate every binary sequence of length 1..L_train with its parity.

    Sequences are emitted in order of increasing length and, within one
    length, in increasing integer value (most significant bit first).

    Args:
        L_train: Largest sequence length to enumerate (inclusive).

    Returns:
        A pair ``(sequences, labels)``. ``sequences`` is always a 1-D object
        array whose entries are 1-D integer arrays of 0s and 1s; ``labels``
        is an int64 array holding 1 where the matching sequence contains an
        odd number of 1s and 0 otherwise.
    """
    sequences = []
    labels = []
    for length in range(1, L_train + 1):
        for i in range(2 ** length):
            # Bits of i, most significant first, zero-padded to `length`.
            bits = [(i >> j) & 1 for j in range(length - 1, -1, -1)]
            sequences.append(np.array(bits))
            labels.append(sum(bits) % 2)

    # Fill a pre-allocated object array one element at a time.
    # np.array(sequences, dtype=object) stacks equal-length rows into a 2-D
    # array (this happens for L_train == 1), which would make the container
    # layout depend on the argument.
    seq_array = np.empty(len(sequences), dtype=object)
    for idx, seq in enumerate(sequences):
        seq_array[idx] = seq
    return seq_array, np.array(labels, dtype=np.int64)


In [None]:
# Training domain: every binary sequence of length 1 through L_train.
L_train = 10
train_X, train_Y = generate_training_data(L_train)

# Visual check: print each training sequence next to its parity label.
for x, y in zip(train_X, train_Y):
    print(x, y)

In [None]:
# Persist the training set as a single compressed .npz archive.
# X is an object array, which NumPy stores via pickle, so reading it back
# requires np.load(data_path, allow_pickle=True).
data_path = os.path.join('../data/training', 'train_data.npz')
# savez_compressed does not create missing directories; make sure it exists.
os.makedirs(os.path.dirname(data_path), exist_ok=True)
np.savez_compressed(data_path, X=train_X, Y=train_Y)
print(f"Saved training data to {data_path}")

### 3. Generate Validation Data


In [None]:
# Generate L_validation random binary sequences of length L_sequence, each with its parity label.

def generate_validation_data(L_validation, L_sequence):
    """Draw random binary sequences of one fixed length with parity labels.

    Bits come from NumPy's global random state, one ``np.random.choice``
    call per bit, so results are reproducible after ``np.random.seed``.
    Every generated sequence and its parity is printed as it is produced.

    Args:
        L_validation: Number of sequences to generate.
        L_sequence: Number of bits in each sequence.

    Returns:
        A pair ``(sequences, labels)``. Because all sequences share one
        length, NumPy stacks them, so ``sequences`` is an object array of
        shape ``(L_validation, L_sequence)``; ``labels`` is an int64 array
        with 1 for an odd number of 1s and 0 for an even number.
    """
    all_sequences, all_labels = [], []
    for _ in range(L_validation):
        # Draw bit by bit, in order, to keep the random stream unchanged.
        drawn = [np.random.choice([0, 1]) for _ in range(L_sequence)]
        seq = np.array(drawn)
        seq_parity = np.mod(np.sum(drawn), 2)
        print("current seq: ", seq, "parity: ", seq_parity)
        all_sequences.append(seq)
        all_labels.append(seq_parity)
    return (
        np.array(all_sequences, dtype=object),
        np.array(all_labels, dtype=np.int64),
    )

In [None]:
# Fix the global NumPy seed so the validation sets are reproducible.
np.random.seed(0)

val_len = 100  # number of sequences generated per length
seq_len = [25, 50, 100, 250, 1000, 2500, 10000]

# Map each sequence length to its (sequences, labels) pair. Lengths are
# processed in list order, which fixes the order random bits are consumed.
validation_data = {
    length: generate_validation_data(val_len, length) for length in seq_len
}

In [None]:
# Write one compressed .npz archive per validation sequence length.
# X is an object array (stored via pickle), so loading it back requires
# np.load(path, allow_pickle=True).
val_dir = '../data/validation'
# savez_compressed does not create missing directories; make sure it exists.
os.makedirs(val_dir, exist_ok=True)
for vl, vals in validation_data.items():
    data_path = os.path.join(val_dir, f'val_data_{vl}.npz')
    np.savez_compressed(data_path, X=vals[0], Y=vals[1])
    print(f"Saved validation data to {data_path}")