In [1]:
import json
import os

import numpy as np
from tqdm import tqdm

In [2]:
# Directory that the `all__*.npy` dataset arrays are loaded from.
dir_path = "data/arc2concept-aug-1000/train"

In [3]:
# Per-example arrays: `x` holds the inputs and `y` the matching labels.
x, y = (
    np.load(f"{dir_path}/all__inputs.npy"),
    np.load(f"{dir_path}/all__labels.npy"),
)

# Puzzle-level arrays: `x_ids` is passed to build_example_dicts as the puzzle
# identifiers, and `x_indices` as the puzzle -> example offsets.
x_ids, x_indices = (
    np.load(f"{dir_path}/all__puzzle_identifiers.npy"),
    np.load(f"{dir_path}/all__puzzle_indices.npy"),
)

# Group index array; loaded for completeness, not referenced by the cells shown here.
indices = np.load(f"{dir_path}/all__group_indices.npy")

In [7]:
# Quick look at the puzzle identifiers; NumPy truncates the repr of large arrays.
x_ids

array([      1,       2,       3, ..., 1191727, 1191728, 1191729],
      shape=(1191729,), dtype=int32)

In [4]:
def build_example_dicts(
    inputs: np.ndarray,           # [N_examples, seq_len]
    labels: np.ndarray,           # [N_examples, seq_len]
    puzzle_indices: np.ndarray,   # [N_puzzles + 1]
    puzzle_identifiers: np.ndarray,  # [N_puzzles]
    output_path: str = "data/my-arc2concept-aug-1000/train.jsonl",
) -> None:
    """
    Stream one JSON record per example into a JSON Lines file.

    Despite the name, no list of dicts is built or returned: each example is
    serialised and written to ``output_path`` as soon as it is produced, so
    memory use stays flat regardless of the number of examples.

    Each written record contains:
      - 'x':                ``inputs[i]`` converted with ``.tolist()``
      - 'y':                ``labels[i]`` converted with ``.tolist()``
      - 'aug_puzzle_index': int identifier of the puzzle example ``i`` belongs to
      - 'colour_aug', 'd8_aug', 'example_idx': constant placeholders that are
        only present so the records work with the dataloader

    Args:
        inputs: Per-example inputs, indexed along axis 0.
        labels: Per-example labels; must have the same number of rows as
            ``inputs``.
        puzzle_indices: Offsets into the example axis, starting at 0, such
            that puzzle ``p`` owns examples
            ``puzzle_indices[p]:puzzle_indices[p + 1]``.
        puzzle_identifiers: Identifier of each puzzle, indexed by puzzle
            position.
        output_path: Destination ``.jsonl`` file. Missing parent directories
            are created. Defaults to the location this function originally
            hard-coded.

    Raises:
        ValueError: If ``labels`` and ``inputs`` differ in length, or if
            ``puzzle_indices`` does not start at 0 and cover exactly
            ``inputs.shape[0]`` examples.
        IndexError: Propagated from NumPy if ``puzzle_identifiers`` has fewer
            entries than there are puzzles.
    """
    num_examples = inputs.shape[0]
    if labels.shape[0] != num_examples:
        raise ValueError(
            f"inputs has {num_examples} examples but labels has {labels.shape[0]}"
        )

    # How many examples per puzzle
    examples_per_puzzle = np.diff(puzzle_indices)  # [N_puzzles]
    num_puzzles = examples_per_puzzle.shape[0]

    # Validate the offsets before writing anything: inconsistent offsets would
    # otherwise either attach wrong identifiers to examples or fail with an
    # IndexError part-way through a very long run, leaving a truncated file.
    starts_at_zero = puzzle_indices.shape[0] == 0 or int(puzzle_indices[0]) == 0
    covered_examples = int(examples_per_puzzle.sum())
    if not starts_at_zero or covered_examples != num_examples:
        raise ValueError(
            "puzzle_indices must start at 0 and end at the number of examples "
            f"({num_examples}); it covers {covered_examples} examples"
        )

    # Map each example index -> puzzle index
    puzzle_idx_for_example = np.repeat(
        np.arange(num_puzzles, dtype=np.int32),
        examples_per_puzzle,
    )  # [N_examples]

    # Then map puzzle index -> puzzle_identifier
    example_puzzle_identifiers = puzzle_identifiers[puzzle_idx_for_example]  # [N_examples]

    # os.makedirs("") raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as file:
        for i in tqdm(range(num_examples)):
            out_puzzle = {
                "x": inputs[i].tolist(),
                "y": labels[i].tolist(),
                "aug_puzzle_index": int(example_puzzle_identifiers[i]),
                # placeholder to make work with dataloader
                "colour_aug": [0],
                "d8_aug": 0,
                "example_idx": 0,
            }
            # json.dumps + a single write produces the same text as
            # json.dump(obj, file) but avoids its chunk-by-chunk encoding,
            # which matters when writing millions of records.
            file.write(json.dumps(out_puzzle))
            file.write("\n")


In [5]:
# Export every example to JSONL. The recorded run below processed ~5.2M
# examples in roughly 35 minutes, so avoid re-running this cell casually.
build_example_dicts(
    inputs=x,
    labels=y,
    puzzle_indices=x_indices,
    puzzle_identifiers=x_ids,
)

100%|██████████| 5163277/5163277 [34:49<00:00, 2471.16it/s]
