# Prepare BARC data for training

## Imports

In [1]:
import json
import numpy as np
import pandas as pd
import random
from tqdm.auto import tqdm

from arc25.plot import plot_task
from arc25.utils import write_json, load_json

In [None]:
# Load the BARC generated-problems file into a DataFrame; lines=True makes
# pandas parse the file as JSON Lines (one JSON record per line).
filepath = '/mnt/hdd0/Kaggle/arc25/data/200k_HEAVY_gpt4o-description-gpt4omini-code_generated_problems/data_100k.jsonl'
df = pd.read_json(filepath, lines=True)
# Preview the first rows to inspect the loaded columns.
df.head()

In [None]:
def sample_n_train(max_n=None):
    """Randomly pick how many train examples a task should have.

    The pick follows a fixed histogram of train-set sizes (described in the
    original code as the distribution of the original datasets).

    Args:
        max_n: Optional inclusive upper bound; sizes larger than this are
            removed from the histogram before drawing.

    Returns:
        One of the train-set sizes that remain in the histogram.
    """
    # train-set size -> number of tasks observed with that size
    histogram = {2: 355, 3: 1281, 4: 433, 5: 123, 6: 43, 7: 17, 8: 6, 10: 2}
    if max_n is not None:
        histogram = {size: freq for size, freq in histogram.items() if size <= max_n}
    candidates = list(histogram)
    norm = sum(histogram.values())
    # Normalise the counts so that the weights sum to one.
    probs = [freq / norm for freq in histogram.values()]
    return random.choices(candidates, weights=probs, k=1)[0]

sample_n_train()

In [None]:
def create_task_from_examples(examples, max_attempts=10):
    """Build one ARC-style task by randomly sampling from ``examples``.

    Each attempt draws a train-set size with ``sample_n_train`` (capped so
    that one example is left over for the test split), samples that many
    examples plus one test example without replacement, and accepts the draw
    if the total number of grid cells fits the size budget.

    Args:
        examples: Sequence of ``(input_grid, output_grid)`` pairs. Each grid
            is indexed as a non-empty list of rows.
        max_attempts: Maximum number of random draws to try.

    Returns:
        A dict ``{'train': [...], 'test': [...]}`` whose items are
        ``{'input': grid, 'output': grid}`` dicts, or ``None`` when there are
        too few examples to form a task or no draw fit the size budget within
        ``max_attempts`` attempts.
    """
    n_test = 1
    min_train = 2  # smallest train-set size that sample_n_train can return
    if len(examples) < min_train + n_test:
        # Without this guard sample_n_train would be left with an empty
        # candidate table and fail with an opaque IndexError.
        return None

    # Budget: the cell count of five 30x30 grids, summed over the input and
    # output grids of every sampled example (train and test together).
    max_cells = 30 * 30 * 5

    for _ in range(max_attempts):
        # Capping at len(examples) - n_test guarantees the draw always fits.
        n_train = sample_n_train(len(examples) - n_test)
        samples = random.sample(examples, n_train + n_test)
        cells = sum(
            len(ex[0]) * len(ex[0][0]) + len(ex[1]) * len(ex[1][0])
            for ex in samples
        )
        if cells > max_cells:
            continue
        train_examples = samples[:n_train]
        test_examples = samples[n_train:]
        return {
            'train': [{'input': ex[0], 'output': ex[1]} for ex in train_examples],
            'test': [{'input': ex[0], 'output': ex[1]} for ex in test_examples],
        }
    return None

In [None]:
# Turn every dataframe row that has enough examples into one task, keyed by
# a zero-padded id derived from the row position.
dataset = {}
progress = tqdm(enumerate(df.examples.values), total=len(df))
for idx, examples in progress:
    # A task needs at least 2 train examples plus 1 test example.
    if len(examples) >= 3:
        task = create_task_from_examples(examples)
        # None means no sampled task fit the size budget; drop the row.
        if task is not None:
            dataset[f'barc_{idx:06d}'] = task
len(dataset), len(df)

In [None]:
# Save the generated tasks (task id -> task dict) in the same directory as the
# source data. NOTE(review): the .json.gz suffix suggests write_json
# compresses the output — confirm in arc25.utils.
write_json(dataset, '/mnt/hdd0/Kaggle/arc25/data/200k_HEAVY_gpt4o-description-gpt4omini-code_generated_problems/dataset_100k.json.gz')

In [2]:
# Reload the previously saved dataset from disk, replacing any in-memory copy.
dataset = load_json('/mnt/hdd0/Kaggle/arc25/data/200k_HEAVY_gpt4o-description-gpt4omini-code_generated_problems/dataset_100k.json.gz')