In [1]:
from huggingface_hub import hf_hub_download
import pandas as pd
from datasets import Dataset, DatasetDict

dataset_name = "concept500"

# Download both splits
train_path = hf_hub_download(
    repo_id=f"pyvene/axbench-{dataset_name}",
    filename="2b/l10/train/data.parquet",
    repo_type="dataset",
)

test_path = hf_hub_download(
    repo_id=f"pyvene/axbench-{dataset_name}",
    filename="2b/l10/test/data.parquet",
    repo_type="dataset",
)

# Read and convert to HF dataset
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(pd.read_parquet(train_path)),
        "test": Dataset.from_pandas(pd.read_parquet(test_path)),
    }
)

In [2]:
from datasets import load_dataset

dataset = load_dataset(
    "parquet",
    data_files="/workspace/HyperDAS/axbench/axbench/concept16k/prod_2b_l20_v1/generate/train_data.parquet",
)


Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'output_concept', 'concept_genre', 'category', 'dataset_category', 'concept_id'],
        num_rows: 36216
    })
})

In [3]:
dataset["train"][0]

{'input': 'Why do some people select private schools over public schools?',
 'output': 'There are many reasons why some people choose private schools over public schools. Here are some of the most common:\n\n**Academic Focus and Curriculum:**\n\n* **Smaller class sizes:** Private schools often have smaller class sizes, allowing for more individualized attention from teachers.\n* **Specialized programs:** Some private schools offer specialized programs in areas like STEM, arts, or foreign languages, which may not be available in public schools.\n* **Rigorous academics:** Private schools often have a more rigorous academic curriculum, preparing students for college and advanced studies.\n* **Emphasis on specific values:** Some private schools emphasize specific values, such as religious faith, classical education, or a particular philosophy of learning.\n\n**Extracurricular Activities and Resources:**\n\n* **Extensive extracurricular activities:** Private schools often offer a wider rang

In [7]:
train_path

'/root/.cache/huggingface/hub/datasets--pyvene--axbench-concept500/snapshots/ad8a5d60c4616b599c24dd6689f05f696ec610f3/2b/l10/train/data.parquet'

In [5]:
df = pd.DataFrame(dataset["train"])

In [14]:
groups = df.groupby("concept_id")

for x, g in groups:
    print(type(x), type(g))

<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'> <class 'pandas.core.frame.DataFrame'>
<class 'int'

In [6]:
df["concept_id"].unique()

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
       168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 17

In [None]:
dataset.save_to_disk(f"./data/{dataset_name}_2b_l10")

Saving the dataset (0/1 shards):   0%|          | 0/36216 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/37958 [00:00<?, ? examples/s]

In [6]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset("pyvene/axbench-concept16k_v2", split="train")

In [7]:
ds

Dataset({
    features: ['input', 'output', 'output_concept', 'concept_genre', 'category', 'dataset_category', 'concept_id'],
    num_rows: 3456648
})

In [12]:
# Get counts of concept_id = -1 vs other concept_ids
concept_ids = pd.Series(ds["concept_id"])
no_concept_count = len(concept_ids[concept_ids == -1])
with_concept_count = len(concept_ids[concept_ids >= 0])

print(f"\nNumber of samples with concept_id = -1: {no_concept_count}")
print(f"Number of samples with concept_id >= 0: {with_concept_count}")



Number of samples with concept_id = -1: 648
Number of samples with concept_id >= 0: 3456000
