In [1]:
from IPython.display import clear_output

In [2]:
# Install the third-party packages this notebook imports further down
# (datasets, pandas, huggingface_hub). %pip targets the running kernel's environment.
%pip install datasets pandas huggingface_hub

# Discard the installer log so it does not clutter the saved notebook.
clear_output()

In [3]:
!unzip evals.zip

clear_output()

# Is the same dataset used for eval across all checkpoints? 

## Are the prompts the same across all eval steps? 

In [4]:
from datasets import load_from_disk

# Checkpoint steps whose eval dumps are compared: 100, 200, ..., 600.
steps = list(range(100, 700, 100))

In [5]:
# Collect the "prompt" column of every checkpoint's eval dataset, in `steps` order.
prompts_per_step = []

for step in steps:
    eval_dir = f"evals/step_{step}/alphabet-sort"
    step_prompts = load_from_disk(eval_dir)["prompt"]
    # Each eval dump is expected to hold exactly 2048 rows.
    assert len(step_prompts) == 2048, "There should be 2048 prompts, each"
    prompts_per_step.append(step_prompts)

In [6]:
# The eval prompts are shared when every step's prompt column equals the first step's.
reference_prompts = prompts_per_step[0]
all_equal = all(reference_prompts == step_prompts for step_prompts in prompts_per_step)
print("Same prompt across all steps?", all_equal)

Same prompt across all steps? True


## Answer: Yes

## Are the infos the same across all eval steps?

In [7]:
# Gather the "info" column from each checkpoint's saved eval dataset, ordered like `steps`.
infos_per_step = []

for step in steps:
    step_infos = load_from_disk(f"evals/step_{step}/alphabet-sort")["info"]
    # Each eval dump is expected to hold exactly 2048 rows.
    assert len(step_infos) == 2048, "There should be 2048 infos, each"
    infos_per_step.append(step_infos)

In [8]:
# The info columns agree when every step's column equals the first step's.
first_step_infos = infos_per_step[0]
all_equal = all(first_step_infos == step_infos for step_infos in infos_per_step)
print("Same info col across all steps?", all_equal)

Same info col across all steps? True


## Answer: Yes

## Are prompts + infos the same? 

In [9]:
# For every checkpoint, build one string per row: the text of the first prompt
# message followed directly by that row's "info" value.
joint_per_step = []

for step in steps:
    step_rows = load_from_disk(f"evals/step_{step}/alphabet-sort")
    joined_rows = []
    for row in step_rows:
        first_prompt_text = row["prompt"][0]["content"][0]["text"]
        joined_rows.append(first_prompt_text + row["info"])
    assert len(joined_rows) == 2048, "there should be 2048 rows"
    joint_per_step.append(joined_rows)

In [10]:
# Prompt and info are paired identically when every step's joined rows equal the first step's.
first_step_joint = joint_per_step[0]
all_equal = all(first_step_joint == step_joint for step_joint in joint_per_step)
print("Same joint row across all steps?", all_equal)

Same joint row across all steps? True


## Answer: Yes

# Save eval data to HF

In [11]:
import pandas as pd

# The export below is built from the step-100 eval dump only.
step_100_dir = "evals/step_100/alphabet-sort"
dataset = load_from_disk(step_100_dir)

In [12]:
# Flatten every row of `dataset` into two text columns and write them to data.csv:
#   prompt       - first prompt message text plus every odd-indexed completion turn
#                  (presumably the follow-up user turns -- TODO confirm the role order)
#   full_rollout - first prompt message text plus every completion turn, in order
TURN_SEPARATOR = " "

prompts, full_rollouts = [], []

for row in dataset:
    prompt = row["prompt"][0]["content"][0]["text"]
    completions = [entry["content"] for entry in row["completion"]]

    assert isinstance(prompt, str)
    # str.join only accepts strings; fail here with a clear message instead of
    # an opaque TypeError raised from inside join.
    assert all(isinstance(turn, str) for turn in completions), "every completion turn must be a string"

    # Put the prompt through the same join as the turns, so the end of the prompt
    # is not fused with the start of the first turn. With no turns to append,
    # the result is still just the prompt.
    multiturn_prompt = TURN_SEPARATOR.join([prompt, *completions[1::2]])
    entire_rollout = TURN_SEPARATOR.join([prompt, *completions])

    prompts.append(multiturn_prompt)
    full_rollouts.append(entire_rollout)

# Every row must contribute exactly one entry to each column.
assert len(prompts) == len(full_rollouts) == len(dataset)

df = pd.DataFrame({'prompt': prompts, 'full_rollout': full_rollouts})
df.to_csv('data.csv', index=False)

In [13]:
from huggingface_hub import login
# Interactive authentication step: the saved output shows this call rendering a
# widget, so the cell needs user input and cannot complete unattended.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
from huggingface_hub import HfApi, upload_file

# Dataset repository on the Hugging Face Hub that receives the exported CSV.
REPO_ID = "israel-adewuyi/eval_data_alphabet_sort"

api = HfApi()
# exist_ok=True keeps the cell re-runnable: without it create_repo raises as soon
# as the repository already exists (i.e. on every run after the first).
api.create_repo(REPO_ID, repo_type="dataset", exist_ok=True)

# Upload the CSV written by the previous cell; the returned CommitInfo is the
# cell's displayed output.
upload_file(
    path_or_fileobj="data.csv",
    path_in_repo="data.csv",
    repo_id=REPO_ID,
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/israel-adewuyi/eval_data_alphabet_sort/commit/5c1c9e61dce65d40af7e1c4eae74c3c1057536f7', commit_message='Upload data.csv with huggingface_hub', commit_description='', oid='5c1c9e61dce65d40af7e1c4eae74c3c1057536f7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/israel-adewuyi/eval_data_alphabet_sort', endpoint='https://huggingface.co', repo_type='dataset', repo_id='israel-adewuyi/eval_data_alphabet_sort'), pr_revision=None, pr_num=None)