In [1]:
import os

current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")
os.chdir("/workspace")
print(f"Working directory changed to: {os.getcwd()}")
from datasets import load_dataset, load_from_disk
import verifiers as vf
from verifiers.envs.bfcl_inthinking_env import preprocess_bfcl_dataset, BFCL_INTHINKING_USER_PROMPT
import json
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

Current working directory: /workspace/notebooks
Working directory changed to: /workspace
[2025-04-19 19:21:50,460] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
testds=preprocess_bfcl_dataset(curriculum_learning=False, split="train")
testds

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'question', 'initial_config', 'path', 'involved_classes', 'num_turns', 'answer', 'prompt', 'user_question_bank', 'ground_truth_bank'],
    num_rows: 100
})

In [4]:
import copy
import gc
import importlib
import inspect
import json
import time
from typing import Any, Callable, Dict, List, Sequence, Tuple, Union

import numpy as np
import torch # Added for tokenization
from datasets import (
    Dataset,
    DatasetDict,  # type: ignore
)
from huanzhi_utils import load_file
from loguru import logger
from sklearn.model_selection import train_test_split
from trl.trainer.grpo_trainer import RewardFunc
from transformers import PreTrainedTokenizerBase # Added for tokenization

from verifiers.envs.multistep_env import MultiStepEnv
from verifiers.envs.tool_env import infer_schema_from_function
from verifiers.parsers import XMLParser
from verifiers.rubrics.bfcl_inthinking_rubric import BfclITRubric
from verifiers.tools.bfcl_tools import (
    INVOLVED_CLASS_TO_FUNC_DOC_PATH,
    construct_tools_from_involved_classes,
)

from verifiers.imports import LLM, SamplingParams  # type: ignore

In [6]:
def format_bfcl_prompt(
    involved_classes: List[str] | None = None,
    user_question: str | None = None,
) -> List[Dict[str, str]]:
    messages = []
    tools = construct_tools_from_involved_classes(involved_classes)
    # Combine instructions, tools, and query into the first user message
    messages.append(
        {
            "role": "user",
            "content": BFCL_INTHINKING_USER_PROMPT.format(
                tools=tools, user_query=user_question
            ),
        }
    )
    return messages


In [8]:
curriculum_learning=False
multi_turn_base_data = load_file(
    "verifiers/berkeley-function-call-leaderboard/data/BFCL_v3_multi_turn_base.json"
)
multi_turn_base_answer = load_file(
    "verifiers/berkeley-function-call-leaderboard/data/possible_answer/BFCL_v3_multi_turn_base.json"
)

# Reprocess the columns into serializable format and add num_turns
processed_data = []
for i in range(len(multi_turn_base_data)):
    entry_data = multi_turn_base_data[i]
    entry_answer = multi_turn_base_answer[i]

    # --- Handle Multi-Turn Tasks ---
    # If a task has multiple user turns, create separate data points for each turn.
    # Each data point will contain only the *first* user question for that turn's context.
    # The ground truth will correspond to the actions needed for *that specific turn*.
    questions = entry_data["question"]
    ground_truths = entry_answer["ground_truth"]
    initial_config = entry_data["initial_config"]
    involved_classes = entry_data["involved_classes"]
    entry_id = entry_data["id"]

    assert len(questions) == len(ground_truths), (
        f"Mismatch in number of turns for entry {entry_id}"
    )

    # Create one entry per original turn
    for turn_idx in range(len(questions)):
        # The prompt only contains the *first* user question of the *current* turn
        current_user_question = questions[turn_idx][0]["content"]
        # The ground truth is only for the *current* turn
        current_ground_truth = [ground_truths[turn_idx]] # Wrap in list for consistency

        processed_entry = {
            "id": f"{entry_id}_turn_{turn_idx}", # Unique ID per turn
            "involved_classes": involved_classes,
            "initial_config": json.dumps(initial_config), # Keep original initial config
            "prompt": format_bfcl_prompt(
                involved_classes=involved_classes,
                user_question=current_user_question,
            ),
            # Ground truth now only contains the answer for the *current* turn's request
            "answer": json.dumps(current_ground_truth),
            # Store the original full answer for potential complex reward calculation later if needed
            "original_full_answer": json.dumps(ground_truths),
            "num_total_turns_in_task": len(questions), # Original number of turns in the task
            "current_turn_index": turn_idx, # Index of this turn within the original task
            # These are no longer needed as we process one turn at a time
            # "user_question_bank": "[]",
            # "ground_truth_bank": "[]",
        }
        processed_data.append(processed_entry)

        # If curriculum learning, potentially create sub-entries (though less relevant with single-turn interaction)
        # For now, curriculum learning based on num_total_turns_in_task
        if curriculum_learning:
                processed_entry["num_turns"] = processed_entry["num_total_turns_in_task"] # Use total task turns for curriculum sorting


if not curriculum_learning:
    # Add num_turns if not using curriculum learning (can just be 1 or based on original task)
        for entry in processed_data:
            entry["num_turns"] = entry["num_total_turns_in_task"] # Or set to 1 if preferred


dataset = Dataset.from_list(processed_data)

In [9]:
dataset

Dataset({
    features: ['id', 'involved_classes', 'initial_config', 'prompt', 'answer', 'original_full_answer', 'num_total_turns_in_task', 'current_turn_index', 'num_turns'],
    num_rows: 745
})

In [11]:
# Get unique IDs and split those first
unique_ids = sorted(list(set(dataset["id"])))
train_ids, test_ids = train_test_split(unique_ids, test_size=0.5, random_state=42)

# Filter dataset based on IDs
train_dataset = dataset.filter(lambda x: x["id"] in train_ids)
test_dataset = dataset.filter(lambda x: x["id"] in test_ids)

if curriculum_learning:
    # Sort both splits by num_turns while preserving randomization within same num_turns
    def sort_by_turns(split):
        df = split.to_pandas()
        # Set seed for reproducibility
        rng = np.random.RandomState(42)
        # Randomize order within same num_turns by adding small random values
        df["sort_key"] = df["num_turns"] + rng.random(len(df)) * 0.1
        df = df.sort_values("sort_key")
        df = df.drop("sort_key", axis=1)
        return Dataset.from_pandas(df)

    train_dataset = sort_by_turns(train_dataset)
    test_dataset = sort_by_turns(test_dataset)

dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# assert train_dataset and test_dataset have non-overlapping ids
assert len(set(train_dataset["id"]) & set(test_dataset["id"])) == 0, (
    "Train and test datasets have overlapping ids"
)

Filter:   0%|          | 0/745 [00:00<?, ? examples/s]

Filter:   0%|          | 0/745 [00:00<?, ? examples/s]

In [12]:
train_dataset

Dataset({
    features: ['id', 'involved_classes', 'initial_config', 'prompt', 'answer', 'original_full_answer', 'num_total_turns_in_task', 'current_turn_index', 'num_turns'],
    num_rows: 372
})

In [None]:
effective_max_turns = 1
if effective_max_turns > 0:
        logger.info(f"Filtering train dataset to max {effective_max_turns} total turns in task.")
        self.dataset = self.dataset.filter(
            lambda x: x["num_total_turns_in_task"] <= effective_max_turns
        )

In [14]:
dataset.filter(
            lambda x: x["num_total_turns_in_task"] <= 1
        )

Filter:   0%|          | 0/745 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'involved_classes', 'initial_config', 'prompt', 'answer', 'original_full_answer', 'num_total_turns_in_task', 'current_turn_index', 'num_turns'],
    num_rows: 3
})

In [13]:
test_dataset

Dataset({
    features: ['id', 'involved_classes', 'initial_config', 'prompt', 'answer', 'original_full_answer', 'num_total_turns_in_task', 'current_turn_index', 'num_turns'],
    num_rows: 373
})

In [15]:
from verifiers.utils.data_utils import preprocess_bfcl_dataset

In [17]:
test2=preprocess_bfcl_dataset(curriculum_learning=False)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

In [18]:
test2

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'initial_config', 'path', 'involved_classes', 'num_turns', 'answer', 'prompt', 'user_question_bank', 'ground_truth_bank'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'question', 'initial_config', 'path', 'involved_classes', 'num_turns', 'answer', 'prompt', 'user_question_bank', 'ground_truth_bank'],
        num_rows: 100
    })
})

In [10]:

# --- Splitting ---
# Get unique original IDs and split those first to keep turns from the same task together
unique_original_ids = sorted(list(set(d["id"].split('_turn_')[0] for d in processed_data)))
len(unique_original_ids)

1

In [None]:

train_test_split(unique_original_ids, test_size=0.5, random_state=42)


In [None]:

# Filter dataset based on original IDs
train_dataset = dataset.filter(lambda x: x["id"].split('_turn_')[0] in train_ids)
test_dataset = dataset.filter(lambda x: x["id"].split('_turn_')[0] in test_ids)

if curriculum_learning:
    # Sort both splits by num_turns (total task turns)
    def sort_by_turns(split_ds):
        df = split_ds.to_pandas()
        rng = np.random.RandomState(42)
        df["sort_key"] = df["num_turns"] + rng.random(len(df)) * 0.1
        df = df.sort_values("sort_key")
        df = df.drop("sort_key", axis=1)
        return Dataset.from_pandas(df)

    train_dataset = sort_by_turns(train_dataset)
    test_dataset = sort_by_turns(test_dataset)

dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# assert train_dataset and test_dataset have non-overlapping original ids
train_original_ids = set(x["id"].split('_turn_')[0] for x in train_dataset)
test_original_ids = set(x["id"].split('_turn_')[0] for x in test_dataset)
assert len(train_original_ids & test_original_ids) == 0, (
    "Train and test datasets have overlapping original task ids"
)
logger.info(f"Preprocessed dataset for split '{split}'. Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")