# Annotation script to annotate collected rollouts

In [1]:
import sys
import os
import json
import yaml
import pathlib
import glob
import pandas as pd
import openai
from pathlib import Path

In [2]:
# CONFIGURE PARAMETERS START --------------
# logs_id names the folder under logs/ that the rollouts are read from;
# experiment_id names the folder under experiments/ that the outputs are written to.
logs_id = 'demo_test_v3'
experiment_id = 'bt_7_p1_train_pair_test_v3' # Replace with your actual experiment ID
FINETUNE_MODEL = "gpt-3.5-turbo-1106"  # base model passed to the fine-tuning job
SUFFIX = f"bt_7_p1_v3"  # suffix passed to the fine-tuning job
FLAG_FILTER = True # if True, drop episodes containing failed steps (or skipped steps other than "mission complete"); if False, keep all demonstrations as is
STARTING_EPISODE = 25  # number of leading (sorted) log files skipped in every scene
NUM_EPISODES_PER_SCENE = 22  # upper bound on the number of kept episodes per scene
# CONFIGURE PARAMETERS END --------------

# NOTE: "__file__" is a string literal here, not the module attribute, so it is resolved
# against the current working directory. ROOT_PATH therefore ends up being the parent of
# the working directory the notebook is run from.
ROOT_PATH = pathlib.Path("__file__").resolve().parent.parent
EXP_FOLDER = os.path.join(ROOT_PATH, "experiments")

OUTPUT_PATH = os.path.join(EXP_FOLDER, experiment_id)
LOGS_FOLDER = os.path.join(ROOT_PATH, "logs")
CONFIGS_FOLDER = os.path.join(ROOT_PATH, "cos_eor", "configs", "local")
ENVS_FILE_PATH = os.path.join(CONFIGS_FOLDER , "envs_demo.yaml")

# The OpenAI credentials are read from <CONFIGS_FOLDER>/api_key.yaml.
# That file must provide a 'key' entry and may provide an 'organization' entry.
with open(os.path.join(CONFIGS_FOLDER, "api_key.yaml")) as kfile:
    k = yaml.safe_load(kfile)
openai.api_key = k['key'] # the API key comes from the 'key' entry of api_key.yaml
if 'organization' in k:
    openai.organization = k['organization']

# Names of the files written into OUTPUT_PATH.
TRAIN_FILE_NAME_OUTPUT = "train.jsonl"
VALID_FILE_NAME_OUTPUT = "valid.jsonl"
TEST_FILE_NAME_OUTPUT = "test.jsonl"
META_FILE_NAME = "info.yaml"

# Constants: dictionary keys and flag values used in the log records and annotations.
ANNOTATION = "annotation"
EPISODE = "episode"
DIFF_CORRECT_LOC = "diff_correct_loc"
EXPERIMENT = "experiment"
FLAG = "flag"
FINETUNE_MSG = "finetune_message"
NUM_OBJECTS_DISCOVERED = "num_objects_discovered"
NUM_RECS_DISCOVERED = "num_recs_discovered"
MISSION_COMPLETE = "mission complete"
OUTCOME = "outcome"
PROMPT = "prompt"
REWARD = "reward"
# Weight of each annotation field in the scalar reward computed by reward().
REWARD_WEIGHTS = {NUM_OBJECTS_DISCOVERED: 1, NUM_RECS_DISCOVERED: 1, DIFF_CORRECT_LOC: 10}
SCENE = "scene"
SUC = "succeeded"
FAIL = "failed"
SKIP = "skipped"
SUC_STEPS = "successful_steps"

# Read the scene IDs from the envs file. The loaded YAML value is split on whitespace,
# so the file is expected to parse to a single string of whitespace-separated scene IDs.
with open(ENVS_FILE_PATH , 'r') as file:
    scenes = yaml.safe_load(file).split()

In [3]:
# Parse a YAML file and hand back whatever object it contains
def load_scenes(file_path):
    """Return the object obtained by safe-loading the YAML file at ``file_path``.

    No post-processing is applied to the loaded value.
    """
    with open(file_path, 'r') as handle:
        parsed = yaml.safe_load(handle)
    return parsed

# Collect the rollout log files of one scene, skipping the first STARTING_EPISODE files
def get_experiment_log_paths(experiment_id, scene_id):
    """Return the sorted ``data_*.json`` paths of a scene, minus the leading ones.

    Files are matched under ``LOGS_FOLDER/<experiment_id>/demo/<scene_id>/``,
    sorted in ascending string order, and the first ``STARTING_EPISODE``
    entries are dropped. The resulting list is also printed.
    """
    pattern = f'{LOGS_FOLDER}/{experiment_id}/demo/{scene_id}/data_*.json'
    matches = glob.glob(pattern)
    matches.sort()
    all_paths = matches[STARTING_EPISODE:]
    print ('all_paths', all_paths)
    return all_paths

def reward(result):
    """Return the weighted sum of the fields of ``result`` named in REWARD_WEIGHTS.

    ``result`` must contain every key of ``REWARD_WEIGHTS``; a missing key
    raises ``KeyError``.
    """
    total = 0
    for field, weight in REWARD_WEIGHTS.items():
        total += result[field] * weight
    return total

def compile_steps(steps):
    """Join ``steps`` into the assistant message used for fine-tuning.

    Each step after the first is rendered on its own line as
    ``"step <n>: <text>"`` with ``n`` counting from 2. The first step is
    rendered without its ``"step 1: "`` label, because the user prompt built
    in ``annotate_record`` already ends with that label.

    Only the leading label is omitted: step text that itself happens to
    contain ``"step 1: "`` is kept verbatim (a blanket ``str.replace`` would
    silently strip it from the content as well).

    Args:
        steps: iterable of step descriptions.

    Returns:
        The newline-joined message; an empty string when ``steps`` is empty.
    """
    lines = [
        f"{step}" if index == 1 else f"step {index}: {step}"
        for index, step in enumerate(steps, start=1)
    ]
    return "\n".join(lines)

def prepare_example_conversation(system_msg, user_msg, assistant_message):
    """Wrap three texts into one chat-format example.

    Returns a dict with a single ``"messages"`` key holding the system, user
    and assistant messages, in that order, each as a ``role``/``content`` dict.
    """
    turns = (
        ("system", system_msg),
        ("user", user_msg),
        ("assistant", assistant_message),
    )
    return {"messages": [{"role": role, "content": text} for role, text in turns]}

def annotate_record(record, experiment_id, scene_id, episode_id):
    """Build the annotation dict for one logged record.

    The annotation holds the scene/experiment/episode identifiers, the
    number of discovered objects and receptacles, the change in the count
    of correctly placed objects over this record, the raw text of all steps
    and of the steps flagged as succeeded, a chat-format fine-tuning example
    and a scalar reward.

    The fine-tuning example uses ALL steps (not only the succeeded ones) as
    the assistant message, and the user prompt is extended with a trailing
    "step 1: " label for the assistant message to continue from.
    """
    outcome = record[OUTCOME]
    annotation = {
        SCENE: scene_id,
        EXPERIMENT: experiment_id,
        EPISODE: episode_id,
        NUM_OBJECTS_DISCOVERED: len(outcome["objects_discovered"]),
        NUM_RECS_DISCOVERED: len(outcome["recs_discovered"]),
        DIFF_CORRECT_LOC: outcome["count_correct"]["end"] - outcome["count_correct"]["start"],
    }

    # Split the raw step texts into "everything" and "succeeded only".
    every_step = []
    succeeded_steps = []
    for entry in record["logs"]:
        raw_step = entry['step_raw']
        every_step.append(raw_step)
        if entry[FLAG] == SUC:
            succeeded_steps.append(raw_step)
    annotation["all_steps"] = every_step
    annotation[SUC_STEPS] = succeeded_steps

    prompt = record["low_level"]["prompt"]
    annotation[FINETUNE_MSG] = prepare_example_conversation(
        system_msg=prompt["system"],
        user_msg=prompt["user"] + "\nstep 1: ",
        assistant_message=compile_steps(every_step),
    )
    annotation[REWARD] = reward(annotation)
    return annotation

def annotate_episode(episode, experiment_id, scene_id):
    """Placeholder for episode-level annotation; does nothing and returns None."""
    return None


# Read every scene's rollout logs, annotate each record and select the episodes to keep
def load_and_annotate_logs(experiment_id, scenes):
    """Load, annotate and filter the rollout logs of all ``scenes``.

    Each log file is treated as one episode (a list of records). Every
    record gets an ``ANNOTATION`` entry from ``annotate_record``. Episode ids
    are the file's position in the sorted listing, offset by
    ``STARTING_EPISODE``. Reading a scene stops once
    ``NUM_EPISODES_PER_SCENE`` episodes have been kept for it.

    When ``FLAG_FILTER`` is on, an episode is rejected if any of its log
    entries is flagged as failed, or is flagged as skipped with a step text
    other than ``MISSION_COMPLETE``.

    Returns:
        A tuple ``(all_records, all_episodes, suc_episode_ids)``:
        the flat list of records from every file that was read (rejected
        episodes included), the list of kept episodes, and a dict mapping
        each scene id to the ids of its kept episodes.
    """
    def is_problem_entry(entry):
        # Failed steps always count; skipped steps count unless they are the
        # "mission complete" step.
        if entry[FLAG] == FAIL:
            return True
        return entry[FLAG] == SKIP and entry['step_raw'] != MISSION_COMPLETE

    flat_records = []
    kept_episodes = []
    kept_ids_by_scene = {}
    for scene_id in scenes:
        kept_ids = []
        kept_ids_by_scene[scene_id] = kept_ids
        scene_paths = get_experiment_log_paths(experiment_id, scene_id)
        for episode_id, path in enumerate(scene_paths, start=STARTING_EPISODE):
            if len(kept_ids) >= NUM_EPISODES_PER_SCENE:
                break
            with open(path, 'r') as handle:
                episode_records = json.load(handle)
            for rec in episode_records:
                rec[ANNOTATION] = annotate_record(rec, experiment_id, scene_id, episode_id)
            # Records are collected before filtering, so rejected episodes
            # still contribute to the flat list.
            flat_records.extend(episode_records)
            rejected = FLAG_FILTER and any(
                is_problem_entry(entry)
                for rec in episode_records
                for entry in rec['logs']
            )
            if rejected:
                print ('failed log')
                continue
            kept_ids.append(episode_id)
            kept_episodes.append(episode_records)
    return flat_records, kept_episodes, kept_ids_by_scene

# Load and annotate logs. Note that `logs_id` (not `experiment_id`) is passed as the
# experiment_id argument, so the rollouts are read from the logs folder named by `logs_id`.
# annotated_logs: flat list of every record read; annotated_episodes: kept episodes only;
# suc_episode_ids: scene id -> ids of the kept episodes.
annotated_logs, annotated_episodes, suc_episode_ids = load_and_annotate_logs(logs_id, scenes)
print ('successful episodes', suc_episode_ids)


all_paths ['/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-41-05.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-41-12.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-41-22.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-41-31.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-41-42.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-41-49.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-41-56.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-42-10.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-42-17.json', '/workspace/codellmpersonalize/logs/demo_test_v3/demo/pomaria_1_int/data_2026-01-06_08-42

In [4]:
from collections import Counter

# Records whose own outcome increased the number of correctly placed objects.
positive_records = [rec for rec in annotated_logs if rec[ANNOTATION][DIFF_CORRECT_LOC] > 0]
# NOTE(review): this only tests that the annotation is truthy; annotate_record always
# returns a populated dict, so this looks like it keeps every record - confirm the intent.
pick_records = [rec for rec in annotated_logs if rec[ANNOTATION]]

# Number of positive records per scene (displayed as the cell result).
Counter(rec[ANNOTATION][SCENE] for rec in positive_records)

Counter({'pomaria_1_int': 96})

In [5]:
# Positions, as (episode index, record index) pairs within annotated_episodes, of every
# record whose annotation reports a positive change in correctly placed objects.
correct_episode_steps = [
    (ie, ir)
    for ie, episode in enumerate(annotated_episodes)
    for ir, record in enumerate(episode)
    if record[ANNOTATION][DIFF_CORRECT_LOC] > 0
]

In [6]:
from pprint import pprint
# Report how many records were collected in correct_episode_steps, then list them
# as (episode index, record index) pairs.
print ('Number of correct step episodes', len(correct_episode_steps))
print (correct_episode_steps)

Number of correct step episodes 70
[(0, 1), (0, 2), (0, 3), (1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (2, 3), (2, 4), (3, 1), (3, 2), (3, 3), (4, 1), (4, 2), (4, 3), (5, 1), (6, 1), (6, 2), (7, 1), (7, 2), (7, 3), (7, 4), (8, 1), (8, 2), (8, 3), (9, 1), (9, 2), (9, 3), (10, 1), (10, 2), (11, 1), (11, 2), (12, 1), (12, 2), (12, 3), (12, 4), (13, 1), (13, 2), (13, 3), (14, 1), (14, 2), (14, 3), (15, 1), (15, 2), (16, 1), (16, 2), (16, 3), (16, 4), (16, 5), (17, 1), (17, 2), (17, 3), (17, 4), (18, 1), (18, 2), (18, 3), (18, 4), (18, 5), (19, 1), (19, 2), (19, 3), (19, 4), (20, 1), (20, 2), (20, 3), (20, 4), (21, 1), (21, 2)]


## Demonstration stats

In [7]:
# Per-scene statistics over the kept episodes (one list entry per episode):
#   object_stats            - object names taken from the first log entry's start mapping
#   msgs_counts             - number of records (messages) in the episode
#   correct_counts          - change in correctly placed objects over the whole episode
#   gt_correct_counts       - the same change if every object still wrong at the end had
#                             also been placed correctly
#   mission_complete_counts - 1 if the last logged step is "mission complete", else 0
from collections import defaultdict

object_stats = defaultdict(list)
msgs_counts = defaultdict(list)
gt_correct_counts = defaultdict(list)
mission_complete_counts = defaultdict(list)
correct_counts = defaultdict(list)

for episode in annotated_episodes:
    first_record = episode[0]
    last_record = episode[-1]
    scene = first_record[ANNOTATION][SCENE]

    start_mapping = first_record['logs'][0]['current_mapping']['start']
    objects = list(start_mapping.keys())
    msgs_counts[scene].append(len(episode))
    object_stats[scene].append(objects)

    final_outcome = last_record[OUTCOME]
    end_correct = final_outcome["count_correct"]['end']
    start_correct = first_record[OUTCOME]['count_correct']['start']
    diff = end_correct - start_correct
    gt_diff = end_correct + final_outcome['count_wrong']['end'] - start_correct
    gt_correct_counts[scene].append(gt_diff)
    correct_counts[scene].append(diff)

    ended_with_mission_complete = last_record['logs'][-1]['step_raw'] == MISSION_COMPLETE
    mission_complete_counts[scene].append(int(ended_with_mission_complete))

In [8]:
# Object counts: distinct objects per scene, mean objects per episode, and the
# number of distinct objects over all scenes.
all_unique_objects = set()
for scene, episode_object_lists in object_stats.items():
    print (scene)
    scene_objects = set().union(*episode_object_lists)
    all_unique_objects |= scene_objects
    sizes = [len(object_list) for object_list in episode_object_lists]
    print (len(scene_objects), 'unique objects', sum(sizes)/len(sizes), 'objects per episode on average of ', len(sizes), 'episodes')
print ('all unique object count', len(all_unique_objects))

pomaria_1_int
79 unique objects 8.545454545454545 objects per episode on average of  22 episodes
all unique object count 79


In [9]:
# Message counts: mean number of messages per episode for each scene, plus the grand total.
total_msgs = 0
for scene, per_episode_counts in msgs_counts.items():
    scene_total = sum(per_episode_counts)
    print (scene)
    total_msgs += scene_total
    print ('average', scene_total/len(per_episode_counts), 'messages per episode')
print ('In total', total_msgs, 'messages.')

pomaria_1_int
average 5.181818181818182 messages per episode
In total 114 messages.


In [10]:
# Per scene: mean actual and ground-truth improvement per episode, their ratio as a
# success rate, and the share of episodes that ended with the "mission complete" step.
# The ratio divides by the summed ground-truth improvement, which must be non-zero.
for scene in correct_counts:
    actual = correct_counts[scene]
    print (scene)
    print ('[Actual] ~', sum(actual)/len(actual), 'diff correct objects per episode')
    ground_truth = gt_correct_counts[scene]
    print ('[Ground Truth] ~', sum(ground_truth)/len(ground_truth), 'diff correct objects per episode')
    success_pct = sum(actual)/sum(ground_truth) * 100
    print ('[Ratio] success rate', f'{success_pct}%' )
    completion_flags = mission_complete_counts[scene]
    completed = sum(completion_flags)
    episode_total = len(completion_flags)
    completed_pct = completed/episode_total * 100
    print (f'[{MISSION_COMPLETE}] {completed} out of {episode_total} | rate = {completed_pct}%')


pomaria_1_int
[Actual] ~ 3.1818181818181817 diff correct objects per episode
[Ground Truth] ~ 3.909090909090909 diff correct objects per episode
[Ratio] success rate 81.3953488372093%
[mission complete] 22 out of 22 | rate = 100.0%


In [11]:
# Unconditional stop: a top-to-bottom run halts here, before any output file is written.
# Presumably the cells below are meant to be executed manually afterwards.
raise Exception('move forward to write messages to jsonl')

Exception: move forward to write messages to jsonl

In [11]:
# Split the kept episodes into training and validation examples, scene by scene.
# The first NUM_TRAIN_EPISODES episodes of a scene go to training; every later episode
# of that scene goes to validation (NUM_VALID_EPISODES is not used as a cap here).
NUM_TRAIN_EPISODES = 20
NUM_VALID_EPISODES = 2

training_data = []
validation_data = []
episode_counter = {}
for episode in annotated_episodes:
    scene = episode[0][ANNOTATION][SCENE]
    episode_counter[scene] = episode_counter.get(scene, 0) + 1
    past_training_quota = episode_counter[scene] > NUM_TRAIN_EPISODES
    destination = validation_data if past_training_quota else training_data
    destination.extend(record[ANNOTATION][FINETUNE_MSG] for record in episode)


In [12]:
# Number of fine-tuning examples in the training split.
print (len(training_data))

104


In [13]:
# Number of fine-tuning examples in the validation split (displayed as the cell result).
len(validation_data)

10

In [14]:
def write_jsonl(data_list: list, filename: str) -> None:
    """Write ``data_list`` to ``filename`` as JSON Lines.

    The file is overwritten. Each item is serialised with ``json.dumps`` and
    written on its own line, terminated by a newline.
    """
    with open(filename, "w") as sink:
        for item in data_list:
            sink.write(f"{json.dumps(item)}\n")
            
def write_metadata_file(filepath):
    """Dump the run settings and split sizes to ``filepath`` as block-style YAML.

    All values are read from module-level names (configuration, the kept
    episode ids per scene, and the training/validation lists). The same
    dict is also printed.
    """
    with open(filepath, 'w') as out:
        metadata = {
            'logs_id': logs_id,
            'scenes': scenes,
            'starting_episode': STARTING_EPISODE,
            'num_episodes_per_scene': NUM_EPISODES_PER_SCENE,
            'chosen_episodes_per_scene': suc_episode_ids,
            'num_train_episodes': NUM_TRAIN_EPISODES,
            'num_valid_episodes': NUM_VALID_EPISODES,
            'num_train_samples': len(training_data),
            'num_valid_samples': len(validation_data),
        }
        yaml.dump(metadata, out, default_flow_style=False)
        print (metadata)

In [15]:
# Write the training split into OUTPUT_PATH.
training_file_name = os.path.join(OUTPUT_PATH, TRAIN_FILE_NAME_OUTPUT)
# Create the parent directory if it doesn't exist
Path(training_file_name).parent.mkdir(parents=True, exist_ok=True)
write_jsonl(training_data, training_file_name)

# validation_file_name is only defined when a validation split is requested;
# create_finetune_upload_response reads it as a global under the same condition.
if NUM_VALID_EPISODES > 0:
    validation_file_name = os.path.join(OUTPUT_PATH, VALID_FILE_NAME_OUTPUT)
    write_jsonl(validation_data, validation_file_name)

# Record the settings and sample counts next to the data files.
write_metadata_file(os.path.join(OUTPUT_PATH, META_FILE_NAME))


{'logs_id': 'demo_test_v3', 'scenes': ['pomaria_1_int'], 'starting_episode': 25, 'num_episodes_per_scene': 22, 'chosen_episodes_per_scene': {'pomaria_1_int': [26, 27, 28, 30, 31, 34, 35, 36, 37, 39, 40, 41, 42, 45, 47, 48, 49, 50, 52, 53, 54, 56]}, 'num_train_episodes': 20, 'num_valid_episodes': 2, 'num_train_samples': 104, 'num_valid_samples': 10}


In [17]:
# Unconditional stop: a top-to-bottom run halts here, before anything is uploaded or a
# fine-tuning job is created. Presumably the cells below are meant to be run manually.
raise Exception('move forward to create finetune jobs')

Exception: move forward to create finetune jobs

In [16]:
def create_finetune_upload_response():
    """Upload the training file (and the validation file, if any) for fine-tuning.

    Reads the module-level ``training_file_name`` and, when
    ``NUM_VALID_EPISODES > 0``, ``validation_file_name``. Each file is opened
    in a ``with`` block so its handle is closed once the upload call returns,
    instead of being left open.

    Returns:
        A dict with the upload response under ``"training_response"`` and,
        when a validation file was uploaded, ``"validation_response"``.
    """
    with open(training_file_name, "rb") as training_file:
        training_response = openai.File.create(file=training_file, purpose="fine-tune")
    print("Training file ID:", training_response["id"])
    uploads = {"training_response": training_response}

    if NUM_VALID_EPISODES > 0:
        with open(validation_file_name, "rb") as validation_file:
            validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
        print("Validation file ID:", validation_response["id"])
        uploads["validation_response"] = validation_response
    return uploads


def create_finetune_response_and_log(training_file_id, validation_file_id=None):
    """Create a one-epoch fine-tuning job and record its identifiers on disk.

    The job is created for ``FINETUNE_MODEL`` with ``SUFFIX``; the validation
    file is passed only when ``validation_file_id`` is not ``None``. A short
    summary (file ids, job id, model, suffix) is written to
    ``OUTPUT_PATH/job_info.txt``, and the job status and id are printed.

    Returns:
        The response of the job-creation call.
    """
    has_validation = validation_file_id is not None
    job_kwargs = {
        "training_file": training_file_id,
        "model": FINETUNE_MODEL,
        "suffix": SUFFIX,
        "hyperparameters": {"n_epochs": 1},
    }
    if has_validation:
        job_kwargs["validation_file"] = validation_file_id
    response = openai.FineTuningJob.create(**job_kwargs)
    job_id = response["id"]

    summary = [f"training file id: {training_file_id}"]
    if has_validation:
        summary.append(f"validation file id: {validation_file_id}")
    summary.append(f"finetune job id: {job_id}")
    summary.append(f"finetune model: {FINETUNE_MODEL}")
    summary.append(f"finetune suffix: {SUFFIX}")
    with open(os.path.join(OUTPUT_PATH, "job_info.txt"), 'w') as out:
        # Lines are newline-separated with no trailing newline after the last one.
        out.write("\n".join(summary))
    print("Status:", response["status"])
    print("Job ID:", response["id"])
    return response

In [17]:
# Upload the data files, then start the fine-tuning job with the returned file ids.
upload_result = create_finetune_upload_response()
training_response = upload_result['training_response']
# Stays None when no validation split is requested, so the job is created without one.
validation_file_id = None
training_file_id = training_response['id']
if NUM_VALID_EPISODES > 0:
    validation_response = upload_result['validation_response']
    validation_file_id = validation_response['id']
finetune_response = create_finetune_response_and_log(training_file_id, validation_file_id)
# Kept for polling the job status in the next cell.
job_id = finetune_response['id']

Training file ID: file-SehoY1bk5REz4sypdMHFAv
Validation file ID: file-D34BQdwdpdryRA5tDWK1nw
Status: validating_files
Job ID: ftjob-GrqKUBulEtCOE0aAJvwbiHO3


In [18]:
# Fetch the fine-tuning job once and print its current state; re-run to poll again.
response = openai.FineTuningJob.retrieve(job_id)

print("Job ID:", response["id"])
print("Status:", response["status"])
# In the captured run this printed None while the status was still "validating_files".
print("Trained Tokens:", response["trained_tokens"])

Job ID: ftjob-GrqKUBulEtCOE0aAJvwbiHO3
Status: validating_files
Trained Tokens: None
