In [None]:
import getpass
import os
import pandas as pd
import numpy as np
import json
import tqdm
import random
    
import google.generativeai as genai
import textwrap

# Take the Gemini API key from the GOOGLE_API_KEY environment variable so it
# never has to be written into the notebook; prompt for it interactively when
# the variable is unset or empty.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: "))
# Generation settings shared by every request: temperature 0.0.
gen_config = genai.types.GenerationConfig(temperature=0.0)

In [None]:
from IPython.display import display
from IPython.display import Markdown
pd.set_option('display.max_colwidth', None)  # do not truncate long cell contents when displaying frames


def to_markdown(text):
    """Render ``text`` as a Markdown block quote.

    Bullet characters are rewritten as Markdown list markers and every line,
    including blank ones, is prefixed with ``> ``.

    Args:
        text: The string to render.

    Returns:
        The result of calling ``Markdown`` on the quoted text.
    """
    # Accept the real bullet (U+2022) as well as its mojibake form, i.e. the
    # three characters produced when the UTF-8 bullet is decoded as cp1252.
    for bullet in ('\u2022', '\u00e2\u20ac\u00a2'):
        text = text.replace(bullet, '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# Print the name of each listed model that advertises 'generateContent'.
content_models = (
    candidate
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for m in content_models:
    print(m.name)

In [None]:
# Model handle used for every generate_content call below; it carries the shared generation config.
model = genai.GenerativeModel('gemini-1.5-flash-latest', generation_config=gen_config)

In [None]:
# Root folder with one sub-directory per recording; each recording is expected
# to hold its samples in an "npys" sub-folder.
real_robot_data_dir = "/Users/kennethshui/Desktop/llm_social_nav_transformed"
npys = []

# sorted() gives a reproducible file order; os.listdir returns entries in
# arbitrary order.
for bag_dir in sorted(os.listdir(real_robot_data_dir)):
    bag_dir_path = os.path.join(real_robot_data_dir, bag_dir)
    npys_dir = os.path.join(bag_dir_path, "npys")
    # Skip stray files and recordings that have no "npys" folder instead of
    # crashing on them.
    if not os.path.isdir(npys_dir):
        continue
    for npy_file in sorted(os.listdir(npys_dir)):
        # Only .npy files are samples; ignore anything else in the folder
        # (for example hidden OS metadata files).
        if not npy_file.endswith(".npy"):
            continue
        npy_file_path = os.path.join(npys_dir, npy_file)
        npys.append(npy_file_path)

# A bare expression in the middle of a cell displays nothing, so print the preview.
print(npys[:10])
len(npys)

In [None]:
# Build one DataFrame row per .npy sample: down-sampled trajectories plus the three ratings.
timesteps = 40  # number of trailing timesteps kept from each sample before down-sampling

df_rows = []
for npy_file in npys:
    sample_line = {}
    # Each file stores a pickled mapping with 'X', 'y' and 'robot' entries.
    # NOTE(review): allow_pickle=True can execute code embedded in the file; only load trusted data.
    example = np.load(npy_file, allow_pickle=True).item()
    x = example['X']
    y = example['y']
    sample_line['id'] = npy_file
    # Third-from-last path component, i.e. the recording folder for ".../<recording>/npys/<file>".
    # NOTE(review): splitting on '/' assumes POSIX-style paths.
    sample_line['participant'] = npy_file.split('/')[-3]
    # Round to 4 decimals, keep the last `timesteps` rows, then every 5th row
    # (8 rows when at least 40 are available). Only the first two goal columns are kept.
    sample_line['goal'] = np.around(np.array(x['goal']), decimals=4)[-timesteps:, :2][::5]
    sample_line['robot'] = np.around(np.array(example['robot']), decimals=4)[-timesteps:][::5]

    # Skip the sample when every goal row equals the first goal row AND every
    # robot row equals the first robot row (nothing changed during the window).
    if np.all(sample_line['goal'] == sample_line['goal'][0, :], axis=0).all() and np.all(sample_line['robot'] == sample_line['robot'][0, :], axis=0).all():
        continue

    nearby_array = np.around(np.array(x['nearby']), decimals=4)[-timesteps:][::5]
    # Swap the first two axes; presumably (time, person, feature) -> (person, time, feature) - TODO confirm.
    nearby_array = np.transpose(nearby_array, (1, 0, 2))

    # Flatten each entry to one row, drop duplicate rows, and restore the 3-D shape.
    # np.unique also sorts the rows, so the original entry order is not preserved.
    nearby_array_2d = nearby_array.reshape(nearby_array.shape[0], -1)
    nearby_unique_rows, indices = np.unique(nearby_array_2d, axis=0, return_index=True)
    nearby_array = nearby_unique_rows.reshape(-1, nearby_array.shape[1], nearby_array.shape[2])

    # TODO (from the original author): change this so that it only checks whether the
    # first two indices == 0, rather than all indices.
    non_zero_nearby_array = []
    # Iterate over the first dimension
    for i in range(nearby_array.shape[0]):
        # Keep an entry only if NONE of its values is exactly 0.
        # NOTE(review): this is stricter than dropping all-zero entries - a single 0
        # anywhere in the window discards the whole entry. Confirm this is intended.
        if not np.any(nearby_array[i] == 0):
            non_zero_nearby_array.append(nearby_array[i])

    non_zero_nearby_array = np.array(non_zero_nearby_array)
    
    sample_line['follower'] = np.around(np.array(x['follower']), decimals=4)[-timesteps:][::5]
    sample_line['nearby'] = non_zero_nearby_array
    # Stored labels are shifted up by one; presumably 0-based in the file and 1-5 here - TODO confirm.
    sample_line['competence'] = y[0] + 1
    sample_line['surprise'] = y[1] + 1
    sample_line['intention'] = y[2] + 1
    df_rows.append(sample_line)

# NOTE(review): the shuffle is unseeded, so row order differs on every run.
random.shuffle(df_rows)
full_df = pd.DataFrame(df_rows)
# Not the last expression of the cell, so this line displays nothing.
full_df.head()
print(full_df.shape)
print(len(full_df.participant.unique()))


In [81]:
import random

# def get_examples(df, 
#                  positive_competence_count = 1, negative_competence_count = 1,
#                  positive_surprise_count = 1, negative_surprise_count = 1,
#                  positive_intention_count = 1, negative_intention_count = 1):
#     # Filter samples based on the conditions
#     competence_positive_idx = df[df['competence'].isin([3, 4, 5])].sample(n=positive_competence_count).index
#     competence_negative_idx = df[df['competence'].isin([1, 2])].sample(n=negative_competence_count).index

#     surprise_positive_idx = df[df['surprise'].isin([4, 5])].sample(n=positive_surprise_count).index
#     surprise_negative_idx = df[df['surprise'].isin([1, 2, 3])].sample(n=negative_surprise_count).index

#     intention_positive_idx = df[df['intention'].isin([3, 4, 5])].sample(n=positive_intention_count).index
#     intention_negative_idx = df[df['intention'].isin([1, 2])].sample(n=negative_intention_count).index

#     # Combine the filtered samples into one DataFrame
#     selected_indices = pd.Index(list(set(competence_positive_idx) | set(competence_negative_idx) |
#                                      set(surprise_positive_idx) | set(surprise_negative_idx) |
#                                      set(intention_positive_idx) | set(intention_negative_idx)))

#     # Create a DataFrame excluding the selected samples
#     selected_samples = df.loc[selected_indices.unique()]
#     remaining_samples = df.drop(selected_indices.unique())

#     return selected_samples, remaining_samples

def get_examples(df, count, random_state=None):
    """Randomly split ``df`` into ``count`` selected rows and the remainder.

    Args:
        df: DataFrame to sample rows from.
        count: Number of rows to select. When it exceeds ``len(df)``, every
            row is selected instead of raising.
        random_state: Optional seed or generator forwarded to
            ``DataFrame.sample`` so that a split can be reproduced. The
            default ``None`` draws a fresh random sample on each call.

    Returns:
        A ``(selected_samples, remaining_samples)`` tuple. The second frame
        holds the rows of ``df`` whose index labels were not selected.
    """
    # Sampling without replacement cannot return more rows than exist.
    n_selected = min(count, len(df))
    selected_samples = df.sample(n=n_selected, random_state=random_state)
    remaining_samples = df.drop(selected_samples.index)

    return selected_samples, remaining_samples

# Draw 80 random rows of full_df; the rows that were not drawn are kept in remaining_samples.
selected_samples, remaining_samples = get_examples(full_df, 80)
selected_samples.head()

### The below code is changing [cos(theta), sin(theta)] to [theta]. 
# Function to process each row
# new_follower_column = selected_samples[['follower']]

# # Create a new DataFrame with the new column
# new_selected_samples = selected_samples.drop(columns=['follower'])  # Drop the original column
# new_selected_samples[['follower']] = selected_samples.apply(process_row, axis=1)  # Add new columns
# new_selected_samples = selected_samples.drop(columns=['nearby'])  # Drop the original column
# new_selected_samples[['nearby']] = selected_samples.apply(process_row2, axis=1)  # Add new columns

# # Display the new DataFrame
# print(new_selected_samples.head())

# import math
# def get_angle(cs, sn):
#    angle2 = math.atan2(sn, cs)
#    if angle2 < 0: angle2 += 2*math.pi
#    return angle2

Unnamed: 0,id,participant,goal,robot,follower,nearby,competence,surprise,intention
19,/Users/kennethshui/Desktop/llm_social_nav_transformed/1713302818.578851/npys/1713302912186915888_0.npy,1713302818.578851,"[[-3.5368, -2.3], [-3.3565, -2.1261], [-3.1778, -1.9522], [-2.9953, -1.7827], [-2.7986, -1.6295], [-2.6118, -1.466], [-2.4193, -1.3091], [-2.2193, -1.1584]]","[[-0.6708, -0.7417], [-0.7246, -0.6891], [-0.6914, -0.7225], [-0.7743, -0.6328], [-0.7989, -0.6014], [-0.7435, -0.6688], [-0.8018, -0.5976], [-0.794, -0.6079]]","[[2.0023, 0.762, -0.911, -0.4124], [2.024, 1.0013, 0.5274, -0.8496], [2.2159, 1.1049, 0.7505, -0.6609], [2.4708, 1.1984, -0.162, -0.9868], [2.6602, 1.5478, 0.9999, 0.0102], [2.7454, 1.6147, -0.424, 0.9056], [3.0273, 1.8369, 0.7902, 0.6128], [3.1133, 1.8274, -0.8815, -0.4721]]","[[[2.0023, 0.762, -0.911, -0.4124], [2.024, 1.0013, 0.5274, -0.8496], [2.2159, 1.1049, 0.7505, -0.6609], [2.4708, 1.1984, -0.162, -0.9868], [2.6602, 1.5478, 0.9999, 0.0102], [2.7454, 1.6147, -0.424, 0.9056], [3.0273, 1.8369, 0.7902, 0.6128], [3.1133, 1.8274, -0.8815, -0.4721]]]",5,2,4
71,/Users/kennethshui/Desktop/llm_social_nav_transformed/1713301481.745976/npys/1713301532725183370_0.npy,1713301481.745976,"[[-10.2479, -2.9735], [-10.4518, -2.8337], [-10.5116, -2.7927], [-10.5113, -2.7937], [-10.5116, -2.7927], [-10.5097, -2.7902], [-10.5097, -2.7902], [-10.5097, -2.7902]]","[[0.8206, -0.5715], [0.8304, -0.5571], [0.8758, -0.4826], [0.7564, -0.6542], [0.5467, -0.8373], [0.3213, -0.947], [0.0129, -0.9999], [-0.2967, -0.955]]","[[-1.4732, 0.8969, 0.5982, -0.8014], [-1.4009, 1.0793, 0.0563, -0.9984], [-1.5881, 1.1268, 0.6234, -0.7819], [-1.1889, 0.9661, 0.9951, -0.0985], [-0.9744, 1.1173, 0.9532, 0.3022], [-0.5935, 1.1042, 0.9826, -0.1857], [-0.1539, 1.4456, 0.9307, -0.3659], [0.1458, 1.3292, 0.1863, -0.9825]]","[[[-1.4732, 0.8969, 0.5982, -0.8014], [-1.4009, 1.0793, 0.0563, -0.9984], [-1.5881, 1.1268, 0.6234, -0.7819], [-1.1889, 0.9661, 0.9951, -0.0985], [-0.9744, 1.1173, 0.9532, 0.3022], [-0.5935, 1.1042, 0.9826, -0.1857], [-0.1539, 1.4456, 0.9307, -0.3659], [0.1458, 1.3292, 0.1863, -0.9825]]]",4,3,4
97,/Users/kennethshui/Desktop/llm_social_nav_transformed/1715357694.9017513/npys/1715357841650498721_0.npy,1715357694.9017513,"[[-2.3393, 0.1124], [-2.331, 0.1122], [-2.331, 0.1122], [-2.3342, 0.1178], [-2.3335, 0.1165], [-2.3342, 0.1178], [-2.3319, 0.1177], [-2.3309, 0.1174]]","[[-0.9637, 0.267], [-0.9663, -0.2576], [-0.7322, -0.6811], [-0.3096, -0.9509], [0.0865, -0.9962], [0.5587, -0.8294], [0.8902, -0.4556], [0.9987, 0.0506]]","[[0.7587, -0.368, -0.7925, 0.6099], [0.9236, -0.0109, -0.9875, -0.1575], [0.9428, 0.4219, -0.6801, -0.7331], [0.6446, 0.8452, -0.9958, -0.0917], [0.1876, 1.088, -0.1372, -0.9905], [-0.0102, 0.9097, 0.1595, 0.9872], [-0.5588, 0.5457, 0.7477, -0.6641], [-0.9869, 0.1941, 0.8269, -0.5624]]","[[[0.7587, -0.368, -0.7925, 0.6099], [0.9236, -0.0109, -0.9875, -0.1575], [0.9428, 0.4219, -0.6801, -0.7331], [0.6446, 0.8452, -0.9958, -0.0917], [0.1876, 1.088, -0.1372, -0.9905], [-0.0102, 0.9097, 0.1595, 0.9872], [-0.5588, 0.5457, 0.7477, -0.6641], [-0.9869, 0.1941, 0.8269, -0.5624]]]",3,3,1
86,/Users/kennethshui/Desktop/llm_social_nav_transformed/1713541727.5521238/npys/1713541735458687227_0.npy,1713541727.5521238,"[[-9.9585, 0.6739], [-9.9417, 0.6763], [-9.9409, 0.6758], [-9.9635, 0.6771], [-9.9639, 0.6785], [-9.9616, 0.6799], [-9.9577, 0.6824], [-9.9577, 0.6824]]","[[0.6886, 0.7251], [0.2527, 0.9676], [-0.2481, 0.9687], [-0.6775, 0.7355], [-0.9491, 0.3149], [-0.9832, -0.1823], [-0.7744, -0.6327], [-0.3706, -0.9288]]","[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]",[],1,5,1
15,/Users/kennethshui/Desktop/llm_social_nav_transformed/1713543095.55775/npys/1713543279630978693_0.npy,1713543095.55775,"[[-2.3106, 0.8883], [-2.5799, 0.8793], [-2.8246, 0.837], [-3.0742, 0.8114], [-3.3225, 0.8296], [-3.5262, 0.9012], [-3.5608, 0.9158], [-3.5608, 0.9158]]","[[1.0, -0.0081], [0.9936, 0.1127], [0.9764, 0.216], [0.9993, 0.0371], [0.9786, -0.2058], [0.9131, -0.4078], [0.9779, -0.2091], [0.9976, 0.0686]]","[[-0.9798, 0.0411, 0.7652, 0.6438], [-0.8929, -0.0977, 0.8643, 0.503], [-1.0318, -0.0481, 0.5982, 0.8014], [-1.0498, -0.077, 0.2668, 0.9638], [-0.9936, 0.0107, 0.962, -0.273], [-0.8993, 0.2793, 0.9937, 0.1122], [-0.838, 0.0299, 0.9778, -0.2097], [-0.9076, 0.0357, -0.9136, 0.4067]]","[[[-0.9798, 0.0411, 0.7652, 0.6438], [-0.8929, -0.0977, 0.8643, 0.503], [-1.0318, -0.0481, 0.5982, 0.8014], [-1.0498, -0.077, 0.2668, 0.9638], [-0.9936, 0.0107, 0.962, -0.273], [-0.8993, 0.2793, 0.9937, 0.1122], [-0.838, 0.0299, 0.9778, -0.2097], [-0.9076, 0.0357, -0.9136, 0.4067]], [[2.8452, -0.4554, 0.9572, 0.2893], [2.0136, 0.1111, -0.8923, -0.4514], [2.462, 0.1037, -0.9984, -0.056], [2.7746, -0.2511, -0.5617, 0.8273], [2.6918, -0.6705, -0.9576, 0.2883], [2.7306, -0.2285, 0.6856, 0.728], [2.5979, -0.3113, -0.4316, -0.9021], [2.6229, -0.2093, -0.6739, -0.7389]]]",4,5,2


In [82]:
def parse_query(row):
    """Describe one sample's sensor data as prompt text.

    Args:
        row: An ``(index, record)`` pair as produced by ``DataFrame.iterrows``;
            ``record`` must expose ``goal``, ``nearby``, ``follower`` and
            ``robot`` attributes.

    Returns:
        A newline-terminated string with one line for the destination, one
        per nearby pedestrian, one for the follower (or a "Not detected"
        line when all follower values are 0) and one for the robot
        orientation.
    """
    record = row[1]

    def render(values):
        # Round to 4 decimals and keep the whole array on a single line.
        rounded = np.around(np.array(values), decimals=4)
        return np.array2string(rounded, separator=', ', max_line_width=np.inf).replace('\n', '')

    lines = ["Destination (8 timesteps, 2 features): " + render(record.goal)]
    for position, person in enumerate(record.nearby, start=1):
        lines.append(f"Nearby pedestrian {position} (8 timesteps, 4 features): " + render(person))

    follower_missing = np.all(record.follower == 0)
    if follower_missing:
        lines.append("Human Follower: Not detected")
    else:
        lines.append("Human Follower (8 timesteps, 4 features): " + render(record.follower))

    lines.append("Robot Orientation (8 timesteps, 2 features): " + render(record.robot))
    return "".join(line + "\n" for line in lines)

def parse_examples(rows):
    """Format labelled samples as few-shot examples for the prompt.

    Args:
        rows: DataFrame whose rows provide the fields read by ``parse_query``
            plus ``competence``, ``surprise`` and ``intention`` ratings.

    Returns:
        A single string holding, for each row, the ``parse_query`` text
        followed by a line with its ratings; every example is followed by a
        blank line. Returns ``""`` when ``rows`` has no rows.
    """
    def plain(value):
        # Inside a dict, NumPy scalars are printed with their repr, which is
        # e.g. ``np.int64(5)`` on NumPy >= 2. Convert them to built-in
        # numbers so the prompt always shows a bare ``5``.
        return value.item() if isinstance(value, np.generic) else value

    blocks = []
    for row in rows.iterrows():
        ratings = {
            name: plain(getattr(row[1], name))
            for name in ("competence", "surprise", "intention")
        }
        blocks.append(
            parse_query(row)
            + "Ground-Truth Ratings of Robot Performance in the time window: "
            + str(ratings)
            + "\n\n"
        )

    return "".join(blocks)

# Render the sampled rows as example text (not referenced again in the cells visible here).
text_examples = parse_examples(selected_samples)

In [83]:
def build_prompt(general_examples, query):
    """Assemble the rating-prediction prompt sent to the LLM.

    The prompt consists of: the task and feature description, the few-shot
    examples (only when ``general_examples`` is non-empty), a description of
    the robot's behavior types, the query to rate, and the answer format.

    Args:
        general_examples: DataFrame of labelled samples rendered with
            ``parse_examples``. May be empty or ``None`` for a zero-shot
            prompt.
        query: Text describing the sample to rate, as returned by
            ``parse_query``.

    Returns:
        The complete prompt string.
    """
    # Each static template is dedented on its own, BEFORE any dynamic text is
    # attached: example and query lines start at column 0, so dedenting after
    # interpolation would find no common indentation and leave the template
    # lines indented.
    parts = [textwrap.dedent("""\
        A human followed a mobile robot while they navigated to a particular destination in an indoor environment.
        During navigation, the human rated the robot's performance by answering three questions:
        The first question is: 'How competent was the robot at navigating?'
        The second question is: 'How surprising was the robot's navigation behavior?' and
        The third question is: 'How clear were the robot's intentions during navigation?'
        The human provided ratings for each of the questions as an integer between 1 and 5, where 1 was the lowest score and 5 was the highest.
        For example, for the first question about competence, the ratings could be: 1 for 'incompetent', 2 for 'somewhat incompetent', 3 for 'neither competent nor incompetent', 4 for 'somewhat competent', and 5 for 'competent'.
        For the second question about surprising navigation behavior, the ratings could be: 1 for 'unsurprising', 2 for 'somewhat unsurprising', 3 for 'neither unsurprising nor surprising', 4 for 'somewhat surprising', and 5 for 'surprising'.
        Finally, for the third question about intention, the ratings could be: 1 for 'unclear intention', 2 for 'somewhat unclear intention', 3 for 'neither unclear nor clear intention', 4 for 'somewhat clear intention', and 5 for 'clear intention'.

        The robot has two Kinect cameras that can detect the poses of the people nearby in the robot coordinate system.
        It can also detect the position of the destination relative to the robot. Below are the specifications of each type of feature detected at every timestep:
        destination_position (x, y): the 2D position of the destination, relative to the robot's coordinate system.
        follower_pose (x, y, cos(theta), sin(theta)): the position (x, y) and cos-sin encoding of orientation (cos(theta), sin(theta)) of the participant who followed the robot, relative to the robot's coordinate system. If the follower is not detected at a timestep, all four values will be 0.
        nearby_people_pose, a list of (x, y, cos(theta), sin(theta)): a list of positions (x, y) and cos-sin encodings of orientations (cos(theta), sin(theta)) of the nearby people around the robot, relative to the robot's coordinate system.
        robot orientation (cos(theta), sin(theta)): the cos-sin encoding of the robot's orientation (cos(theta), sin(theta)) in the world coordinate system. All other features are in the robot coordinate system so we don't need to provide the robot's position.

        Each data example is a time window of 8 seconds, recorded at 1 hz, and resulting in 8 time steps. \n""")]

    # Few-shot examples are rendered only when there is at least one.
    if general_examples is not None and len(general_examples) > 0:
        parts.append(
            "Previously, we have gathered several data examples from different humans participating in the navigation task, which are given below and shuffled. For each example, you can see the ground-truth ratings that the human gave to the robot in terms of the three questions about robot competence, surprising navigation behavior, and intention:\n\n"
        )
        parts.append(parse_examples(general_examples))

    # The trailing newline keeps the next sentence from being glued to "rating.".
    parts.append(textwrap.dedent("""\
        In the navigation task, the robot had three types of behaviors:
        The first type of behavior is moving towards the destination: usually the absolute values of the x or y of the destination (relative to the robot) decrease over time (get closer to 0),
        The second type of behavior is rotating at a fixed position: usually causes the x and y of the destination (relative to the robot) to be almost unchanged, and the cos-sin encoding of the robot's orientation (cos(theta), sin(theta)) to change drastically over time, and
        The third type of behavior is moving in a wrong direction away from the destination: usually causes the absolute values of the x or y of the destination (relative to the robot) to increase over time (get farther from 0).
        In many cases, moving towards the destination corresponds to a high competence rating, low surprise rating, and high intention rating. Rotating at a fixed position and moving in a wrong direction usually corresponds to a lower competence rating, higher surprise rating, and lower intention rating.
        """))

    # One sentence that introduces the query; the query text follows on its own lines.
    parts.append(
        "Now, by learning from the examples given by other previous participants, "
        "can you check the following example given next by a new human and guess how this human would rate the robot's overall performance in the 8 seconds?\n"
    )
    parts.append(f"{query}\n")

    parts.append(textwrap.dedent("""\
        Please return a JSON containing your predicted ratings of the robot's competence, surprise, and intention in navigation using the following schema:
        'competence': int, 'surprise': int, 'intention': int
        Also, please provide explanations for your predictions.
        Note that your prediction is about the robot's overall performance in the given time window. Remember that your responses must be integers between 1 and 5. \n"""))

    parts.append("Your predicted ratings of robot performance for the time window:")

    return "".join(parts)
    
# import f1 score
from sklearn.metrics import f1_score, accuracy_score
from matplotlib import pyplot as plt

def compute_metrics(labeled_dataframe, group_col="num_examples"):
    """Print and collect classification metrics per group and rating dimension.

    For every distinct value of ``group_col`` and each of the three rating
    dimensions, the macro F1 on the raw ratings, the binary F1, the binary
    accuracy and the mean absolute error are printed. Ratings are binarised
    with the positive class being a low rating (1 or 2) for competence and
    intention, and a high rating (4 or 5) for surprise.

    Args:
        labeled_dataframe: DataFrame with ground-truth columns ``competence``,
            ``surprise``, ``intention`` and predictions in the matching
            ``pt_<dimension>`` columns.
        group_col: Name of the column used to split the rows into groups.
            Defaults to ``"num_examples"``.

    Returns:
        ``(f1_dict, bin_f1_dict, bin_acc_dict)``; each maps
        ``dimension -> {group value -> score}``. The MAE is only printed.

    Raises:
        KeyError: If ``group_col`` is not a column of ``labeled_dataframe``.
    """
    if group_col not in labeled_dataframe.columns:
        raise KeyError(f"compute_metrics needs a '{group_col}' column to group by")

    dimensions = ['competence', 'surprise', 'intention']
    bin_acc_dict = {dimension: {} for dimension in dimensions}
    bin_f1_dict = {dimension: {} for dimension in dimensions}
    f1_dict = {dimension: {} for dimension in dimensions}

    for num_examples in labeled_dataframe[group_col].unique():
        print(f"\nNumber of examples: {num_examples}")
        labeled_dataframe_subset = labeled_dataframe[labeled_dataframe[group_col] == num_examples]

        for dimension in dimensions:
            gt = labeled_dataframe_subset[dimension].tolist()
            pt = labeled_dataframe_subset[f"pt_{dimension}"].tolist()

            # Shift the ratings down by one; the thresholds below are
            # expressed on this shifted scale.
            gt = np.array(gt) - 1
            pt = np.array(pt) - 1

            if dimension != 'surprise':
                # Positive class: shifted rating <= 1 (original rating <= 2).
                gt_binary = [1 if x <= 1 else 0 for x in gt]
                pt_binary = [1 if x <= 1 else 0 for x in pt]
            else:
                # Positive class: shifted rating >= 3 (original rating >= 4).
                gt_binary = [1 if x >= 3 else 0 for x in gt]
                pt_binary = [1 if x >= 3 else 0 for x in pt]

            print("gt_binary", gt_binary)
            print("pt_binary", pt_binary)

            f1 = f1_score(gt, pt, average="macro", zero_division=0)
            # zero_division=0 matches the macro score above: an undefined F1
            # is reported as 0 without emitting a warning.
            bin_f1 = f1_score(gt_binary, pt_binary, zero_division=0)
            bin_acc = accuracy_score(gt_binary, pt_binary)
            mae = np.abs(gt - pt).mean()
            print(f"F1 score for {dimension}, {num_examples} examples: {f1:.4f}")
            print(f"Binary F1 score for {dimension}, {num_examples} examples: {bin_f1:.4f}")
            print(f"Binary Accuracy for {dimension}, {num_examples} examples: {bin_acc:.4f}")
            print(f"MAE for {dimension}, {num_examples} examples: {mae:.4f}")

            f1_dict[dimension][num_examples] = f1
            bin_f1_dict[dimension][num_examples] = bin_f1
            bin_acc_dict[dimension][num_examples] = bin_acc

    return f1_dict, bin_f1_dict, bin_acc_dict

In [84]:
# Number of rows sampled from the other participants' data for each prompt.
bootstrap_examples = 100

In [85]:
from time import sleep

labeled_rows = []
classifiers = []
used_examples = []
curr_count = 0

# For each participant, every one of their samples is sent to the model as a
# query; the few-shot examples are drawn only from the other participants.
for participant in full_df.participant.unique():
    print(f"Participant: {participant}")

    participant_df = full_df[full_df.participant == participant]
    non_participant_df = full_df[full_df.participant != participant]
    # sort by id
    participant_df = participant_df.sort_values(by='id')

    for i in range(len(participant_df)):
        # Count every attempt, not only successful ones, so the progress line
        # keeps advancing after a failed request.
        curr_count += 1
        print(f"Example {curr_count} / {len(full_df)}")

        # parse_query expects an (index, row) pair like the ones iterrows yields.
        next_example = participant_df.iloc[i]
        next_example = (next_example.name, next_example)

        general_selected_examples, general_remaining_examples = get_examples(non_participant_df, bootstrap_examples)

        query = parse_query(next_example)
        prompt = build_prompt(general_selected_examples, query)

        print(f"Query:\n{query}") ##
        print(f"Ground Truth:\n{next_example[1].competence, next_example[1].surprise, next_example[1].intention}\n")

        try:
            result = model.generate_content(prompt)
            print(result.text + '\n') ##
            # Parse the text between the first "{" and the next "}" as JSON,
            # after turning single quotes into double quotes.
            result_json = json.loads("{" + result.text.replace("'", '"').split("{")[1].rsplit("}")[0] + "}")
            print(f"Result:\n{result_json}\n")

            new_row = next_example[1].to_dict().copy()
            new_row["pt_competence"] = result_json["competence"]
            new_row["pt_surprise"] = result_json["surprise"]
            new_row["pt_intention"] = result_json["intention"]

            new_row["personalize_examples"] = i
            new_row['bootstrap_examples'] = bootstrap_examples

            # Binary labels: 1 marks a low rating (<= 2) for competence and
            # intention, and a high rating (>= 4) for surprise.
            new_row["bin_gt_competence"] = 1 if int(new_row["competence"]) <= 2 else 0
            new_row["bin_gt_surprise"] = 1 if int(new_row["surprise"]) >= 4 else 0
            new_row["bin_gt_intention"] = 1 if int(new_row["intention"]) <= 2 else 0

            new_row["bin_pt_competence"] = 1 if int(new_row["pt_competence"]) <= 2 else 0
            new_row["bin_pt_surprise"] = 1 if int(new_row["pt_surprise"]) >= 4 else 0
            new_row["bin_pt_intention"] = 1 if int(new_row["pt_intention"]) <= 2 else 0

            new_row["bin_correct_competence"] = int(new_row["bin_gt_competence"] == new_row["bin_pt_competence"])
            new_row["bin_correct_surprise"] = int(new_row["bin_gt_surprise"] == new_row["bin_pt_surprise"])
            new_row["bin_correct_intention"] = int(new_row["bin_gt_intention"] == new_row["bin_pt_intention"])

            labeled_rows.append(new_row)
        except Exception as e:
            # Best effort: a failed request or an unparsable answer drops this
            # sample only. Name the sample so the gap can be traced later.
            print(f"Skipping {next_example[1].id}: {e}")

        # Pause between requests, whether or not the last one succeeded.
        sleep(5)

# Build the result frame once, after all requests, instead of on every iteration.
labeled_dataframe = pd.DataFrame(labeled_rows)
labeled_dataframe['model'] = 'LLM'
labeled_dataframe = labeled_dataframe.rename(columns={"competence": "gt_competence", "surprise": "gt_surprise", "intention": "gt_intention"})

Participant: 1715356072.5795307
Example 1 / 109
Query:
Destination (8 timesteps, 2 features): [[-9.5907,  0.4219], [-9.5869,  0.4193], [-9.5869,  0.4193], [-9.5864,  0.4173], [-9.5864,  0.4175], [-9.5867,  0.419 ], [-9.5867,  0.419 ], [-9.5871,  0.4199]]
Nearby pedestrian 1 (8 timesteps, 4 features): [[-1.1599, -0.5517, -0.1146, -0.9934], [-0.7064, -0.8786,  0.7669, -0.6418], [-0.1131, -1.0852,  0.8031, -0.5959], [ 0.379 , -1.1272,  0.6098,  0.7926], [ 0.9281, -0.8327,  0.5244,  0.8515], [ 1.1823, -0.1889, -0.9554,  0.2955], [ 0.9063,  0.1831, -0.9591,  0.2831], [ 0.7216,  0.7366, -0.7504,  0.661 ]]
Human Follower (8 timesteps, 4 features): [[-1.1599, -0.5517, -0.1146, -0.9934], [-0.7064, -0.8786,  0.7669, -0.6418], [-0.1131, -1.0852,  0.8031, -0.5959], [ 0.379 , -1.1272,  0.6098,  0.7926], [ 0.9281, -0.8327,  0.5244,  0.8515], [ 1.1823, -0.1889, -0.9554,  0.2955], [ 0.9063,  0.1831, -0.9591,  0.2831], [ 0.7216,  0.7366, -0.7504,  0.661 ]]
Robot Orientation (8 timesteps, 2 features): [

In [86]:
# Rebuild the frame straight from labeled_rows (original column names, no 'model' column)
# and write it to CSV; the next cell reads this file back.
labeled_dataframe = pd.DataFrame(labeled_rows)
labeled_dataframe.to_csv("/Users/kennethshui/Desktop/llm_RQ1_kenneth.csv", index=False)

In [87]:
labeled_dataframe = pd.read_csv("/Users/kennethshui/Desktop/llm_RQ1_kenneth.csv")

# For each rating dimension: pooled binary accuracy over all rows, then the
# mean and standard deviation of the per-participant binary F1 and accuracy.
for dimension in ['competence', 'surprise', 'intention']:
    truth_col = f'bin_gt_{dimension}'
    guess_col = f'bin_pt_{dimension}'
    print(dimension, "overall accuracy:", labeled_dataframe[f'bin_correct_{dimension}'].mean())

    bin_f1s, bin_accs = [], []
    for participant in labeled_dataframe.participant.unique():
        participant_df = labeled_dataframe[labeled_dataframe.participant == participant]
        truth = participant_df[truth_col]
        guess = participant_df[guess_col]
        # zero_division=1: an undefined F1 is scored as 1.
        bin_f1s.append(f1_score(truth, guess, zero_division=1))
        bin_accs.append(accuracy_score(truth, guess))

    print(dimension, "average f1:", np.mean(bin_f1s), np.std(bin_f1s))
    print(dimension, "average acc:", np.mean(bin_accs), np.std(bin_accs))

competence overall accuracy: 0.5436893203883495
competence average f1: 0.47049689440993786 0.42940265556301815
competence average acc: 0.634575569358178 0.29994812851799557
surprise overall accuracy: 0.4174757281553398
surprise average f1: 0.29378881987577643 0.35777703468994554
surprise average acc: 0.4472049689440994 0.2849114429144867
intention overall accuracy: 0.5728155339805825
intention average f1: 0.4683323922454357 0.42115177039989843
intention average acc: 0.6244824016563146 0.30296906219613107
