# Extract data from LaTTe's 100k dataset


In [13]:
import os

# Root directory of the LaTTe dataset files. The location differs between
# workspaces (e.g. "/home/janne/ros_ws/language/src/LaTTe/data"), so it can be
# overridden with the LATTE_DATA_DIR environment variable instead of editing
# this cell. Trailing slashes are stripped because the other cells build file
# paths by appending to this string.
base_path = os.environ.get("LATTE_DATA_DIR", "/home/janne/ros_ws/src/LaTTe/data").rstrip("/")


In [1]:
import json
# Location of the 100k LaTTe dataset file
file_path = f"{base_path}/datalatte_100k_lf.json"

# Parse the complete JSON file into memory; the result is bound to `data`
with open(file_path, "r") as dataset_file:
    data = json.load(dataset_file)

In [2]:
# Quick look at the data: number of samples, then the sample stored under
# key "0" and the names of its fields
print(len(data))
first_sample = data["0"]
print(first_sample)
print(first_sample.keys())

100000
{'input_traj': [[0.29524648098151074, -0.005368213565017599, -0.10639531430622262, 0.30000000000000004], [0.27381279643114875, 0.08191928084317257, -0.04638646207348601, 0.19934730526173594], [0.24611539136683902, 0.15281377277132036, 0.00314310608170576, 0.11027115407076604], [0.2130720465233349, 0.20841837092767102, 0.042970152019565, 0.032196782921920075], [0.17560054263539038, 0.24983618402046925, 0.07387143760030368, -0.035450571689972565], [0.13461866043775916, 0.27817032075796, 0.09662372468413394, -0.09324567327008226], [0.09104418066519482, 0.2945238898483884, 0.11200377513126847, -0.1417632853235794], [0.045794884052451135, 0.30000000000000004, 0.12078835080191896, -0.18157817135563453], [-0.00021144866571820575, 0.2957017599210392, 0.1237542135562979, -0.21326509487141826], [-0.046057036754559355, 0.28273227831975145, 0.1216781252546173, -0.23739881937610063], [-0.09082409947931858, 0.2621946639043815, 0.11533684775708952, -0.2545541083748525], [-0.13359485610524235, 

## Extract all the texts/corrections from the dataset

In [6]:
# ----------------------------------------------------------
# Collect the "text" field of every sample, in dataset order
# ----------------------------------------------------------
corpus = [sample["text"] for sample in data.values()]

# Save the collected texts as a single JSON list
with open(f"{base_path}/data_latte_100k_text.json", "w") as out_file:
    json.dump(corpus, out_file)

## Checking objects

In [17]:
###############################################
# Extracting object names from the data set   #
###############################################
# Flatten the per-sample "obj_names" lists into one list (duplicates included)
all_objects = [name for sample in data.values() for name in sample["obj_names"]]
print("Length of object list: ", len(all_objects))

# Remove duplicates. dict.fromkeys keeps the first-seen order, so the saved
# file is the same on every run; list(set(...)) would reorder the names
# from run to run because string hashing is randomised per process.
all_objects = list(dict.fromkeys(all_objects))
print("Length of object list: ", len(all_objects))

# Store the unique object names as a JSON list
with open(base_path + "/data_latte_100k_objects.json", "w") as f:
    json.dump(all_objects, f)

Length of object list:  400230
Length of object list:  1840


In [18]:
# -------------------------------------------------------
# Checking whether given objects exist in the object list
# -------------------------------------------------------
# NOTE(review): this reads "data_latte_100k_obj_list.json", while the cell that
# extracts the object names writes "data_latte_100k_objects.json" - confirm
# which file is intended.
with open(f"{base_path}/data_latte_100k_obj_list.json", "r") as list_file:
    obj_list = json.load(list_file)

# bottle - beer bottle, soda bottle, water bottle, wine bottle
objects_to_check = ["plate", "water bottle", "bottle", "fork", "coffee", "apple", "egg", "cake", "knife"]

# Report, for each candidate name, whether it appears verbatim in the list
for candidate in objects_to_check:
    status = "in list" if candidate in obj_list else "NOT in list"
    print(candidate, status)

plate in list
water bottle in list
bottle NOT in list
fork NOT in list
coffee NOT in list
apple NOT in list
egg NOT in list
cake NOT in list
knife NOT in list


## Sampling a smaller dataset for testing

In [2]:
# Number of samples to copy into the small test dataset
small_dataset_size = 20

# Take the first `small_dataset_size` samples (dataset order), keeping their keys.
# Requires `data` to be loaded by an earlier cell.
small_dataset = {key: data[key] for key in list(data)[:small_dataset_size]}

# Show which change types ended up in the sample
for sample in small_dataset.values():
    print(sample["change_type"])

data_set_name = f"data_latte_{len(small_dataset)}.json"

# Fix: the "/" between directory and file name was missing, so the file was
# written next to the data directory (".../datadata_latte_20.json") instead
# of inside it.
with open(base_path + "/" + data_set_name, "w") as f:
    json.dump(small_dataset, f, indent=4)


NameError: name 'data' is not defined

# Generate validation data
- Include `intensity` and `direction` labels in the dataset
  

In [15]:
import json

# Dataset file to label; for a quick test switch to "data_latte_20.json"
data_set_name = "datalatte_100k_lf.json"

In [17]:
# Load the dataset
with open(base_path + "/" + data_set_name, "r") as f:
    data = json.load(f)

list(data["0"].keys())

['input_traj',
 'output_traj',
 'text',
 'obj_names',
 'obj_poses',
 'obj_classes',
 'obj_in_text',
 'change_type',
 'map_id',
 'image_paths',
 'locality_factor',
 'token_text',
 'similarity']

In [18]:
# Key phrases from latte's dataset generator
DIST_INC = ["further away from", "keep a bigger distance from the"]
DIST_DEC = ["closer to the", "keep a smaller distance from the"]

INTENSITY_LOW = ["a bit", "a little"]
INTENSITY_HIGH = ["a lot", "much", "very"]

def check_distance_direction(sentence):
    if any(phrase in sentence for phrase in DIST_INC):
        return "INCREASE"
    elif any(phrase in sentence for phrase in DIST_DEC):
        return "DECREASE"
    return "-"

def check_intensity(sentence):
    if any(phrase in sentence for phrase in INTENSITY_LOW):
        return "LOW"
    elif any(phrase in sentence for phrase in INTENSITY_HIGH):
        return "HIGH"
    return "NEUTRAL"

In [19]:
# X, Y should correspond to our evaluation labels in `calc_accuracy.py`
# The direction is changed accordingly there
Y_CART_INC = ["right"]
Y_CART_DEC = ["left"]
X_CART_INC = ["front"]
X_CART_DEC = ["back"]
Z_CART_INC = ["top", "upper part"]
Z_CART_DEC = ["down", "bottom", "bottom part"]


def check_cartesian_direction(sentence):
    """Map a cartesian correction sentence to an (axis, direction) pair.

    Plain substring search against the phrase lists above, tested in the
    fixed order Y+, Y-, X+, X-, Z+, Z-; the first list with a phrase that
    occurs in `sentence` decides the result. Returns ("-", "UNKNOWN") when
    nothing matches.
    """
    # Built per call so the current contents of the module-level lists are used
    rules = (
        (Y_CART_INC, "Y", "INCREASE"),
        (Y_CART_DEC, "Y", "DECREASE"),
        (X_CART_INC, "X", "INCREASE"),
        (X_CART_DEC, "X", "DECREASE"),
        (Z_CART_INC, "Z", "INCREASE"),
        (Z_CART_DEC, "Z", "DECREASE"),
    )
    for phrases, axis, direction in rules:
        for phrase in phrases:
            if phrase in sentence:
                return axis, direction
    return "-", "UNKNOWN"

In [22]:
# Phrases that ask for a higher / lower speed
SPEED_INC = ["faster", "increase the speed"]
SPEED_DEC = ["slower", "reduce the speed"]


def check_speed_direction(sentence):
    """Classify the speed change requested by `sentence`.

    Plain substring search: returns "INCREASE" if any SPEED_INC phrase occurs,
    otherwise "DECREASE" if any SPEED_DEC phrase occurs, otherwise "-".
    """
    for phrase in SPEED_INC:
        if phrase in sentence:
            return "INCREASE"
    for phrase in SPEED_DEC:
        if phrase in sentence:
            return "DECREASE"
    return "-"

In [23]:
def get_dynamic_features(feature):
    """Return `feature` together with its opposite-direction counterpart.

    `feature` is a label whose last "_"-separated part is the direction,
    e.g. "Egyptian cat_distance_decrease". The counterpart keeps everything
    before the last "_" and swaps "increase" <-> "decrease".

    Returns:
        A sorted two-element list [feature, counterpart] when the label ends
        in "increase" or "decrease". For any other ending (for example the
        "-" / "unknown" sentinels produced when no direction was recognised)
        there is no counterpart, so the list contains only `feature`.
        Previously this case raised UnboundLocalError.
    """
    opposite = {"increase": "decrease", "decrease": "increase"}

    # Split off the part after the last "_"; prefix is "" if there is no "_"
    prefix, _, direction = feature.rpartition("_")

    if direction not in opposite:
        return [feature]

    # return both increase and decrease
    return sorted([feature, prefix + "_" + opposite[direction]])

In [24]:
# Change key names to match our evaluation labels.
# Each rename only happens while the old key is still present, so running this
# cell a second time is a no-op instead of failing with KeyError on the
# already-removed keys.
for sample in data.values():
    if "input_traj" in sample:
        sample["initial_traj"] = sample.pop("input_traj")
    if "output_traj" in sample:
        sample["gt_output_traj"] = sample.pop("output_traj")
    if "obj_in_text" in sample:
        # The target object is stored as a single-element list
        sample["gt_target_object"] = [sample.pop("obj_in_text")]

In [25]:
# Derive the ground-truth label fields for every sample from its
# 'change_type' and 'text'. Every label is stored as a single-element list.
# NOTE: a 'change_type' other than "dist", "cartesian" or "speed" gets no
# 'gt_feature' assigned by this loop.
for key in data:
    sample = data[key]

    # Add new keys
    if sample['change_type'] == "dist":
        direction = check_distance_direction(sample['text'])
        intensity = check_intensity(sample['text'])
        sample.update({
            'gt_direction': [direction],
            'gt_intensity': [intensity],
            'gt_cart_axes': ["-"],
            'gt_change_type': ["distance"],
            'gt_feature': [sample["gt_target_object"][0] + "_distance_" + direction.lower()],
        })

    elif sample['change_type'] == "cartesian":
        # Cartesian corrections carry an axis but no intensity label
        cart_axes, direction = check_cartesian_direction(sample['text'])
        sample.update({
            'gt_cart_axes': [cart_axes],
            'gt_direction': [direction],
            'gt_intensity': ["-"],
            'gt_change_type': ["cartesian"],
            'gt_feature': [cart_axes + "_cartesian_" + direction.lower()],
        })

    elif sample['change_type'] == "speed":
        direction = check_speed_direction(sample['text'])
        intensity = check_intensity(sample['text'])
        sample.update({
            'gt_direction': [direction],
            'gt_intensity': [intensity],
            'gt_cart_axes': ["-"],
            'gt_change_type': ["speed"],
            'gt_feature': [sample["gt_target_object"][0] + "_speed_" + direction.lower()],
        })

    # Add keys relating to features
    sample['gt_dynamic_features'] = get_dynamic_features(sample['gt_feature'][0])

    # Create keys for prompt evaluations
    sample['gt_split'] = [sample['text']]

    # TODO: update image paths

    # print(f"{data[d]['text']:<60} | {data[d]['gt_feature'][0]:^40} | {data[d]['gt_change_type'][0]:^10} | {data[d]['gt_cart_axes'][0]:^3} | {data[d]['gt_direction'][0]:^8} | {data[d]['gt_intensity'][0]:^8}")


In [26]:
# Print every field of the sample stored under key "0" to check the result

for field, value in data["0"].items():
    print(field, ":", value)



text : stay closer to the Egyptian cat
obj_names : ['acoustic guitar', 'RV', 'trolley', 'minibus', 'Egyptian cat', 'European fire salamander']
obj_poses : [[-0.16723265307625768, 0.06511611437603326, -0.2140565027826487], [0.26284862657619623, 0.15670515978207278, -0.23528697921312614], [-0.09024289540346142, -0.060834785513723455, -0.2935970481219493], [0.025734782714433413, -0.2316987307641093, 0.16114266945706612], [0.07547948374778946, 0.02328983279775909, 0.0005649966608154422], [0.22303722987414787, -0.07094117787220988, 0.10933703314772436]]
obj_classes : ['402', '757', '829', '654', '285', '25']
change_type : dist
map_id : 0
image_paths : ['/home/arthur/data/image_dataset//402/acoustic guitar/Image_3.jpg', '/home/arthur/data/image_dataset//757/RV/Image_1.jpg', '/home/arthur/data/image_dataset//829/trolley/Image_1.jpg', '/home/arthur/data/image_dataset//654/minibus/Image_5.jpg', '/home/arthur/data/image_dataset//285/Egyptian cat/Image_1.jpg', '/home/arthur/data/image_dataset//25

In [29]:
# Write the relabelled data to a new file: the dataset name prefixed with "our_labels_"
with open(f"{base_path}/our_labels_{data_set_name}", "w") as out_file:
    json.dump(data, out_file, indent=2)

## Split dataset into smaller files

In [33]:
import json

# Load the labelled version of the 100k dataset
with open(f"{base_path}/our_labels_datalatte_100k_lf.json", "r") as labelled_file:
    data = json.load(labelled_file)


# Display the field names of the sample stored under key "0"
list(data["0"])

['text',
 'obj_names',
 'obj_poses',
 'obj_classes',
 'change_type',
 'map_id',
 'image_paths',
 'locality_factor',
 'token_text',
 'similarity',
 'initial_traj',
 'gt_output_traj',
 'gt_target_object',
 'gt_direction',
 'gt_intensity',
 'gt_cart_axes',
 'gt_change_type',
 'gt_feature',
 'gt_dynamic_features',
 'gt_split']

In [32]:
# Number of output files the dataset is divided into
num_files = 20

# Materialise the (key, sample) pairs so they can be sliced by position
items = list(data.items())
chunk_size = len(data) // num_files

for i in range(num_files):
    start_index = chunk_size * i
    # The last file also takes whatever remains after the integer division
    if i == num_files - 1:
        end_index = len(data)
    else:
        end_index = start_index + chunk_size

    print(start_index, end_index)

    chunk_data = {key: sample for key, sample in items[start_index:end_index]}

    # One JSON file per chunk, numbered from 0
    with open(f"{base_path}/our_labels_datalatte_100k_lf_{i}.json", "w") as out_file:
        json.dump(chunk_data, out_file, indent=2)


0 5000
5000 10000
10000 15000
15000 20000
20000 25000
25000 30000
30000 35000
35000 40000
40000 45000
45000 50000
50000 55000
55000 60000
60000 65000
65000 70000
70000 75000
75000 80000
80000 85000
85000 90000
90000 95000
95000 100000


In [47]:
# check the data validity
import numpy as np

# Scan every sample instead of a hard-coded index range, so no entry is skipped
# and the check also works on datasets that do not have exactly 100000 entries.
# Prints the key of each sample whose "gt_output_traj" contains a NaN.
for idx, sample in data.items():
    np_traj = np.array(sample["gt_output_traj"])
    if np.isnan(np_traj).any():
        print(idx)

# The recorded run (which only scanned idx 57876-99999) reported idx 57876 as
# the only sample with an invalid trajectory

57876
