In [21]:
import pandas as pd
import numpy as np
import json
from ast import literal_eval

In [None]:
#epic-kitchens-100-annotations dataset
def columnStringToInt(row):
    """Return the integer that follows the last underscore of ``row['narration_id']``.

    Args:
        row: any mapping-like object (for example a DataFrame row) exposing a
            string under the ``'narration_id'`` key.

    Returns:
        The trailing part of the id converted with ``int``; for instance
        ``'P01_01_12'`` gives ``12``.

    Raises:
        ValueError: if the id has no underscore, or the text after the last
            underscore is not an integer literal.
    """
    narration_id = row['narration_id']
    # rindex (not rfind) so that an id without any underscore fails loudly.
    suffix_start = narration_id.rindex('_') + 1
    return int(narration_id[suffix_start:])

def preprocess_raw_data_to_create_prompts(file_name:str) -> list:
    """Build one '=>'-separated prompt per video from an annotations CSV.

    The CSV must provide the columns ``narration_id``, ``participant_id``,
    ``video_id`` and ``narration``.

    Args:
        file_name: path of the CSV file to read.

    Returns:
        A list of strings, one per (participant, video) pair. Participants
        keep their order of first appearance in the file; within a participant
        the videos are ordered by the number after the first underscore of
        ``video_id``; within a video the narrations are ordered by the number
        after the last underscore of ``narration_id`` and joined with ``'=>'``.

    Raises:
        ValueError: if a ``video_id`` has no underscore, or an id suffix is not
            an integer.
    """
    separator = '=>'
    df = pd.read_csv(file_name)
    # Vectorised extraction of the numeric suffix used to order narrations.
    df['narration_id_int'] = (
        df['narration_id'].str.rsplit(pat='_', n=1).str[-1].astype(int)
    )

    def video_number(video_id):
        # Numeric part of the id, e.g. 'P01_101' -> 101; used for ordering only.
        return int(video_id[video_id.index('_') + 1:])

    prompts = []
    for participant_id in df['participant_id'].unique():
        participant_df = df[df['participant_id'] == participant_id]
        # Sort the ORIGINAL id strings by their numeric suffix. Rebuilding the id
        # from the integer (with a fixed zero padding) would miss every id that
        # is padded differently and silently produce an empty prompt for it.
        video_ids = sorted(participant_df['video_id'].unique(), key=video_number)
        for video_id in video_ids:
            video_df = participant_df[participant_df['video_id'] == video_id]
            ordered_narrations = video_df.sort_values(by=['narration_id_int'])['narration']
            prompts.append(separator.join(ordered_narrations))

    return prompts

In [None]:
# EPIC-KITCHENS-100: convert each split's annotation CSV into a prompt file
# holding one prompt per line.
train_file_name = 'epic_kitchens_train_data.csv'
train_prompts = preprocess_raw_data_to_create_prompts(train_file_name)
with open('epic_kitchens_train_prompt.txt', 'w') as f:
    f.writelines(f"{prompt}\n" for prompt in train_prompts)

validation_file_name = 'epic_kitchens_validation_data.csv'
validation_prompts = preprocess_raw_data_to_create_prompts(validation_file_name)
with open('epic_kitchens_validation_prompt.txt', 'w') as f:
    f.writelines(f"{prompt}\n" for prompt in validation_prompts)

test_file_name = 'epic_kitchens_test_data.csv'
test_prompts = preprocess_raw_data_to_create_prompts(test_file_name)
with open('epic_kitchens_test_prompt.txt', 'w') as f:
    f.writelines(f"{prompt}\n" for prompt in test_prompts)

In [None]:
# VirtualHome dataset
separator = '=>'
prompts = []

with open('virtual_home_actions.json', 'r') as f:
    raw_action_list = json.load(f)

# Every entry of the JSON list is one newline-separated block of text.
actions_list = [raw_action.split('\n') for raw_action in raw_action_list]
for action_lines in actions_list:
    # Keep what follows the first ':' on each line, without surrounding whitespace.
    # A line with no ':' raises ValueError.
    texts = [line[line.index(':') + 1:].strip() for line in action_lines]
    # The first line is not a step and is left out of the prompt
    # (presumably it is the task title -- TODO confirm against the data file).
    prompts.append(separator.join(texts[1:]))

In [None]:
train_prompt_file_name = 'virtual_home_train_prompt.txt'
validation_prompt_file_name = 'virtual_home_validation_prompt.txt'
test_prompt_file_name = 'virtual_home_test_prompt.txt'

# Sequential (unshuffled) train/validation/test split = 60/20/20.
total = len(prompts)
train_percentage = 60
validation_percentage = 20
test_percentage = 20  # informational: the test split takes whatever remains
index_train = total * train_percentage // 100            # number of train prompts
index_validation = total * validation_percentage // 100  # number of validation prompts

# Python slices are half-open, so each split must start exactly where the
# previous one ends. Starting at "end + 1" would silently drop one prompt at
# each boundary.
validation_end = index_train + index_validation
train_prompts = prompts[:index_train]
validation_prompts = prompts[index_train:validation_end]
test_prompts = prompts[validation_end:]
assert len(train_prompts) + len(validation_prompts) + len(test_prompts) == total

# Write one prompt per line for every split.
for split_file_name, split_prompts in (
    (train_prompt_file_name, train_prompts),
    (validation_prompt_file_name, validation_prompts),
    (test_prompt_file_name, test_prompts),
):
    with open(split_file_name, 'w') as f:
        for prompt in split_prompts:
            f.write(f"{prompt}\n")

In [55]:
#Fine-Grained-R2R
def preprocess_raw_json_data_to_create_prompts(file_name):
    """Build one '=>'-separated prompt per record of a JSON file.

    The file must contain a JSON list of objects, each with a
    ``'new_instructions'`` string holding a Python literal. That literal is
    parsed with ``ast.literal_eval`` into a sequence of instructions. For each
    instruction, element ``[0]`` (a sequence of strings) is joined with single
    spaces to form one step, and the steps are joined with ``'=>'``.

    NOTE(review): only element ``[0]`` of every instruction is used; confirm
    against the dataset schema that the remaining elements are meant to be
    ignored.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        A list of prompt strings, in the same order as the records in the file.
    """
    with open(file_name, 'r') as json_file:
        records = json.load(json_file)

    return [
        '=>'.join(
            ' '.join(instruction[0])
            for instruction in literal_eval(record['new_instructions'])
        )
        for record in records
    ]


In [58]:
# Fine-Grained R2R: input JSON files, one per split.
train_file_name_Fine_Grained_R2R = 'FGR2R_train.json'
test_file_name_Fine_Grained_R2R = 'FGR2R_test.json'
validation_file_name_Fine_Grained_R2R = 'FGR2R_validation.json'

# Build the prompts of every split before writing anything.
train_prompts = preprocess_raw_json_data_to_create_prompts(train_file_name_Fine_Grained_R2R)
test_prompts = preprocess_raw_json_data_to_create_prompts(test_file_name_Fine_Grained_R2R)
validation_prompts = preprocess_raw_json_data_to_create_prompts(validation_file_name_Fine_Grained_R2R)

# Output text files, one prompt per line.
train_prompt_Fine_Grained_R2R_file_name = 'FGR2R_train_prompt.txt'
validation_prompt_Fine_Grained_R2R_file_name = 'FGR2R_validation_prompt.txt'
test_prompt_file_Fine_Grained_R2R_name = 'FGR2R_test_prompt.txt'

with open(train_prompt_Fine_Grained_R2R_file_name, 'w') as f:
    f.writelines(f"{prompt}\n" for prompt in train_prompts)

with open(validation_prompt_Fine_Grained_R2R_file_name, 'w') as f:
    f.writelines(f"{prompt}\n" for prompt in validation_prompts)

with open(test_prompt_file_Fine_Grained_R2R_name, 'w') as f:
    f.writelines(f"{prompt}\n" for prompt in test_prompts)