In [None]:
import subprocess
import sys

def install(package):
    """Install ``package`` with pip under the interpreter running this code.

    The command is ``<sys.executable> -m pip install <package>``, so the
    package lands in the environment of the current interpreter.

    Args:
        package: Requirement specifier handed to pip unchanged
            (for example ``'name==1.2.3'``).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)
    
# Install the pinned transformers 4.18.0 release before it is imported below.
install('transformers==4.18.0')

import torch

import os
import json
import numpy as np

from transformers import AutoTokenizer, AutoModel

In [None]:
def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The global maximum is subtracted before exponentiating so the largest
    exponent is ``exp(0)``; the result is normalised by the sum over every
    element (there is no per-axis handling).
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / exponentials.sum()

In [None]:
def file_name_2_coco_path(memory_graph, dialog):
    """Map the first media file name of each memory to that media's ``media_id``.

    Args:
        memory_graph: Container indexed by memory graph id; each entry exposes
            a ``'memories'`` list.
        dialog: Iterable of dialogs, each carrying a ``'memory_graph_id'``.

    Returns:
        dict of ``{file_name: media_id}`` built from ``media[0]`` of every
        memory in the graphs referenced by ``dialog`` (presumably the
        media_id is a COCO path -- confirm against the data). When a file
        name occurs more than once, the last occurrence wins.
    """
    return {
        memory['media'][0]['file_name']: memory['media'][0]['media_id']
        for cur_dialog in dialog
        for memory in memory_graph[cur_dialog['memory_graph_id']]['memories']
    }

In [None]:
def get_memory_id_day_dict(memory_graph):
    """Build a ``{'<graph_id>_<memory_id>': day_id}`` lookup.

    Args:
        memory_graph: Mapping keyed by memory graph id. Each graph is walked
            as ``memory_groups -> days -> events``; every event lists indices
            into the graph's ``'memories'`` list.

    Returns:
        dict mapping ``'<graph_id>_<memory_id>'`` to the ``day_id`` of the day
        whose event references that memory.
    """
    day_by_memory = {}
    for graph_id, graph in memory_graph.items():
        for trip in graph['memory_groups']:
            for day in trip['days']:
                day_id = day['day_id']
                # Event entries are positions in graph['memories'], not memory ids.
                day_by_memory.update(
                    (f"{graph_id}_{graph['memories'][idx]['memory_id']}", day_id)
                    for event in day['events']
                    for idx in event['memories']
                )
    return day_by_memory

In [None]:
def get_memory_id_trip_day_dict(memory_graph):
    """Build a ``{'<graph_id>_<memory_id>': '<trip_id>_<day_id>'}`` lookup.

    Args:
        memory_graph: Mapping keyed by memory graph id. Each graph is walked
            as ``memory_groups -> days -> events``; every event lists indices
            into the graph's ``'memories'`` list.

    Returns:
        dict mapping ``'<graph_id>_<memory_id>'`` to the ``'<trip_id>_<day_id>'``
        label of the trip/day whose event references that memory.
    """
    trip_day_by_memory = {}
    for graph_id, graph in memory_graph.items():
        for trip in graph['memory_groups']:
            trip_id = trip['trip_id']
            for day in trip['days']:
                trip_day = f"{trip_id}_{day['day_id']}"
                # Event entries are positions in graph['memories'], not memory ids.
                trip_day_by_memory.update(
                    (f"{graph_id}_{graph['memories'][idx]['memory_id']}", trip_day)
                    for event in day['events']
                    for idx in event['memories']
                )
    return trip_day_by_memory

In [None]:
def get_memory_id_list(dialog_idx, turn_idx, dialog, memory_graph, memory_id_day_dict):
    """List the memory keys of the dialog's graph that are visible at a turn.

    Args:
        dialog_idx: Index of the dialog inside ``dialog``.
        turn_idx: Index of the turn inside ``dialog[dialog_idx]['dialogue']``.
        dialog: Sequence of dialogs.
        memory_graph: Mapping keyed by memory graph id; only used to check
            that the dialog's graph id is known.
        memory_id_day_dict: ``{'<graph_id>_<memory_id>': day_id}`` lookup.

    Returns:
        Keys of ``memory_id_day_dict`` that belong to the dialog's graph. When
        the turn's system annotation lists ground-truth memories, only keys
        whose day is not later than the latest ground-truth day are kept;
        otherwise every key of the graph is returned.

    Raises:
        KeyError: If the dialog's graph id is missing from ``memory_graph``, or
            a ground-truth memory is missing from ``memory_id_day_dict``.
    """
    cur_memory_graph_id = dialog[dialog_idx]['memory_graph_id']
    if cur_memory_graph_id not in memory_graph:
        # Keep the fail-fast behaviour of the former memory_graph[...] lookup.
        raise KeyError(cur_memory_graph_id)

    turn = dialog[dialog_idx]['dialogue'][turn_idx]
    cur_turn_memory_list = turn['system_transcript_annotated'][0]['act_attributes']['memories']

    # Match on the '<graph_id>_' prefix. A plain substring test would also pick
    # up other graphs (graph '1' is a substring of '11_7' and of '2_1').
    key_prefix = f'{cur_memory_graph_id}_'
    graph_keys = [k for k in memory_id_day_dict if k.startswith(key_prefix)]

    if len(cur_turn_memory_list) == 0:
        # No ground-truth memory for this turn: expose the whole graph.
        return graph_keys

    # Ground-truth memories exist: cut off at the latest day they occur on.
    last_day = max(memory_id_day_dict[f'{key_prefix}{x}'] for x in cur_turn_memory_list)
    return [k for k in graph_keys if memory_id_day_dict[k] <= last_day]

In [None]:
def get_attr_list(memory):
    """Flatten one memory record into a list of attribute values.

    The result holds, in order: ``'memory_id_<id>'``, ``time``, ``narrations``,
    a space-joined location string (place, place type, categories, city,
    state, country), the first activity name, the space-joined participant
    names (``None`` names skipped; entry omitted when there are no
    participants), the space-joined objects (omitted when empty) and
    ``time_part``.

    The media file is deliberately not part of the attribute list.
    """
    attrs = [
        f'memory_id_{memory["memory_id"]}',
        memory['time'],
        memory['narrations'],
    ]

    # Location: every field collapsed into a single space-separated string.
    geo_tag = memory['location']['geo_tag']
    location_parts = [
        geo_tag['place'],
        geo_tag['place_type'],
        ' '.join(memory['location']['category']),
        geo_tag['city'],
        geo_tag['state'],
        geo_tag['country'],
    ]
    attrs.append(' '.join(location_parts))

    # Activity: only the first listed activity is used.
    attrs.append(memory['activity'][0]['activity_name'])

    # Participants: optional, unnamed participants are dropped.
    participants = memory['participant']
    if len(participants) > 0:
        names = [person['name'] for person in participants]
        attrs.append(' '.join(name for name in names if name is not None))

    # Objects: optional.
    objects = memory['objects']
    if len(objects) > 0:
        attrs.append(' '.join(objects))

    attrs.append(memory['time_part'])
    return attrs

In [None]:
def get_memory_attr_list(memory_graph, graph_id, memory_id):
    """Return the attribute list of one memory inside one graph.

    Args:
        memory_graph: Mapping keyed by memory graph id.
        graph_id: Key of the graph to search.
        memory_id: Id of the wanted memory. Both sides are compared as
            strings, so ``5`` and ``'5'`` address the same memory.

    Returns:
        ``get_attr_list`` applied to the first memory whose id matches.

    Raises:
        ValueError: If the graph holds no memory with that id (this used to
            surface as an UnboundLocalError).
    """
    wanted_id = str(memory_id)
    for candidate in memory_graph[graph_id]['memories']:
        if str(candidate['memory_id']) == wanted_id:
            return get_attr_list(candidate)

    raise ValueError(f'memory_id {memory_id!r} not found in memory graph {graph_id!r}')

In [None]:
def calc_cos_similarity(question, mem_attr_list, tokenizer, model, option):
    """Score memory attributes against a question with cosine similarity.

    The question and every attribute string are embedded as the mean of the
    model's last hidden layer over their own (non-padding) tokens; each
    attribute embedding is then compared with the question embedding.

    Args:
        question: The utterance; converted with ``str()`` before tokenizing.
        mem_attr_list: List of attribute strings, tokenized as one padded batch.
        tokenizer: Callable returning a mapping of model inputs.
        model: Callable whose output exposes ``hidden_states``.
        option: ``'attribute'`` returns one similarity per attribute;
            ``'memory'`` returns the mean of the five highest similarities.

    Returns:
        list of float for ``'attribute'``, a numpy float for ``'memory'``.

    Raises:
        ValueError: If ``option`` is neither ``'memory'`` nor ``'attribute'``.
    """
    if option not in ('memory', 'attribute'):
        raise ValueError(f"option must be 'memory' or 'attribute', got {option!r}")

    def _mean_pool(encoded):
        """Average the last hidden layer over the real tokens of each sequence."""
        with torch.no_grad():
            output = model(**encoded)
        last_hidden = output.hidden_states[-1]  # (batch, seq, hidden)
        if 'attention_mask' in encoded:
            mask = encoded['attention_mask']
        else:
            # No mask available: treat every position as a real token.
            mask = torch.ones(last_hidden.shape[:2], device=last_hidden.device)
        mask = mask.unsqueeze(-1).to(last_hidden.dtype)
        # Padding positions are zeroed out so they do not dilute the mean.
        return (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    question_vec = _mean_pool(tokenizer(str(question), return_tensors="pt"))  # (1, hidden)
    attr_vecs = _mean_pool(
        tokenizer(mem_attr_list, return_tensors="pt", padding=True)
    )  # (n_attr, hidden)

    # Reduce over the hidden axis: exactly one score per attribute, also when
    # the batch holds a single attribute.
    sim_list = torch.cosine_similarity(question_vec, attr_vecs, dim=1).tolist()

    if option == 'attribute':
        return sim_list

    sim_list.sort(reverse=True)
    return np.array(sim_list[:5]).mean()

In [None]:
def get_random_sample(similarity_dict, option):
    """Draw keys of ``similarity_dict`` at random, weighted by their values.

    Args:
        similarity_dict: ``{key: weight}``; the weights are re-normalised to
            sum to one before sampling.
        option: ``'memory'`` draws 4 keys, ``'attribute'`` draws 32.

    Returns:
        numpy array of sampled keys. Sampling uses ``np.random.choice`` with
        its default settings, i.e. with replacement, so keys can repeat.

    Raises:
        ValueError: If ``option`` is neither ``'memory'`` nor ``'attribute'``
            (this used to surface as an UnboundLocalError).
    """
    sample_sizes = {'memory': 4, 'attribute': 32}
    if option not in sample_sizes:
        raise ValueError(f"option must be 'memory' or 'attribute', got {option!r}")

    keys = list(similarity_dict.keys())
    weights = np.array(list(similarity_dict.values()))
    probabilities = weights / weights.sum()

    return np.random.choice(keys, size=sample_sizes[option], p=probabilities)

In [None]:
def get_memory_option_1(dialog_idx, memory_graph, dialog):
    """Option 1: sample four memories uniformly from the dialog's whole graph.

    The turn is ignored; every memory of the graph is a candidate.

    Args:
        dialog_idx: Index of the dialog inside ``dialog``.
        memory_graph: Iterable of graphs, each with ``'memory_graph_id'`` and
            ``'memories'``.
        dialog: Sequence of dialogs, each with ``'memory_graph_id'``.

    Returns:
        Flat list with the attributes of the four sampled memories. Indices
        come from ``np.random.randint``, so a memory can be drawn repeatedly.

    Raises:
        ValueError: If no graph matches the dialog's ``memory_graph_id`` (this
            used to surface as an UnboundLocalError). ``np.random.randint``
            also raises ValueError when the graph has no memories.
    """
    target_graph_id = dialog[dialog_idx]['memory_graph_id']
    cur_memory_graph = next(
        (graph for graph in memory_graph if graph['memory_graph_id'] == target_graph_id),
        None,
    )
    if cur_memory_graph is None:
        raise ValueError(f'no memory graph with id {target_graph_id!r}')

    memories = cur_memory_graph['memories']
    memory_idx_list = np.random.randint(low=0, high=len(memories), size=4)

    attr_sample = []
    for memory_idx in memory_idx_list:
        # extend() flattens as we go instead of the quadratic sum(lists, []).
        attr_sample.extend(get_attr_list(memories[memory_idx]))

    return attr_sample

In [None]:
def get_memory_option_2(dialog_idx, turn_idx, memory_graph, dialog):
    """Option 2: return the attributes of every memory visible at a turn.

    Args:
        dialog_idx: Index of the dialog inside ``dialog``.
        turn_idx: Index of the turn inside the dialog.
        memory_graph: Iterable of graphs, each with ``'memory_graph_id'``.
        dialog: Sequence of dialogs.

    Returns:
        Flat list with the attributes of all memories that
        ``get_memory_id_list`` reports for the turn.
    """
    # Restrict the graphs to the one this dialog refers to, keyed by its id.
    target_graph_id = dialog[dialog_idx]['memory_graph_id']
    matching_graph = next(
        (graph for graph in memory_graph if graph['memory_graph_id'] == target_graph_id),
        None,
    )
    graph_by_id = {} if matching_graph is None else {matching_graph['memory_graph_id']: matching_graph}

    # {memory key: day}
    day_by_memory = get_memory_id_day_dict(graph_by_id)

    # Memory keys visible at this dialog turn.
    visible_memories = get_memory_id_list(dialog_idx, turn_idx, dialog, graph_by_id, day_by_memory)

    # NOTE(review): four random indices are drawn here but never used -- the
    # loop below walks every visible memory. Confirm whether sampling four
    # memories was intended. The draw still advances NumPy's global RNG.
    _unused_indices = np.random.randint(low=0, high=len(visible_memories), size=4)

    attr_sample = []
    for memory_key in visible_memories:
        # Keys look like '<graph_id>_<memory_id>'.
        key_parts = memory_key.split('_')
        attr_sample.extend(get_memory_attr_list(graph_by_id, key_parts[0], key_parts[1]))

    return attr_sample

In [None]:
def get_memory_option_3(dialog_idx, turn_idx, memory_graph, dialog, tokenizer, model):
    """Option 3: sample visible memories weighted by similarity to the question.

    Every memory visible at the turn gets a ``'memory'``-mode similarity to
    the user utterance; the scores are turned into probabilities with
    ``softmax`` and memories are drawn with ``get_random_sample``.

    Args:
        dialog_idx: Index of the dialog inside ``dialog``.
        turn_idx: Index of the turn inside the dialog.
        memory_graph: Iterable of graphs, each with ``'memory_graph_id'``.
        dialog: Sequence of dialogs.
        tokenizer: Tokenizer forwarded to ``calc_cos_similarity``.
        model: Model forwarded to ``calc_cos_similarity``.

    Returns:
        Flat list with the attributes of the sampled memories.
    """
    option = 'memory'

    # Restrict the graphs to the one this dialog refers to, keyed by its id.
    target_graph_id = dialog[dialog_idx]['memory_graph_id']
    matching_graph = next(
        (graph for graph in memory_graph if graph['memory_graph_id'] == target_graph_id),
        None,
    )
    graph_by_id = {} if matching_graph is None else {matching_graph['memory_graph_id']: matching_graph}

    # {memory key: day}
    day_by_memory = get_memory_id_day_dict(graph_by_id)

    # Memory keys visible at this dialog turn.
    visible_memories = get_memory_id_list(dialog_idx, turn_idx, dialog, graph_by_id, day_by_memory)

    # Question asked in the current turn.
    question = dialog[dialog_idx]['dialogue'][turn_idx]['transcript_annotated'][0]['uttr']

    # {memory key: attribute list}
    attr_dict = {}
    for memory_key in visible_memories:
        key_parts = memory_key.split('_')
        attr_dict[memory_key] = get_memory_attr_list(graph_by_id, key_parts[0], key_parts[1])

    # Similarity between the question and each memory.
    memory_keys = list(attr_dict)
    similarities = [
        calc_cos_similarity(question, attr_dict[memory_key], tokenizer, model, option)
        for memory_key in memory_keys
    ]

    # {memory key: sampling probability}
    prob_dict = dict(zip(memory_keys, softmax(similarities)))

    # Sample memories, then flatten their attribute lists.
    attr_sample = []
    for memory_key in get_random_sample(prob_dict, option):
        attr_sample.extend(attr_dict[memory_key])

    return attr_sample

In [None]:
def get_memory_option_4(dialog_idx, turn_idx, memory_graph, dialog, tokenizer, model):
    """Option 4: sample individual attributes weighted by question similarity.

    The attributes of all memories visible at the turn are pooled, scored in
    ``'attribute'`` mode against the user utterance, converted to
    probabilities with ``softmax`` and drawn with ``get_random_sample``.

    Args:
        dialog_idx: Index of the dialog inside ``dialog``.
        turn_idx: Index of the turn inside the dialog.
        memory_graph: Iterable of graphs, each with ``'memory_graph_id'``.
        dialog: Sequence of dialogs.
        tokenizer: Tokenizer forwarded to ``calc_cos_similarity``.
        model: Model forwarded to ``calc_cos_similarity``.

    Returns:
        list of sampled attributes.
    """
    option = 'attribute'

    # Restrict the graphs to the one this dialog refers to, keyed by its id.
    target_graph_id = dialog[dialog_idx]['memory_graph_id']
    matching_graph = next(
        (graph for graph in memory_graph if graph['memory_graph_id'] == target_graph_id),
        None,
    )
    graph_by_id = {} if matching_graph is None else {matching_graph['memory_graph_id']: matching_graph}

    # {memory key: day}
    day_by_memory = get_memory_id_day_dict(graph_by_id)

    # Memory keys visible at this dialog turn.
    visible_memories = get_memory_id_list(dialog_idx, turn_idx, dialog, graph_by_id, day_by_memory)

    # Question asked in the current turn.
    question = dialog[dialog_idx]['dialogue'][turn_idx]['transcript_annotated'][0]['uttr']

    # Pool the attributes of every visible memory into one flat list.
    flat_attrs = []
    for memory_key in visible_memories:
        key_parts = memory_key.split('_')
        flat_attrs.extend(get_memory_attr_list(graph_by_id, key_parts[0], key_parts[1]))

    # Similarity between the question and each attribute.
    similarities = calc_cos_similarity(question, flat_attrs, tokenizer, model, option)
    probabilities = softmax(similarities)

    # {attribute: sampling probability}; zip stops at the shorter sequence and
    # an attribute that occurs several times keeps its last probability.
    prob_dict = dict(zip(flat_attrs, probabilities))

    return list(get_random_sample(prob_dict, option))

In [None]:
def get_memory_option_6(dialog_idx, turn_idx, memory_graph, dialog, tokenizer, model):
    """Option 6: return all memories of the trip-day most similar to the question.

    Memories of the dialog's graph are grouped by ``'<trip_id>_<day_id>'``.
    Each group is scored with the mean ``'memory'``-mode similarity of its
    memories to the user utterance, and the attributes of the best-scoring
    group are returned.

    Args:
        dialog_idx: Index of the dialog inside ``dialog``.
        turn_idx: Index of the turn inside the dialog.
        memory_graph: Iterable of graphs, each with ``'memory_graph_id'``.
        dialog: Sequence of dialogs.
        tokenizer: Tokenizer forwarded to ``calc_cos_similarity``.
        model: Model forwarded to ``calc_cos_similarity``.

    Returns:
        Flat list with the attributes of every memory of the selected trip-day.

    Raises:
        ValueError: If no memory is found for the dialog's graph.
    """
    option = 'memory'
    memory_graph_id = dialog[dialog_idx]['memory_graph_id']

    # Restrict the graphs to the one this dialog refers to, keyed by its id.
    memory_graph_dict = dict()
    for graph in memory_graph:
        if graph['memory_graph_id'] == memory_graph_id:
            memory_graph_dict[graph['memory_graph_id']] = graph
            break

    # {memory key: '<trip_id>_<day_id>'}
    memory_id_trip_day_dict = get_memory_id_trip_day_dict(memory_graph_dict)

    # Question asked in the current turn.
    question = dialog[dialog_idx]['dialogue'][turn_idx]['transcript_annotated'][0]['uttr']

    # Group memory keys per trip-day. A dict keeps first-seen order, so tie
    # breaking between equally similar trip-days is deterministic (iterating a
    # set of strings is not).
    memories_by_trip_day = dict()
    for memory_key, trip_day in memory_id_trip_day_dict.items():
        memories_by_trip_day.setdefault(trip_day, []).append(memory_key)

    if not memories_by_trip_day:
        raise ValueError(f'no memories found for memory graph {memory_graph_id!r}')

    # Every key is '<memory_graph_id>_<memory_id>' for this one graph, so the
    # memory id is whatever follows that prefix.
    prefix_len = len(f'{memory_graph_id}_')

    attrs_by_memory = dict()
    t_d_sim_dict = dict()
    for trip_day, memory_keys in memories_by_trip_day.items():
        sim_list = []
        for memory_key in memory_keys:
            mem_attr_list = get_memory_attr_list(
                memory_graph_dict, memory_graph_id, memory_key[prefix_len:]
            )
            attrs_by_memory[memory_key] = mem_attr_list
            sim_list.append(calc_cos_similarity(question, mem_attr_list, tokenizer, model, option))
        t_d_sim_dict[trip_day] = np.mean(sim_list)

    # Trip-day with the highest mean similarity (first one wins on ties).
    best_trip_day = max(t_d_sim_dict, key=t_d_sim_dict.get)

    # Collect the memories of the *selected* trip-day. The previous code
    # reused the list left over from the last loop iteration, so it returned
    # an arbitrary trip-day instead of the best one.
    attr_sample = []
    for memory_key in memories_by_trip_day[best_trip_day]:
        attr_sample.extend(attrs_by_memory[memory_key])

    return attr_sample

In [None]:
# Directory that holds the dialogs and the memory graphs.
comet_data_dir = '/home/work/.data/comet_memory_dialog/data'

# The memory graphs are split over two files; concatenate them into one list.
# Files are opened with `with` so the handles are closed right after parsing
# (json.load(open(...)) left them open until garbage collection).
memory_graph = []
for graph_file in ('mscoco_memory_graphs_1k.json', 'memory_may21_v1_100graphs.json'):
    with open(os.path.join(comet_data_dir, graph_file), encoding='utf-8') as fp:
        memory_graph = memory_graph + json.load(fp)

split = 'train' # train/test/val
with open(os.path.join(comet_data_dir, f'mem_dials_{split}.json'), encoding='utf-8') as fp:
    dialog = json.load(fp)['dialogue_data']

# BERT encoder used for the similarity scores; hidden states are requested
# because calc_cos_similarity reads `hidden_states` from the model output.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states = True).eval()

In [None]:
import time

In [None]:
# Time option 1 once per turn of the first dialog.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for d_idx, d in enumerate(dialog[:1]):
    for t_idx in range(len(d['dialogue'])):
        # Use the loop's dialog index rather than a hard-coded 0 so the cell
        # stays correct if the slice above is widened.
        get_memory_option_1(d_idx, memory_graph, dialog)
end = time.perf_counter()

print(end-start)

In [None]:
# Time option 2 once per turn of the first dialog.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for dialog_idx, d in enumerate(dialog[:1]):
    for turn_idx in range(len(d['dialogue'])):
        get_memory_option_2(dialog_idx, turn_idx, memory_graph, dialog)
end = time.perf_counter()

print(end-start)

In [None]:
# Time option 3 once per turn of the first dialog.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for dialog_idx, d in enumerate(dialog[:1]):
    for turn_idx in range(len(d['dialogue'])):
        get_memory_option_3(dialog_idx, turn_idx, memory_graph, dialog, tokenizer, model)
end = time.perf_counter()

print(end-start)

In [None]:
# Time option 4 once per turn of the first dialog.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for dialog_idx, d in enumerate(dialog[:1]):
    for turn_idx in range(len(d['dialogue'])):
        get_memory_option_4(dialog_idx, turn_idx, memory_graph, dialog, tokenizer, model)
end = time.perf_counter()

print(end-start)

In [None]:
# Time option 6 once per turn of the first dialog.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for dialog_idx, d in enumerate(dialog[:1]):
    for turn_idx in range(len(d['dialogue'])):
        get_memory_option_6(dialog_idx, turn_idx, memory_graph, dialog, tokenizer, model)
end = time.perf_counter()

print(end-start)

In [1]:
import os

# Path of the pickle file to inspect.
file_path = '/data/data2/khahn/comet_memory_dialog_former/update_memory_graph/created_memory/train/option_3/option_3_116.pkl'

# File size in bytes, read from the file's stat record.
file_size = os.stat(file_path).st_size

print(f"파일 용량: {file_size} 바이트")


파일 용량: 3220 바이트


In [2]:
import os

# Path of the pickle file to inspect.
file_path = '/data/data2/khahn/comet_memory_dialog_former/update_memory_graph/created_memory/train/option_3/option_3_116.pkl'

# File size in bytes, read from the file's stat record.
file_size = os.stat(file_path).st_size

# Convert bytes to gigabytes (1 GB taken as 1024 ** 3 bytes).
file_size_gb = file_size / 1024 ** 3

print(f"파일 용량: {file_size_gb:.2f} GB")


파일 용량: 0.00 GB
