In [16]:
from my_funcs import (
    default_transformation, read_json_from_data_dir,
    normalize, embed_query_retrieve_examples,
    make_two_type_msg, get_python_chat_prompt, 
    parse_python_completion, update_dialogue_state, 
    compute_acc, calculate_token_f1, evaluate,
    DataOntologyNormalizer, Ontology,
    copy, defaultdict, random,
    tiktoken, SentenceTransformer,
)

import os
import json
import pandas as pd
from collections import Counter
from refpydst.prompt_formats.python.completion_parser import *


In [17]:
import openai
from my_openai_key import get_openai_key
# Set the module-level OpenAI API key from the project-local `my_openai_key` helper,
# so no key literal appears in this notebook.
openai.api_key = get_openai_key()

In [None]:
# Load the run log to analyse (./bm25_log.json) and the 5% train split.
# NOTE(review): this cell reads '../data/...' while later cells read 'data/...';
# confirm which working directory the notebook is meant to run from.
with open('./bm25_log.json', 'r') as f:
    logs = json.load(f)
with open('../data/mw21_5p_train_v1.json', 'r') as f:
    train_dataset = json.load(f)

total_acc, total_f1 = 0, 0
jga_by_turn_id = defaultdict(list)  # declared for per-turn accuracy; not populated in this cell
jga_by_dialog = defaultdict(list)  # declared for per-dialogue accuracy; not populated in this cell
n_correct = 0
n_total = len(logs)

# One list of turn ids per error category named in the log's `error` field.
delta_hall_val, delta_hall_overwrite, delta_hall_total = [], [], []
delta_miss_delete, delta_miss_dontcare, delta_miss_confuse, delta_miss_total = [], [], [], []

# Explicit name -> list dispatch table. This replaces `exec()` on a string taken from the
# log file, which would execute whatever code the log contained and could append to any
# object that happened to be in scope.
error_buckets = {
    'delta_hall_val': delta_hall_val,
    'delta_hall_overwrite': delta_hall_overwrite,
    'delta_hall_total': delta_hall_total,
    'delta_miss_delete': delta_miss_delete,
    'delta_miss_dontcare': delta_miss_dontcare,
    'delta_miss_confuse': delta_miss_confuse,
    'delta_miss_total': delta_miss_total,
}
n_unbucketed = 0  # failed turns whose error entry is malformed or names an unknown category

for data_item in logs:
    pred = data_item['pred_slot_values']
    this_jga, this_acc, this_f1 = evaluate(pred, data_item['slot_values'])
    total_acc += this_acc
    total_f1 += this_f1
    # Only failed turns that carry at least one error annotation are bucketed.
    if this_jga == 1 or data_item['error'] == []:
        continue
    try:
        # The first element of the first error entry names the category.
        error_type = data_item['error'][0][0]
        error_buckets[error_type].append(data_item['turn_id'])
    except (LookupError, TypeError):
        # Still best-effort (the turn is skipped), but counted instead of silently dropped.
        n_unbucketed += 1

print(len(delta_hall_val), len(delta_hall_overwrite), len(delta_hall_total))
print(len(delta_miss_delete), len(delta_miss_dontcare), len(delta_miss_confuse), len(delta_miss_total))
if n_unbucketed:
    print(f'skipped {n_unbucketed} failed turn(s) with a malformed or unknown error entry')


In [None]:
# Inputs: the sampled dev queries, the 5% train split, and the running log of a finished run.
with open("data/mw24_100p_dev_100_sampled.json", 'r') as f:
    dev_dataset = json.load(f)
with open('data/mw21_5p_train_v1.json', 'r') as f:
    train_dataset = json.load(f)

with open('outputs/runs/table4/5p/smapling_exp/split_v1_topk_bm_5_fs_5_0523_0315/running_log.json', 'r') as f:
    logs = json.load(f)

# Running totals over every logged turn.
total_acc, total_f1 = 0, 0
jga_by_turn_id = defaultdict(list)  # declared for per-turn accuracy; not populated in this cell
jga_by_dialog = defaultdict(list)  # declared for per-dialogue accuracy; not populated in this cell
n_correct = 0
n_total = len(logs)

# Failed turns that have `final_scores`, split by whether the turn-level prediction also fails:
#   hard  -> the turn-level prediction is wrong too
#   wrong -> the turn-level prediction is right, only the accumulated state is wrong
wrong, hard = [], []

for data_item in logs:
    pred = data_item['pred']
    this_jga, this_acc, this_f1 = evaluate(pred, data_item['slot_values'])
    total_acc += this_acc
    total_f1 += this_f1

    # Skip turns that were correct or that carry no retrieval scores.
    if this_jga == 1 or data_item.get('final_scores', None) is None:
        continue

    turn_level_jga = evaluate(data_item['pred_turn_slot_values'], data_item['turn_slot_values'])[0]
    if turn_level_jga == 1:
        wrong.append(data_item)
    else:
        hard.append(data_item)

len(wrong), len(hard), len(wrong)+len(hard)

In [None]:
# Read the 5% train split that is handed to the normalizer as its supervised set.
with open('data/mw21_5p_train_v1.json', 'r') as f:
    train_data = json.load(f)

ontology = Ontology.create_ontology()
normalizer = DataOntologyNormalizer(
    ontology,
    # labels are counted from the train set
    supervised_set=train_data,
    # reuse the surface-form knowledge already encoded in the ontology.json released with each
    # dataset; see README.json within
    # https://github.com/smartyfh/MultiWOZ2.4/raw/main/data/MULTIWOZ2.4.zip
    counts_from_ontology_file="src/refpydst/db/multiwoz/2.4/ontology.json",
)

In [None]:
def _train_example_key(example):
    """Return the '<ID>_turn_<turn_id>' key under which `final_scores` names a training example."""
    return example["ID"] + '_turn_' + str(example['turn_id'])


def _joint_goal_match(pred_slot_values, gold_slot_values):
    """Return 1 if the prediction equals gold once '|'-separated gold alternatives are resolved, else 0."""
    resolved_gold = dict(gold_slot_values)  # copy, so the logged gold labels are never modified
    for slot, gold_value in gold_slot_values.items():
        if '|' not in gold_value:
            continue
        # A gold value such as 'a|b' accepts any listed alternative; adopt the predicted one.
        if slot in pred_slot_values and pred_slot_values[slot] in gold_value.split('|'):
            resolved_gold[slot] = pred_slot_values[slot]
    return 1 if pred_slot_values == resolved_gold else 0


def new_pred_with_more_sample(train_dataset, wrong_logs, num_query=60, num_shot=20, score_type='score_delta',
                              model='gpt-3.5-turbo-0125', seed=None):
    """Re-prompt a random sample of failed turns with their top-scored examples and re-score them.

    For each sampled turn, the `num_shot` training examples with the highest
    `final_scores[score_type]` are used as in-context examples, one chat completion is
    requested, and the parsed prediction is scored at full-state and turn (delta) level.
    Old and new predictions are scored with the same rule, so the columns are comparable.

    Relies on names defined elsewhere in the notebook: `openai` (configured client),
    `normalizer`, `get_python_chat_prompt`, `parse_python_completion`,
    `update_dialogue_state`, `random` and `pd`.

    Side effect: every sampled item of `wrong_logs` gains the keys `new_completion`,
    `new_pred` and `new_pred_delta`.

    Args:
        train_dataset: list of training turns, each with `ID` and `turn_id`.
        wrong_logs: list of logged turns whose full-state prediction is wrong; each must carry
            `final_scores`, `pred`, `pred_turn_slot_values`, `slot_values`, `turn_slot_values`,
            `last_slot_values`, `last_turn_slot_values`, `ID` and `turn_id`.
        num_query: number of turns to sample; clamped to `len(wrong_logs)`.
        num_shot: number of in-context examples per query.
        score_type: key inside `final_scores` selecting which score ranks the examples.
        model: chat model name passed to the OpenAI API.
        seed: optional seed for reproducible sampling; `None` uses the shared `random` state.

    Returns:
        pandas.DataFrame indexed by the sampled position in `wrong_logs`, with columns
        `ID`, `old_jga_full`, `new_jga_full`, `old_jga_delta`, `new_jga_delta`
        (empty, with the same columns, when nothing is sampled).

    Raises:
        IndexError: a scored example id has no matching entry in `train_dataset`.
        AssertionError: a sampled turn's logged full-state prediction is actually correct.
    """
    columns = ['ID', 'old_jga_full', 'new_jga_full', 'old_jga_delta', 'new_jga_delta']

    # Clamp the sample size: random.sample raises ValueError if asked for more than is available.
    n_sample = max(0, min(num_query, len(wrong_logs)))
    rng = random if seed is None else random.Random(seed)
    wrong_sampled_idx = rng.sample(range(len(wrong_logs)), n_sample)
    if not wrong_sampled_idx:
        return pd.DataFrame(columns=columns)

    # Index the train set once instead of scanning it for every retrieved example.
    # setdefault keeps the first occurrence of a key, as a front-to-back scan would.
    examples_by_id = {}
    for example in train_dataset:
        examples_by_id.setdefault(_train_example_key(example), example)

    records = []
    for idx in wrong_sampled_idx:
        data_item = wrong_logs[idx]

        # Highest-scored example ids first, truncated to the shot budget.
        best_ex_id_score = sorted(
            data_item['final_scores'][score_type].items(), key=lambda kv: kv[1], reverse=True
        )[:num_shot]
        retrieved_examples = []
        for example_id, _ in best_ex_id_score:
            if example_id not in examples_by_id:
                raise IndexError(f'no training example with id {example_id!r}')
            retrieved_examples.append(examples_by_id[example_id])

        msg_chat, _ = get_python_chat_prompt(data_item, retrieved_examples)

        result = openai.chat.completions.create(
            model=model,
            messages=msg_chat,
            max_tokens=120,
            logprobs=True,
            temperature=0.0,
            stop=['\n\n', '#', 'print('],
        )
        # Map each completion text to its summed token log-probability and keep the most likely one.
        completions = {
            choice.message.content: sum(token.logprob for token in choice.logprobs.content)
            for choice in result.choices
        }
        best_completion = max(completions, key=completions.get)
        best_completion = best_completion.strip().replace('agent.state.', '')

        # Full-state prediction: parse against the previous state, normalize, then merge into it.
        predicted_slot_values = parse_python_completion(best_completion, data_item['last_slot_values'])
        predicted_slot_values = normalizer.normalize(predicted_slot_values)
        aggregate_slot_values = update_dialogue_state(data_item['last_slot_values'], predicted_slot_values)
        # Keep only the first alternative of any '|'-joined value.
        aggregate_slot_values = {k: v.split('|')[0] for k, v in aggregate_slot_values.items()}

        # Turn-level (delta) prediction: same completion parsed against the previous turn's delta.
        pred_delta_slot_values = parse_python_completion(best_completion, data_item['last_turn_slot_values'])
        pred_delta_slot_values = normalizer.normalize(pred_delta_slot_values)

        data_item['new_completion'] = best_completion
        data_item['new_pred'] = aggregate_slot_values
        data_item['new_pred_delta'] = pred_delta_slot_values

        # Score old and new predictions with one rule so the comparison is not biased
        # by '|'-alternative gold values.
        new_jga_full = _joint_goal_match(aggregate_slot_values, data_item['slot_values'])
        new_jga_delta = _joint_goal_match(pred_delta_slot_values, data_item['turn_slot_values'])
        old_jga_delta = _joint_goal_match(data_item['pred_turn_slot_values'], data_item['turn_slot_values'])
        old_jga_full = _joint_goal_match(data_item['pred'], data_item['slot_values'])
        assert old_jga_full == 0, 'wrong_logs must only contain turns whose full-state prediction is wrong'

        records.append({
            'ID': data_item['ID']+f'_turn_{data_item["turn_id"]}',
            'old_jga_full': old_jga_full,
            'new_jga_full': new_jga_full,
            'old_jga_delta': old_jga_delta,
            'new_jga_delta': new_jga_delta
        })

    # Build the frame once from the collected records; rows keep the sampling order.
    return pd.DataFrame(records, index=wrong_sampled_idx, columns=columns)


In [None]:
# For the sampled queries: retrieve examples, generate, and evaluate
df = new_pred_with_more_sample(
    train_dataset=train_dataset,
    wrong_logs=hard,
    num_query=60,
    num_shot=20,
    score_type='score_delta',
)

In [None]:
# Sum of the `new_jga_delta` column of `df` (displayed as the cell output).
df['new_jga_delta'].sum()