In [1]:
import os
import json
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_dataset



In [2]:
# Count how many distinct IDs appear in the data.
from collections import Counter

name = [record["ID"] for record in data]
print(f"Total ID count: {len(set(name))}")

# Average number of records (turns) per ID.
count = Counter(name)
avg = sum(count.values()) / len(count)
print(f"avg of turns per ID: {avg}")

Total ID count: 200
avg of turns per ID: 7.235


In [240]:
# Belief state rendered in dictionary form
def make_questions_bf_dict(data):
    """Build one prompt string per record, with the belief state as a dict repr.

    Every prompt has the shape
    ``"[belief] <str(last_slot_values)> [system] <sys> [user] <usr>"``,
    where ``<sys>`` / ``<usr>`` are the last elements of
    ``record['dialog']['sys']`` and ``record['dialog']['usr']``.

    Args:
        data: Sequence of records, each providing ``'dialog'`` (with ``'sys'``
            and ``'usr'`` sequences of strings) and ``'last_slot_values'``.

    Returns:
        A list of prompt strings, in the same order as ``data``.
    """
    def render(record):
        last_sys = record['dialog']['sys'][-1]
        last_usr = record['dialog']['usr'][-1]
        belief_repr = str(record['last_slot_values'])
        # str.join (like "+") rejects non-string utterances with a TypeError.
        return " ".join(
            ("[belief]", belief_repr, "[system]", last_sys, "[user]", last_usr)
        )

    return [render(record) for record in data]


In [294]:
# Belief state rendered as a sentence
def make_questions(data):
    """Build one prompt string per record, with the belief state as a sentence.

    The belief state is written as ``"<slot> is <value>"`` phrases joined by
    ``", "``; when ``last_slot_values`` is empty the literal
    ``"no belief state"`` is used instead. The prompt shape is
    ``"[belief] <belief> [system] <sys> [user] <usr>"``, using the last
    system and user utterances of the record.

    Args:
        data: Sequence of records, each providing ``'dialog'`` (with ``'sys'``
            and ``'usr'`` sequences of strings) and a ``'last_slot_values'``
            mapping.

    Returns:
        A list of prompt strings, in the same order as ``data``.
    """
    prompts = []
    for record in data:
        last_sys = record['dialog']['sys'][-1]
        last_usr = record['dialog']['usr'][-1]

        slot_phrases = (
            f"{slot} is {val}" for slot, val in record['last_slot_values'].items()
        )
        # An empty join result is falsy, so this falls back to the placeholder.
        belief = ", ".join(slot_phrases) or "no belief state"

        prompts.append(
            " ".join(("[belief]", belief, "[system]", last_sys, "[user]", last_usr))
        )
    return prompts

# Render every record of `data` as a sentence-style prompt string.
# NOTE(review): `data` is not defined in any visible cell — confirm it is
# loaded before this cell runs on a fresh kernel.
questions = make_questions(data)

In [287]:
def make_chosen(data):
    """Render the first ``best_example`` of every record as a prompt string.

    For each record, ``record['best_example'][0]`` is formatted as
    ``"[belief] <belief> [system] <sys> [user] <usr>"``. ``<belief>`` is the
    example's ``last_slot_values`` written as ``"<slot> is <value>"`` phrases
    joined by ``", "`` (or ``"no belief state"`` when there are none), and
    ``<sys>`` / ``<usr>`` are the example's last system and user utterances.

    Args:
        data: Sequence of records, each with a non-empty ``'best_example'``
            sequence whose first item provides ``'dialog'`` and
            ``'last_slot_values'``.

    Returns:
        A list of formatted strings, in the same order as ``data``.
    """
    def describe(example):
        sys_last = example['dialog']['sys'][-1]
        usr_last = example['dialog']['usr'][-1]
        slots = example['last_slot_values']
        belief = ", ".join([f"{slot} is {val}" for slot, val in slots.items()])
        if not belief:
            belief = "no belief state"
        return "[belief] " + belief + " [system] " + sys_last + " [user] " + usr_last

    return [describe(record['best_example'][0]) for record in data]

# Render the first `best_example` of every record in `data` as a string.
chosen = make_chosen(data)

In [284]:
import random

def make_rejected(data, seed=None):
    """Sample one "rejected" example per record and render it as a prompt string.

    Every record in ``data`` is identified by the key
    ``"<ID>_turn_<turn_id>"``. For each record, the candidates are all keys in
    ``data`` that are NOT keys of
    ``record['sampling_exp']['scores'][0]['occurence']``; one candidate is
    drawn uniformly at random and the record carrying that key is formatted as
    ``"[belief] <belief> [system] <sys> [user] <usr>"`` (same format as the
    question/chosen strings, with ``"no belief state"`` for an empty belief).

    Args:
        data: Sequence of records, each providing ``'ID'``, ``'turn_id'``,
            ``'dialog'`` (with ``'sys'`` / ``'usr'`` sequences of strings),
            a ``'last_slot_values'`` mapping and the nested
            ``'sampling_exp'`` -> ``'scores'`` -> ``[0]`` -> ``'occurence'``
            mapping.
        seed: Optional seed. With the default ``None`` the module-level
            ``random`` generator is used, exactly as before. Any other value
            makes the draw reproducible through a private
            ``random.Random(seed)`` that leaves the global generator untouched.

    Returns:
        A list with one formatted string per record, in the order of ``data``.

    Raises:
        IndexError: If some record has no candidates left (every key of
            ``data`` is in its ``occurence`` mapping); raised by ``choice``.
    """
    rng = random if seed is None else random.Random(seed)

    fullset = [f"{item['ID']}_turn_{item['turn_id']}" for item in data]

    # Built once instead of re-scanning `data` for every sampled key.
    # setdefault keeps the FIRST record for a duplicated key, which is what a
    # front-to-back linear search would find.
    first_index = {}
    for position, key in enumerate(fullset):
        first_index.setdefault(key, position)

    rejected = []
    for record in data:
        excluded = record['sampling_exp']['scores'][0]['occurence'].keys()
        # Kept as an ordered list so a seeded draw picks the same element as
        # before for a given generator state.
        negative_set = [key for key in fullset if key not in excluded]
        # NOTE(review): the record's own key is a candidate whenever it is not
        # listed in `occurence`, so a turn can be sampled as its own negative —
        # confirm whether it should be excluded.
        negative_one = rng.choice(negative_set)
        negative = data[first_index[negative_one]]

        dial_sys = negative['dialog']['sys'][-1]
        dial_usr = negative['dialog']['usr'][-1]
        items = [f"{key} is {value}" for key, value in negative['last_slot_values'].items()]
        belief = ", ".join(items)
        if belief == "":
            belief = "no belief state"
        rejected.append(
            "[belief] " + belief + " [system] " + dial_sys + " [user] " + dial_usr
        )
    return rejected

# Draw one randomly sampled negative per record of `data`.
# NOTE(review): no seed is set in the visible cells, so this list differs
# between runs — confirm whether that is intended.
rejected = make_rejected(data)

In [301]:
# Put the three parallel string lists side by side, one row per record,
# then hold out 20% of the rows for evaluation.
comb = dict(question=questions, chosen=chosen, rejected=rejected)
df = pd.DataFrame(comb)

# NOTE(review): rows are shuffled and split individually, so records sharing
# the same ID can land in both splits — confirm that is acceptable.
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(f"train size: {len(train_df)}, eval size: {len(eval_df)}")


train size: 1157, eval size: 290


In [303]:
# Write both splits to parquet, then load them back as `datasets.Dataset`s.
# index=False: the frames come out of a shuffled split and therefore carry a
# non-range integer index, which pandas would otherwise store as an extra
# `__index_level_0__` column that then appears in the loaded datasets.
train_df.to_parquet('train_data.parquet', engine='pyarrow', index=False)
eval_df.to_parquet('eval_data.parquet', engine='pyarrow', index=False)

train_dataset = Dataset.from_parquet('train_data.parquet')
eval_dataset = Dataset.from_parquet('eval_data.parquet')

# Bundle the two splits under the names 'train' and 'eval'.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'eval': eval_dataset
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [307]:
# Push the dataset to Hugging Face Hub
# dataset_dict.push_to_hub('anthj/dpo_mw')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/anthj/dpo_mw/commit/2e6e5b8cb61703a82afa75897545dbb492e9340b', commit_message='Upload dataset', commit_description='', oid='2e6e5b8cb61703a82afa75897545dbb492e9340b', pr_url=None, pr_revision=None, pr_num=None)