In [None]:
import json

def process_questions(input_file, output_file):
    """Flatten multi-turn questions into single-prompt JSONL records.

    Every non-blank line of ``input_file`` is parsed as a JSON object that
    must provide the keys ``question_id``, ``category`` and ``turns`` (a
    sequence of prompts). For each question up to two records are written to
    ``output_file``, one JSON object per line:

    * the first turn, copied unchanged into the ``turn`` field;
    * if a second turn exists, a prompt that quotes the first turn as an
      already-answered question and then asks the second turn.

    Turns after the second one are ignored. Questions with a single turn
    yield only the first record, and questions with no turns yield nothing.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    processed_lines = []

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records.
            if not line.strip():
                continue

            data = json.loads(line)
            question_id = data['question_id']
            category = data['category']
            turns = data['turns']

            # Use the first element as the turn unchanged.
            if len(turns) >= 1:
                processed_lines.append({
                    'question_id': question_id,
                    'category': category,
                    'turn': turns[0]
                })

            # Wrap the second element with the first one as context and add
            # it as a new record; single-turn questions have no follow-up.
            if len(turns) >= 2:
                additional_turn = (
                    f'あなたは先ほど以下の質問に対してすでに回答済みです。これに関して追加の以下の質問に答えてください\n'
                    f'あなたが回答済みの質問：\n{turns[0]}\n\n'
                    f'追加の質問：\n{turns[1]}'
                )
                processed_lines.append({
                    'question_id': question_id,
                    'category': category,
                    'turn': additional_turn
                })

    # The output is written only after the input has been read completely.
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in processed_lines:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Usage example
process_questions('./mt_bench/question_full.jsonl', './mt_bench/question_processed.jsonl')

In [None]:
import json

def split_turns(input_file, output_file):
    """Split multi-turn answer records into one JSONL record per turn.

    Every non-blank line of ``input_file`` is parsed as a JSON object that
    must provide ``question_id``, ``answer_id``, ``model_id`` and ``choices``.
    Each choice must provide ``index`` and ``turns``. One output record is
    written per turn of every choice, carrying the identifiers, the choice's
    ``index`` as ``choice_index``, the zero-based position of the turn as
    ``turn_index`` and the turn itself as ``turn``.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record or choice lacks one of the required keys.
    """
    split_lines = []

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records.
            if not line.strip():
                continue

            data = json.loads(line)
            question_id = data['question_id']
            answer_id = data['answer_id']
            model_id = data['model_id']

            for choice in data['choices']:
                choice_index = choice['index']
                for turn_index, turn in enumerate(choice['turns']):
                    split_lines.append({
                        'question_id': question_id,
                        'answer_id': answer_id,
                        'model_id': model_id,
                        'choice_index': choice_index,
                        'turn_index': turn_index,
                        'turn': turn
                    })

    # The output is written only after the input has been read completely.
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in split_lines:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Usage example
split_turns('./mt_bench/gpt-4.jsonl', './mt_bench/gpt-4_split.jsonl')

In [None]:
import json
import csv

def jsonl_to_csv(input_file, output_file):
    """Convert a JSONL file of flat objects into a CSV file with a header row.

    Every non-blank line of ``input_file`` is parsed as a JSON object. The CSV
    columns are the union of the keys of all records, in the order in which
    each key is first seen, so records with differing key sets are supported;
    a key missing from a record is written as an empty cell. When the input
    holds no records, an empty output file is created.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 CSV file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) are not records.
        data = [json.loads(line) for line in f if line.strip()]

    # Collect the CSV headers from every record, not just the first one;
    # dict.fromkeys de-duplicates while preserving first-seen order.
    headers = list(dict.fromkeys(key for record in data for key in record))

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        if not data:
            # Nothing to describe: leave the freshly created file empty.
            return
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data)

# Convert gpt-4_split.jsonl to CSV
jsonl_to_csv('./mt_bench/gpt-4_split.jsonl', './mt_bench/gpt-4_split.csv')

# Convert question_processed.jsonl to CSV
jsonl_to_csv('./mt_bench/question_processed.jsonl', './mt_bench/question_processed.csv')

In [None]:
!pip install huggingface_hub ipywidgets

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook; the push_to_hub call
# later in this notebook presumably relies on this login — TODO confirm.
notebook_login()

In [3]:
from datasets import load_dataset, DatasetDict

# Load the test.csv file
dataset = load_dataset('csv', data_files='./eval_tasks/test.csv')

# Set up a 'test' split in a DatasetDict, using the rows that the loader
# returned under the 'train' key
dataset_dict = DatasetDict({'test': dataset['train']})

# Print the dataset to check it
print(dataset_dict)

DatasetDict({
    test: Dataset({
        features: ['input', 'output', 'eval_aspect'],
        num_rows: 260
    })
})


In [4]:
# Push the dataset to the Hugging Face Hub.
# dataset_dict is pushed (not the raw `dataset` returned by load_dataset) so
# that the rows are uploaded under the 'test' split that was set up for them.
dataset_dict.push_to_hub("HODACHI/MERGE_tasks-300")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/HODACHI/MERGE_tasks-300/commit/1d8982943a75a1e4ab07b5eb25bb3061a7793142', commit_message='Upload dataset', commit_description='', oid='1d8982943a75a1e4ab07b5eb25bb3061a7793142', pr_url=None, pr_revision=None, pr_num=None)