# Postprocess the response data from the OpenAI API

input: `respose_file_path`
output: `annotated_file_path`

Description
- Build the annotated dataset from the responses returned by the OpenAI API.

In [1]:
import pandas as pd
from datasets import Features, ClassLabel, Dataset, DatasetDict
import json
import re

In [2]:
# Location of the JSONL file with the OpenAI API responses (the input of this notebook).
respose_file_path = "../local_data/api_responses_for_annotated_dataset.jsonl"

In [3]:
# Read the JSONL response file and collect, per usable record, the message
# text and the request id it belongs to.
in_fname = respose_file_path
datas = []
request_ids = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding.
# NOTE(review): assumes the file was written as UTF-8 (or ASCII-escaped JSON) - confirm.
with open(in_fname, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank line carries no record; skip it instead of failing in json.loads.
            continue
        data = json.loads(line)
        # Only records whose second element is a dict are used; others are skipped.
        # NOTE(review): presumably non-dict entries are failed requests - confirm.
        if isinstance(data[1], dict):
            datas.append(data[1]['choices'][0]['message']['content'])
            request_ids.append(data[2]['request_id'])

In [6]:
# One row per API response, with the response text broken into its lines.
df = pd.DataFrame({'request_id': request_ids, 'response': datas})
df['response'] = df['response'].str.split('\n')
df = df.sort_values(by='request_id')

# Explode the 'response' column so that every line becomes its own row.
exploded_df = df.explode('response').reset_index(drop=True)

# Keep only the lines that contain "->" (one row was dropped on the original data).
# - regex=False: "->" is a literal marker, not a pattern.
# - na=False: a missing response counts as "no marker" instead of producing a
#   NaN mask that cannot be used for boolean indexing.
has_arrow = exploded_df['response'].str.contains('->', regex=False, na=False)
# .copy() gives an independent frame, so adding columns to it later does not
# operate on a slice of the unfiltered frame.
exploded_df = exploded_df[has_arrow].copy()

In [7]:
# Helper that splits one response line into its parts.
def split_response(response):
    """Split a response line into title, category and keywords.

    The line is cut at the last "->": the text before it is the title, the
    text after it is split on ":". The first ":"-separated piece is the
    category and every remaining piece is a keyword. All parts are stripped
    of surrounding whitespace.

    Cutting at the last "->" (rather than requiring exactly one) keeps lines
    whose title itself contains "->" from raising.
    NOTE(review): assumes any extra "->" belongs to the title, not to the
    category/keywords - confirm against the prompt format.

    Args:
        response: A single line of text containing at least one "->".

    Returns:
        A tuple ``(title, category, keywords)`` where ``keywords`` is a
        (possibly empty) list of strings.

    Raises:
        ValueError: If ``response`` contains no "->".
    """
    # Separate the title from the remainder at the last arrow.
    title, arrow, rest = response.rpartition("->")
    if not arrow:
        raise ValueError(f'response has no "->" separator: {response!r}')
    title = title.strip()

    # Separate the category from the keywords.
    category, *keywords = rest.split(":")
    category = category.strip()
    keywords = [keyword.strip() for keyword in keywords]

    return title, category, keywords


In [8]:
# Add the parsed parts of every response line as new columns.
# The tuples returned by split_response are gathered into one DataFrame that
# shares exploded_df's index. This avoids building a pd.Series per row and
# also works when exploded_df has no rows.
parsed_parts = pd.DataFrame(
    [split_response(line) for line in exploded_df['response']],
    index=exploded_df.index,
    columns=['title', 'category', 'keywords'],
)
exploded_df[['title', 'category', 'keywords']] = parsed_parts


In [9]:
# The raw 'response' text is no longer needed once it has been parsed.
exploded_df = exploded_df.drop('response', axis=1)

# Break 'request_id' on ':' into 'tester', 'trial_idx' and 'sub_idx'.
id_columns = ['tester', 'trial_idx', 'sub_idx']
id_parts = exploded_df['request_id'].str.split(':', expand=True)
exploded_df[id_columns] = id_parts

# Make 'trial_idx' numeric: parse as float first, then cast to int.
trial_as_float = exploded_df['trial_idx'].astype(float)
exploded_df['trial_idx'] = trial_as_float.astype(int)

# Make 'sub_idx' numeric.
sub_as_int = exploded_df['sub_idx'].astype(int)
exploded_df['sub_idx'] = sub_as_int

# 'request_id' is redundant now that its parts have their own columns.
exploded_df = exploded_df.drop('request_id', axis=1)


In [10]:
# Destination of the annotated dataset, kept in a named variable so the output
# location is defined in one visible place.
annotated_file_path = r'../local_data/annotated_dataset.csv'

# Write the dataset as CSV without the index column.
# NOTE(review): if 'keywords' holds Python lists they are written as their
# string form and need parsing when the CSV is read back - confirm with consumers.
exploded_df.to_csv(annotated_file_path, index=False)