## Setting Environment

In [None]:
# ! pip install pandas
# ! pip install openai
# ! pip install langchain

In [None]:
import os
import json
import openai
import pandas as pd
import re
import time

In [None]:
# Read the OpenAI API key from a local text file (surrounding whitespace is
# stripped) and store it on the `openai` module.
with open('openai_api_key.txt', 'r') as f:
    openai.api_key = f.read().strip()
# Also export the key as an environment variable.
# NOTE(review): presumably so the LangChain client created below can pick it
# up from the environment — confirm.
os.environ["OPENAI_API_KEY"] = openai.api_key

# To check that the key was exported, run: os.environ.get("OPENAI_API_KEY")

## Preparing required files

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage

# Shared chat-model client; process_intermediate() calls it once per item.
# NOTE(review): temperature=0 is presumably chosen to keep the labels as
# repeatable as possible — confirm.
chat = ChatOpenAI(model="gpt-4-0613", temperature = 0)

In [None]:
# Load the three prompt texts. Each one is later passed to
# process_intermediate() as the system prompt of one labelling pass
# (kt_prompt -> "ktype", ct_prompt -> "ctype", qt_prompt -> "qtype").
with open('ktype_prompt.txt', 'r') as fk:
    kt_prompt = fk.read()
with open('ctype_prompt.txt', 'r') as fc:
    ct_prompt = fc.read()
with open('qtype_prompt.txt', 'r') as fq:
    qt_prompt = fq.read()

In [None]:
# Load the intermediate JSON file that drives the labelling passes.
# `interm_FileName` selects the input file and is reused in the names of the
# result files written at the end of the notebook.
# Each entry is read below through its 'row_dict', 'query_value' and
# 'reasoning' keys.
interm_FileName = "CAUS" 
with open(f'_output_intermediate/intermediate_{interm_FileName}.json', 'r') as file:
    intermediate_json = json.load(file)
# Show how many entries were loaded.
print(len(intermediate_json))

## Loop for KCQ-typing from the intermediate file

In [None]:
def process_intermediate(intermediate_data, prompt, key_name, delay=0.8):
    """Run one labelling pass over the intermediate items.

    For every item, ``prompt`` is sent as the system message and the item's
    ``query_value`` as the human message to the module-level ``chat`` model.
    Every parenthesised group found in the reply is parsed into a tuple of
    stripped strings.

    Args:
        intermediate_data: Iterable of dicts that each provide the keys
            ``'row_dict'`` and ``'query_value'``.
        prompt: Text used as the system prompt for every request.
        key_name: Key under which the parsed tuples are stored in each
            result dict (e.g. ``"ktype"``).
        delay: Seconds to pause after each request. Defaults to 0.8, the
            value that was previously hard-coded.

    Returns:
        A list with one dict per input item, holding the item's
        ``row_dict``, its ``query_value`` and, under ``key_name``, the list
        of tuples parsed from the model reply.
    """
    # Compiled once instead of on every iteration: captures the text inside
    # each "(...)" group of the reply.
    group_pattern = re.compile(r"\(([^)]+)\)")
    intermediate_results = []

    for item in intermediate_data:
        row_dict = item['row_dict']
        query_value = item['query_value']

        messages = [
            SystemMessage(content=f"'{prompt}'"),
            HumanMessage(content=f"'{query_value}'"),
        ]
        response_str = chat(messages).content

        # Split on the FIRST comma only: the consumer of these tuples unpacks
        # exactly two values (number, label), so a label that itself contains
        # a comma must stay in one piece instead of producing extra fields.
        response_tuples = [
            tuple(part.strip().strip("'") for part in match.split(',', 1))
            for match in group_pattern.findall(response_str)
        ]

        intermediate_results.append({
            "row_dict": row_dict,
            "query_value": query_value,
            key_name: response_tuples
        })

        # Pause between requests to space out the API calls.
        time.sleep(delay)

    return intermediate_results

# Run the three labelling passes (ktype, ctype, qtype) over the same
# intermediate data, one after the other. Each pass sends one chat request
# per item and pauses after every request, so this cell's runtime grows with
# 3 x len(intermediate_json).
ktype_intermediate = process_intermediate(intermediate_json, kt_prompt, "ktype")
ctype_intermediate = process_intermediate(intermediate_json, ct_prompt, "ctype")
qtype_intermediate = process_intermediate(intermediate_json, qt_prompt, "qtype")



In [None]:
# Final per-scenario records, built by merging the three labelling passes
# with the entries of the intermediate file.
output = []

# The four sequences are walked in lockstep; zip() stops at the shortest one,
# so they are expected to have the same length and order.
for k_res, c_res, q_res, src_item in zip(
    ktype_intermediate, ctype_intermediate, qtype_intermediate, intermediate_json
):
    # All four entries must describe the same source row.
    assert (
        k_res['row_dict'] == c_res['row_dict'] == q_res['row_dict'] == src_item['row_dict']
    ), "row_dict values do not match!"

    row_dict = q_res['row_dict']
    reasoning = src_item['reasoning']
    questions_list = []

    # Pair every query with its (number, label) tuple from each pass.
    # Question numbering starts at 1 and is zero-padded to two digits.
    per_query = zip(q_res['query_value'], k_res['ktype'], c_res['ctype'], q_res['qtype'])
    for idx, (
        query,
        (ktype_num, ktype_text),
        (ctype_num, ctype_text),
        (qtype_num, qtype_text),
    ) in enumerate(per_query, start=1):
        questions_list.append({
            f"qid{idx:02}": f"{row_dict['scn_id']}Q{idx:02}",
            f"query{idx:02}": query,
            f"ktype_num{idx:02}": ktype_num,
            f"ktype{idx:02}": ktype_text,
            f"ctype_num{idx:02}": ctype_num,
            f"ctype{idx:02}": ctype_text,
            f"qtype_num{idx:02}": qtype_num,
            f"qtype{idx:02}": qtype_text
        })

    # One record per scenario: identifying fields, the reasoning text and the
    # list of labelled questions.
    output.append({
        "scn_id": row_dict["scn_id"],
        "scn_cls": row_dict["scn_cls"],
        "scn_sentence": row_dict["scn_sentence"],
        "reasoning": reasoning,
        "question": questions_list
    })

## Saving files

### Saving output to JSON

In [None]:
# Save the JSON output to a file
import datetime

# Timestamp shared by the JSON and CSV result file names.
now = datetime.datetime.now()
nowdate = now.strftime("%y%m%d_%H%M%S")

# Create the result directory up front: without it, open() raises
# FileNotFoundError and the results of all the API calls are not saved.
os.makedirs('_output_result', exist_ok=True)

# NOTE(review): the "_35turbo" suffix does not match the model configured for
# `chat` ("gpt-4-0613"); the name is left unchanged so existing consumers of
# these files keep working — confirm whether it should be renamed.
with open(f'_output_result/result_{interm_FileName}_{nowdate}_35turbo.json', 'w') as json_file:
    json.dump(output, json_file)

### Saving CSV with qid and qtypes of each question

In [None]:
# Flatten every record into a single-level dict: keep all top-level fields
# except "question", then lift the numbered keys of each question dict
# (qid01, query01, ...) up to the top level.
transformed_output = []

for record in output:
    question_dicts = record['question']
    flat_row = {field: value for field, value in record.items() if field != 'question'}
    for question_fields in question_dicts:
        flat_row.update(question_fields)
    transformed_output.append(flat_row)

# One DataFrame row per scenario; rows with fewer questions leave the
# remaining question columns empty.
df_output = pd.DataFrame(transformed_output)

# Write the table without the DataFrame index column.
df_output.to_csv(f'_output_result/result_{interm_FileName}_for35turbo_{nowdate}_35turbo.csv', index=False)