In [10]:
import pandas as pd
import json

In [2]:
def process_CoT_raw_response_df(df):
    processed_data = []
    
    for idx, row in df.iterrows():
        user_input = row['user_input']
        raw_response = row['raw_response']
        retrieved_contexts = row['retrieved_contexts']
        
        # Clean the raw response string
        clean_response = raw_response.replace("\n", "").replace("\r", "").replace("\t", "").strip()
        
        try:
            # Try to parse as JSON directly first
            if clean_response.startswith("```json"):
                # Remove ```json and ``` markers
                json_str = clean_response.replace("```json", "").replace("```", "").strip()
                json_data = json.loads(json_str)
            else:
                json_data = json.loads(clean_response)
            thought = json_data["thought"]
            response = json_data["answer"]
            
        except json.JSONDecodeError:
            # If direct parsing fails, try to extract JSON structure
            try:
                # Find the last occurrence of {"thought"
                thought_start = clean_response.rindex('{"thought"')
                # Find the matching closing brace
                brace_count = 0
                for i in range(thought_start, len(clean_response)):
                    if clean_response[i] == '{':
                        brace_count += 1
                    elif clean_response[i] == '}':
                        brace_count -= 1
                        if brace_count == 0:
                            json_str = clean_response[thought_start:i+1]
                            break
                
                json_data = json.loads(json_str)
                thought = json_data["thought"]
                response = json_data["answer"]
                
            except (ValueError, json.JSONDecodeError, KeyError) as e:
                print(f"Failed to parse JSON at index {idx}")
                print(f"Raw response: {raw_response}")
                # Skip this row or add placeholder values
                thought = "ERROR: Failed to parse thought"
                response = "ERROR: Failed to parse response"
        
        processed_data.append([user_input, thought, response, retrieved_contexts])
    
    return pd.DataFrame(processed_data, columns=["user_input", "thought", "response", "retrieved_contexts"])

In [3]:
def clean_CoT_json_output(k_values, model_name, folder_path):
    for k in k_values:
        try:
            input_file = f'{folder_path}/query_answer_CoT_raw_{model_name}_k{k}.json'
            output_file = f'{folder_path}/query_answer_CoT_{model_name}_k{k}.json'
            
            query_answer_CoT_raw_from_file = pd.read_json(input_file)
            query_answer_CoT = process_CoT_raw_response_df(query_answer_CoT_raw_from_file)
            query_answer_CoT.to_json(output_file, orient="records", indent=4)
        except FileNotFoundError:
            print(f"File {input_file} not found")
        except ValueError as e:
            print(f"Error processing file for {model_name} k={k}: {str(e)}")
        except Exception as e:
            print(f"Unexpected error processing file for {model_name} k={k}: {str(e)}")

In [11]:
clean_CoT_json_output([1, 3, 5, 10], "llama3", "./data/json_output/ZZZ_llama_cot/contractnli")

Failed to parse JSON at index 35
Raw response: {
  "thought": "First, we need to identify the key terms in the question and context. The question asks about the Non-Disclosure Agreement between DoiT and ICN, and whether it restricts the use of Confidential Information for specific purposes. We should focus on the relevant parts of the context to answer this question.",
  "answer": "Based on the provided context, the Non-Disclosure Agreement between DoiT and ICN does restrict the use of Confidential Information to the purposes stated in the Agreement. The document allows exceptions for Confidential Information that is independently developed by Vendor, becomes publicly available without breach, was known to be free of restriction at the time of disclosure to Vendor, or was lawfully received free of restriction from another source having the right to so furnish such Confidential Information."}
}
Failed to parse JSON at index 55
Raw response: {"thought": "First, let's identify the key ter