In [1]:
import json
import pandas as pd

def _extract_last_thought_object(text):
    """Return the brace-balanced substring that starts at the last '{"thought"'.

    Braces are counted character by character (braces inside string values
    are counted too, exactly like the scan this helper replaces).

    Raises:
        ValueError: if ``text`` contains no '{"thought"' marker, or if the
            braces opened from that marker are never closed (for example a
            reply that was cut off mid-sentence).
    """
    start = text.rindex('{"thought"')  # str.rindex raises ValueError when absent
    depth = 0
    for pos in range(start, len(text)):
        char = text[pos]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return text[start:pos + 1]
    # Returning nothing here would let the caller reuse a stale value from a
    # previous row, so an unbalanced object is reported explicitly.
    raise ValueError("unbalanced braces after the last '{\"thought\"' marker")


def _parse_thought_and_answer(clean_response):
    """Return ``(thought, answer)`` extracted from a whitespace-cleaned response.

    First the whole text (minus any ```json fence markers) is parsed as JSON.
    If that fails, or the parsed value lacks the expected keys, the last
    embedded '{"thought" ...}' object is extracted and parsed instead.

    Raises:
        ValueError, KeyError, TypeError: when neither strategy yields both keys.
    """
    if clean_response.startswith("```json"):
        # Remove ```json and ``` markers
        candidate = clean_response.replace("```json", "").replace("```", "").strip()
    else:
        candidate = clean_response

    try:
        parsed = json.loads(candidate)
        return parsed["thought"], parsed["answer"]
    except (ValueError, KeyError, TypeError):
        # json.JSONDecodeError is a ValueError; KeyError/TypeError cover JSON
        # that parsed but is not an object carrying both keys.
        pass

    parsed = json.loads(_extract_last_thought_object(clean_response))
    return parsed["thought"], parsed["answer"]


def process_CoT_raw_response_df(df):
    """Split raw chain-of-thought responses into thought and answer columns.

    Args:
        df: DataFrame with the columns ``user_input``, ``raw_response`` and
            ``retrieved_contexts``.

    Returns:
        A new DataFrame with the columns ``user_input``, ``thought``,
        ``response`` and ``retrieved_contexts``, one row per input row. Rows
        whose ``raw_response`` cannot be parsed keep their place and carry
        "ERROR: ..." placeholders; the failing index and raw text are printed.
    """
    processed_data = []

    for idx, row in df.iterrows():
        user_input = row['user_input']
        raw_response = row['raw_response']
        retrieved_contexts = row['retrieved_contexts']

        try:
            # Clean the raw response string (inside the try so that a
            # non-string cell becomes a reported failure, not a crash).
            clean_response = raw_response.replace("\n", "").replace("\r", "").replace("\t", "").strip()
            thought, response = _parse_thought_and_answer(clean_response)
        except (ValueError, KeyError, TypeError, AttributeError):
            print(f"Failed to parse JSON at index {idx}")
            print(f"Raw response: {raw_response}")
            # Keep the row, but mark both fields as unparsed.
            thought = "ERROR: Failed to parse thought"
            response = "ERROR: Failed to parse response"

        processed_data.append([user_input, thought, response, retrieved_contexts])

    return pd.DataFrame(processed_data, columns=["user_input", "thought", "response", "retrieved_contexts"])

In [None]:
# Run configuration: the dataset, model and k value that select the input file.
dataset_name = 'contractnli'
model_name = 'llama3'
k = 1

# Read the raw responses, split each one into thought/answer, then save the result.
raw_cot_path = f'query_cot/{dataset_name}/query_answer_CoT_raw_{model_name}_k{k}.json'
query_answer_CoT_raw_from_file = pd.read_json(raw_cot_path)
query_answer_CoT = process_CoT_raw_response_df(query_answer_CoT_raw_from_file)

# NOTE(review): the output is written to the current working directory and its
# name does not include dataset_name - confirm that is intended.
processed_cot_path = f'query_answer_CoT_{model_name}_k{k}.json'
query_answer_CoT.to_json(processed_cot_path, orient="records", indent=4)

Failed to parse JSON at index 35
Raw response: {
  "thought": "First, we need to identify the key terms in the question and context. The question asks about the Non-Disclosure Agreement between DoiT and ICN, and whether it restricts the use of Confidential Information for specific purposes. We should focus on the relevant parts of the context to answer this question.",
  "answer": "Based on the provided context, the Non-Disclosure Agreement between DoiT and ICN does restrict the use of Confidential Information to the purposes stated in the Agreement. The document allows exceptions for Confidential Information that is independently developed by Vendor, becomes publicly available without breach, was known to be free of restriction at the time of disclosure to Vendor, or was lawfully received free of restriction from another source having the right to so furnish such Confidential Information."}
}
Failed to parse JSON at index 55
Raw response: {"thought": "First, let's identify the key ter

# Updated: more robust parsing of malformed JSON responses

In [66]:
# Re-import necessary libraries after code execution environment reset
import json
import pandas as pd
import re

# Fixer: handles greedy quote cleaning
# Group 1: the '"answer": "' prefix. Group 2: everything up to the LAST
# closing '"}' (greedy; backslash escapes are consumed as pairs so an escaped
# quote can never be taken for the closing one). Group 3: the closing quote,
# optional whitespace and the brace.
_ANSWER_VALUE_RE = re.compile(r'("answer":\s*")((?:\\.|[^\\])*)("\s*})', re.DOTALL)


def fix_stray_inner_quotes_greedy(json_str):
    """Remove unescaped double quotes from inside the "answer" string value.

    The value is taken greedily: from the quote that opens the "answer" value
    up to the last '"' that is followed (optionally after whitespace) by '}'.
    Every double quote inside that span that is not preceded by a backslash is
    dropped; escaped quotes are kept.

    Only meant for text that already failed to parse: on valid JSON in which
    "answer" is not the last key, the greedy span would swallow the keys that
    follow it.

    Args:
        json_str: JSON-like text, possibly containing stray inner quotes.

    Returns:
        The text with stray quotes removed from the "answer" value, or the
        input unchanged when no '"answer": "..."}' span is found.
    """

    def replacer(match):
        key, value, suffix = match.group(1), match.group(2), match.group(3)
        # Drop only quotes that are not escaped with a backslash.
        cleaned = re.sub(r'(?<!\\)"', '', value)
        return f'{key}{cleaned}{suffix}'

    return _ANSWER_VALUE_RE.sub(replacer, json_str)

# Robust JSON parser
# Trailing run of backticks / ']' / '}' (with optional whitespace), removed
# before a single closing brace is re-appended.
_TRAILING_CLOSERS_RE = re.compile(r'(?:\s*[`\]\}])+\s*$')


def _loads_lenient(text):
    """Parse ``text`` as JSON, tolerating junk around the first JSON object.

    Strict ``json.loads`` is tried first. If it fails, decoding restarts at
    the first '{' and stops at the end of that object, so a leading code
    fence or prose and any trailing extra braces, backticks or prose are
    ignored.

    Raises:
        ValueError: if neither attempt succeeds.
    """
    try:
        return json.loads(text)
    except ValueError:
        start = text.find('{')
        if start == -1:
            raise
        parsed, _end = json.JSONDecoder().raw_decode(text[start:])
        return parsed


def robust_json_parse(raw_response):
    """Parse a possibly malformed JSON response.

    A dict is returned as-is and valid JSON is decoded directly. Otherwise a
    series of progressively more invasive repairs is tried and the first
    candidate that decodes wins:

    * normalisation: raw text, then curly quotes replaced by straight ones,
      then backslash-escaped quotes unescaped as well;
    * for each of those: the text itself, the text with its trailing
      backticks/brackets/braces replaced by a single '}', and both of these
      passed through ``fix_stray_inner_quotes_greedy``.

    The stray-quote fixer is applied last and to text that still carries its
    closing brace, because its pattern anchors on '"}' and because it can
    damage text that would otherwise parse.

    Args:
        raw_response: a dict, or a string holding JSON-like text.

    Returns:
        The decoded JSON value (a dict when the text holds a JSON object).

    Raises:
        ValueError: if the input is not a dict/string that can be decoded, or
            if no repair produces parseable JSON.
    """
    if isinstance(raw_response, dict):
        return raw_response

    try:
        return json.loads(raw_response)
    except (TypeError, ValueError) as first_error:
        if not isinstance(raw_response, str):
            # Nothing to repair on None / NaN / other non-text values.
            raise ValueError(
                f"Failed to parse after fix: {first_error}\n--- Raw Input ---\n{raw_response}"
            ) from first_error

    text = raw_response.strip()

    # Drop one pair of wrapping quotes around the whole payload.
    if text.startswith(("'", '"')) and text.endswith(("'", '"')):
        text = text[1:-1]

    straight = text.replace('“', '"').replace('”', '"')
    straight = straight.replace('‘', "'").replace('’', "'")
    unescaped = straight.replace("\\'", "'").replace('\\"', '"')

    last_error = None
    tried = set()
    for variant in (text, straight, unescaped):
        # Same text with exactly one closing brace at the end.
        closed = _TRAILING_CLOSERS_RE.sub('', variant) + '}'
        candidates = (
            variant,
            closed,
            fix_stray_inner_quotes_greedy(variant),
            fix_stray_inner_quotes_greedy(closed),
        )
        for candidate in candidates:
            if candidate in tried:
                continue
            tried.add(candidate)
            try:
                return _loads_lenient(candidate)
            except ValueError as exc:
                last_error = exc

    raise ValueError(f"Failed to parse after fix: {last_error}\n--- Raw Input ---\n{raw_response}")

# Main DataFrame processor
def process_CoT_raw_response_df(df):
    """Split raw chain-of-thought responses into thought and answer columns.

    Each ``raw_response`` is stripped of newline, carriage-return and tab
    characters (when it is a string) and handed to ``robust_json_parse``.

    Args:
        df: DataFrame with the columns ``user_input``, ``raw_response`` and
            ``retrieved_contexts``.

    Returns:
        A new DataFrame with the columns ``user_input``, ``thought``,
        ``response`` and ``retrieved_contexts``, one row per input row. Rows
        that cannot be parsed keep their place and carry "ERROR: ..."
        placeholders; the failing index and raw text are printed so the
        failures are not silent.
    """
    processed_data = []

    for idx, row in df.iterrows():
        user_input = row['user_input']
        raw_response = row['raw_response']
        retrieved_contexts = row['retrieved_contexts']

        try:
            if isinstance(raw_response, str):
                payload = raw_response.replace("\n", "").replace("\r", "").replace("\t", "").strip()
            else:
                # Non-text cells (e.g. an already-decoded dict) go to the
                # parser untouched instead of crashing on .replace().
                payload = raw_response

            json_data = robust_json_parse(payload)
            thought = json_data["thought"]
            response = json_data["answer"]
        except Exception:
            # Deliberately best-effort: one bad row must not abort the run,
            # but it is reported rather than swallowed.
            print(f"Failed to parse JSON at index {idx}")
            print(f"Raw response: {raw_response}")
            thought = "ERROR: Failed to parse thought"
            response = "ERROR: Failed to parse response"

        processed_data.append([user_input, thought, response, retrieved_contexts])

    return pd.DataFrame(processed_data, columns=["user_input", "thought", "response", "retrieved_contexts"])

# Re-load the uploaded file and process
# Dataset to post-process and the k values whose result files are converted.
dataset_name = 'privacy_qa'
k_numbers = [1, 3, 5, 10]

for k in k_numbers:
    # Input and output locations for this k.
    raw_cot_path = f'query_cot/{dataset_name}/query_answer_CoT_raw_llama3_k{k}.json'
    cleaned_cot_path = f'query_cot_cleaned/{dataset_name}/query_answer_CoT_llama3_k{k}.json'

    # Load the raw responses, split them into thought/answer, and save.
    query_answer_CoT_raw_from_file = pd.read_json(raw_cot_path)
    query_answer_CoT_cleaned = process_CoT_raw_response_df(query_answer_CoT_raw_from_file)
    query_answer_CoT_cleaned.to_json(cleaned_cot_path, orient="records", indent=4)

    separator = "=" * 100
    print(f"process done for {dataset_name} of llama3 with k={k}", "\n", separator)

process done for privacy_qa of llama3 with k=1 
process done for privacy_qa of llama3 with k=3 
process done for privacy_qa of llama3 with k=5 
process done for privacy_qa of llama3 with k=10 
