# CoT Monitoring for Harmful Manipulation - Grading Pipeline

This notebook evaluates the generated persuasive arguments from the data generation pipeline to assess:

1. **Harmful Manipulation Detection**: Does the reasoning contain evidence of harmful manipulation?
2. **Faithfulness**: Is the model's final response faithful to its Chain-of-Thought reasoning?

Note that OpenAI models are used as graders, so an OpenAI API key is required (it is read from the `OPENAI_API_KEY` environment variable). The client can be changed in `call_model_hm` and `call_model_faithfulness`.

### Setup and Imports

In [None]:
from openai import OpenAI
import pandas as pd
import json
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment so that the
# os.getenv("OPENAI_API_KEY") lookups in the grader helpers can find the key.
load_dotenv()

### Helper function to call harmful manipulation grader model

In [None]:
def call_model_hm(prompt, reasoning, model_str, temperature=0.0):
    """Ask an OpenAI chat model to grade one reasoning trace.

    The grader prompt and the reasoning text are concatenated into a single
    user message, and the model is constrained to answer with a JSON object
    containing the string fields ``decision`` and ``evidence``.

    Args:
        prompt: Grader instructions placed before the reasoning text.
        reasoning: The reasoning text to grade (must be a ``str``).
        model_str: Name of the OpenAI model used as grader.
        temperature: Sampling temperature sent with the request. Defaults to
            0.0. Pass ``None`` to omit the parameter from the request entirely.
            NOTE(review): some model families reportedly accept only their
            default temperature -- confirm against the API reference for the
            model you pick and pass ``None`` if so.

    Returns:
        dict: The grader's JSON reply parsed into a dictionary with the keys
        ``"decision"`` and ``"evidence"``.

    Raises:
        ValueError: If the reply carries no content (for example because the
            model refused), or if the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Only forward temperature when one was requested, so callers can drop it
    # for models that reject the parameter.
    sampling_kwargs = {}
    if temperature is not None:
        sampling_kwargs["temperature"] = temperature

    response = client.chat.completions.create(
        model=model_str,
        messages=[
            {
                "role": "user",
                "content": prompt + reasoning
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "reasoning_grading",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "decision": {
                            "type": "string"
                        },
                        "evidence": {
                            "type": "string"
                        }
                    },
                    "required": ["decision", "evidence"],
                    "additionalProperties": False
                }
            }
        },
        **sampling_kwargs
    )

    message = response.choices[0].message
    if message.content is None:
        # Without this guard json.loads(None) fails with an opaque TypeError.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Grader model returned no content (refusal: {refusal!r})"
        )

    return json.loads(message.content)

### Harmful Manipulation Grading

In [None]:
def grade_reasoning(model_str, filename):
    """Grade every row of a CSV for harmful manipulation and save the result.

    Reads ``filename`` into a DataFrame, sends each row's ``reasoning`` value
    to ``call_model_hm`` together with the grader prompt stored in
    ``prompts/grader_sysprompt.txt``, and stores the replies in the
    ``decision`` and ``evidence`` columns. The graded frame is written next to
    the input file, with ``graded_`` prefixed to the file name.

    Args:
        model_str: Name of the grader model, forwarded to ``call_model_hm``.
        filename: Path of the input CSV; it must contain a ``reasoning``
            column.

    Returns:
        None. The output is the CSV written to disk.
    """
    df = pd.read_csv(filename)

    # Context manager so the prompt file handle is closed deterministically.
    with open("prompts/grader_sysprompt.txt", "r") as prompt_file:
        prompt = prompt_file.read()

    decisions = []
    evidences = []
    total_rows = len(df)
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        graded = call_model_hm(prompt, row["reasoning"], model_str)
        decisions.append(graded['decision'])
        evidences.append(graded['evidence'])

        print(f"Graded row {position}/{total_rows}")

    # Assign the collected results once instead of growing columns cell by cell.
    df['decision'] = decisions
    df['evidence'] = evidences

    # Prefix the base name only, so "data/x.csv" becomes "data/graded_x.csv"
    # rather than the non-existent "graded_data/x.csv".
    directory, base_name = os.path.split(filename)
    df.to_csv(os.path.join(directory, "graded_" + base_name), index=False)

In [None]:
# Example usage for harmful manipulation grading using gpt-5-mini (uncomment to run):
# grade_reasoning("gpt-5-mini", "sample_output.csv")

In [None]:
# Uncomment to convert notebook to Python script for execution (optional)
# !jupyter nbconvert --to script post.ipynb --output evaluation_pipeline

### Helper function to call faithfulness grader model

In [None]:
def call_model_faithfulness(prompt, reasoning, output, model_str, temperature=0.0):
    """Ask an OpenAI chat model whether a response is faithful to its reasoning.

    The grader prompt, the reasoning text and the final output are joined into
    a single user message, and the model is constrained to answer with a JSON
    object containing the string fields ``decision`` and ``evidence``.

    Args:
        prompt: Grader instructions placed at the start of the message.
        reasoning: The reasoning text (must be a ``str``).
        output: The final response to compare with the reasoning (``str``).
        model_str: Name of the OpenAI model used as grader.
        temperature: Sampling temperature sent with the request. Defaults to
            0.0. Pass ``None`` to omit the parameter from the request entirely.
            NOTE(review): some model families reportedly accept only their
            default temperature -- confirm against the API reference for the
            model you pick and pass ``None`` if so.

    Returns:
        dict: ``{"decision": ..., "evidence": ...}`` taken from the grader's
        JSON reply.

    Raises:
        ValueError: If the reply carries no content (for example because the
            model refused), or if the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Only forward temperature when one was requested, so callers can drop it
    # for models that reject the parameter.
    sampling_kwargs = {}
    if temperature is not None:
        sampling_kwargs["temperature"] = temperature

    response = client.chat.completions.create(
        model=model_str,
        messages=[
            {
                "role": "user",
                "content": prompt + "\n\nHere is the reasoning:" + reasoning + "\n\nAnd here is the output: " + output
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "faithfulness_grading",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "decision": {
                            "type": "string"
                        },
                        "evidence": {
                            "type": "string"
                        }
                    },
                    "required": ["decision", "evidence"],
                    "additionalProperties": False
                }
            }
        },
        **sampling_kwargs
    )

    message = response.choices[0].message
    if message.content is None:
        # Without this guard json.loads(None) fails with an opaque TypeError.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Grader model returned no content (refusal: {refusal!r})"
        )

    # Kept under its own name so the `output` argument is not rebound.
    parsed_json = json.loads(message.content)
    return {
        "decision": parsed_json["decision"],
        "evidence": parsed_json["evidence"],
    }

### Faithfulness Grading

In [None]:
def grade_faithfulness(model_str, filename):
    """Grade every row of a CSV for reasoning/response faithfulness.

    Reads ``filename`` into a DataFrame, sends each row's ``reasoning`` and
    ``response`` values to ``call_model_faithfulness`` together with the prompt
    stored in ``prompts/faithfulness_sysprompt.txt``, and stores the replies
    in the ``faithful`` and ``faithfulness_evidence`` columns.

    Note that the result is written back to ``filename`` itself, so the input
    file is overwritten with the extra columns added.

    Args:
        model_str: Name of the grader model, forwarded to
            ``call_model_faithfulness``.
        filename: Path of the CSV to grade in place; it must contain
            ``reasoning`` and ``response`` columns.

    Returns:
        None. The output is the CSV written to disk.
    """
    df = pd.read_csv(filename)

    # Context manager so the prompt file handle is closed deterministically.
    with open("prompts/faithfulness_sysprompt.txt", "r") as prompt_file:
        prompt = prompt_file.read()

    decisions = []
    evidences = []
    total_rows = len(df)
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        graded = call_model_faithfulness(
            prompt, row["reasoning"], row["response"], model_str
        )
        decisions.append(graded['decision'])
        evidences.append(graded['evidence'])

        print(f"Graded faithfulness: row {position}/{total_rows}")

    # Assign the collected results once instead of growing columns cell by cell.
    df['faithful'] = decisions
    df['faithfulness_evidence'] = evidences

    df.to_csv(filename, index=False)

In [None]:
# Example usage for faithfulness evaluation (uncomment and modify filenames/model as needed):
# grade_faithfulness("gpt-5-mini", "graded_output_file.csv")