# Validating the use of LLMs for psychological text classification

This Python notebook provides the materials to replicate the analysis of the article called "Validating the use of large language models for psychological text classification".

CONTENTS
1. Set up of notebook (importing the necessary packages and setting the API keys for use with OpenAI's GPT models).
2. Custom functions (defining the functions needed for the classifications and subsequent analyses).
3. Classifying reported speech (classification 1: extraction of reported speech from diary excerpts).
4. Classifying other-initiated repairs (classification 2: binary classification of other-initiated repairs in Reddit dialogues; for the purposes of prompting, these are called 'clarification requests' in the prompt).
5. Classifying harm (classification 3: the ordinal classification of harm reported in healthcare complaints submitted to hospitals).

In order to run this notebook, you will need the original data. Due to its sensitive nature, the data is not publicly available, but it can be shared upon request: please email the first author, Hannah Bunt, at h.l.bunt@lse.ac.uk.

# 1. Set up

In [1]:
# For general processing of data
import numpy as np
import pandas as pd
import time
import re
import os
# For fuzzy matching (reported speech)
from thefuzz import fuzz
# For plotting:
import matplotlib.pyplot as plt
# For generating classification reports and accuracy statistics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
# For progress bars
from tqdm import tqdm
from IPython.core.display import HTML
#For using the OpenAI API and accessing GPT models
import openai

In [2]:
# Configure the OpenAI credentials used by the classification calls in this notebook.
# Paste your own API key and organisation ID into the empty strings below before
# running; do not share the notebook with real credentials filled in.
oai_k = ""
openai.organization = ""
# The key is set both on the openai module and in the process environment.
openai.api_key = oai_k
os.environ['OPENAI_API_KEY'] = oai_k

# 2. Custom functions

In [2]:
def get_llm_response(messages, model = "gpt-4o", temperature=0, max_tokens = 1500, max_attempts = 3):
  '''
  Send a chat-format message list to the OpenAI chat completions endpoint and
  return the text of the first choice.

  Parameters:
    messages: list of {'role': ..., 'content': ...} dicts (see define_messages).
    model: name of the model to query.
    temperature: sampling temperature passed through to the API.
    max_tokens: completion length limit passed through to the API.
    max_attempts: how many times the request is tried before giving up.

  Returns:
    The message content of the first returned choice.

  Raises:
    RuntimeError: if no attempt succeeds (including max_attempts < 1). The
      error from the last attempt is chained as the cause.
  '''
  last_error = None
  for attempt in range(max_attempts):
    try:
      # Each request is given a 120 second timeout.
      response = openai.chat.completions.create(model=model, messages = messages, temperature=temperature, max_tokens=max_tokens, timeout = 120)
      return response.choices[0].message.content
    except Exception as e:
      last_error = e
      print(f"Error processing text on attempt {attempt+1}: {e}")

  # Every attempt failed: fail explicitly instead of falling through to an
  # unbound local variable.
  print(f"Skipping text after {max_attempts} failed attempts.")
  raise RuntimeError(f"No response obtained after {max_attempts} attempts.") from last_error

def define_messages(prompt, role, text_to_classify):
  '''
  Build the two-message chat payload for one text.

  The text is substituted into the prompt's '{}' placeholder to form the user
  message; the role string becomes the system message. All inputs are strings.
  Returns a list of two {'role', 'content'} dicts (system first, then user).
  '''
  user_content = prompt.format(text_to_classify)
  system_message = {'role': 'system', 'content': role}
  user_message = {'role': 'user', 'content' : user_content}
  return [system_message, user_message]

def run_classifier(prompt, role, input_texts, model = "gpt-4o"):
  '''
  Run one prompt over a series of texts and collect the raw LLM responses.

  Parameters:
    prompt: prompt template containing a '{}' placeholder for the text.
    role: system role message.
    input_texts: iterable of texts to classify (a list is expected).
    model: name of the model to query.

  Returns:
    A list with one response string per input text, in input order. A text
    whose request fails is recorded as the string "Error in response".
  '''
  scores = []
  for txt in tqdm(input_texts):
    message = define_messages(prompt, role, txt)
    try:
      response = get_llm_response(message, model)
    except Exception:
      # Only ordinary errors are recorded as a failed response; a bare except
      # would also swallow KeyboardInterrupt and make a long run unstoppable.
      response = "Error in response"
    scores.append(response)
  return scores

def split_string_to_list(string):
  '''
  Split a numbered-list string (e.g. "1. first 2. second") into its items.

  The separator is a 1-3 digit number followed by a full stop and 1-3
  whitespace characters, optionally preceded by up to 3 whitespace characters.
  Empty pieces are dropped. A string without such numbering is returned as a
  single-item list; an empty string gives an empty list.
  Used for parsing the reported speech output.
  '''
  # Raw string so that \s, \d and \. reach the regex engine as written rather
  # than being treated as (invalid) string escape sequences.
  regex = r'\s{0,3}\d{1,3}\.\s{1,3}'
  return [piece for piece in re.split(regex, string) if len(piece)>0]

def process_RS_scores(llm_scores_RS):
  '''
  Parse the raw reported-speech responses into lists of cleaned quotes.

  Each response is split into its numbered items with split_string_to_list.
  An item that is just the 'N/A' marker becomes an empty string; any other
  item is kept with its double-quote characters removed.

  Parameters:
    llm_scores_RS: list of raw response strings, one per input text.

  Returns:
    A list of lists of strings, one inner list per response.
  '''
  # Characters ignored around the 'N/A' marker: the marker may come back
  # wrapped in quotes or followed by whitespace or a full stop.
  marker_padding = ' \t\r\n\'"`.'
  parsed = []
  for score in llm_scores_RS:
    quotes = []
    for rs in split_string_to_list(score):
      if rs.strip(marker_padding).upper() == "N/A":
        quotes.append("")
      else:
        quotes.append(rs.replace('"',''))
    parsed.append(quotes)
  return parsed

def convert_llm_scores_binary(llm_scores):
  '''
  Map each response string to 1 or 0 - used for the clarification requests.

  A response counts as 1 when it contains "yes" (case-insensitive) anywhere in
  the text; every other response, whatever its content, counts as 0.
  '''
  return [int("yes" in answer.lower()) for answer in llm_scores]

def convert_llm_scores_Harm(llm_scores):
  '''
  Convert raw harm responses into numeric harm scores (1-6).

  Each response is lower-cased and has its spaces removed, then searched for
  the keys below in order; the first key found decides the score. Harm words
  are therefore checked before 'categoryN' labels. A response containing none
  of the keys is scored 1.
  '''
  ordered_keys = (
    ("noharm", 1),
    ("minimal", 2),
    ("minor", 3),
    ("moderate", 4),
    ("major", 5),
    ("catastrophic", 6),
    ("category1", 1),
    ("category2", 2),
    ("category3", 3),
    ("category4", 4),
    ("category5", 5),
    ("category6", 6),
  )
  fallback_score = 1

  new_scores = []
  for raw in llm_scores:
    compact = raw.lower().replace(' ', '').strip()
    matched = next((value for key, value in ordered_keys if key in compact), fallback_score)
    new_scores.append(matched)
  return new_scores

def display_confusion_matrix(human_labels, predicted_labels):
  '''
  Compute the confusion matrix of human vs predicted labels and plot it.
  Nothing is returned.
  '''
  matrix = confusion_matrix(human_labels, predicted_labels)
  ConfusionMatrixDisplay(confusion_matrix=matrix).plot()

In [6]:
# functions for fuzzy matching text extraction

def get_fuzzScore(human_quote, llm_quote, threshold = 85):
  '''
  Fuzzy-match one human-coded quote against one LLM-extracted quote.

  Parameters:
    human_quote: the quote identified by the human coder.
    llm_quote: a quote extracted by the LLM.
    threshold: fuzz.ratio score that must be exceeded to count as a match.

  Returns:
    A (label, score) tuple. The label is "true_positive" when the quotes are
    identical (score 100) or their fuzz.ratio score is above the threshold,
    and "_unsure_" otherwise.
  '''
  # An exact match must carry the same label as a fuzzy match: callers only
  # recognise "true_positive".
  if human_quote == llm_quote:
    return "true_positive", 100
  score = fuzz.ratio(human_quote, llm_quote)
  if score>threshold:
    return "true_positive" , score
  return "_unsure_", score

def get_fuzzScoreForList(human_quote, llm_quotes, threshold = 85, debug = False):
  '''
  Check whether any quote in llm_quotes is a fuzzy match for human_quote.

  Quotes are tested in order with get_fuzzScore and the search stops at the
  first "true_positive". Returns the label ("true_positive" or "_unsure_");
  with debug=True a (label, message) tuple is returned instead, where the
  message describes the match and is empty when nothing matched.
  '''
  for candidate in map(str, llm_quotes):
    label, ratio = get_fuzzScore(human_quote, candidate, threshold = threshold)
    if label != "true_positive":
      continue
    if not debug:
      return label
    report = f'Fuzzy match = {ratio}: \nHuman = "{human_quote}"\nGPT = "{candidate}"\n{"-"*40}'
    return label, report

  # No candidate matched.
  return ("_unsure_", "") if debug else "_unsure_"

def get_strongMatches(human_quote, llm_quotes, human_noQuoteCode = "", llm_noQuoteCode =""):
  '''
  Resolve the cases that can be decided without comparing quote texts.

  Parameters:
    human_quote: the human-coded quote, or human_noQuoteCode if none was found.
    llm_quotes: list of quotes extracted by the LLM (duplicates are ignored).
    human_noQuoteCode: value that marks "no quote" in the human coding.
    llm_noQuoteCode: value that marks "no quote" in the LLM output.

  Returns:
    "true_negative"  - neither the human nor the LLM found a quote.
    "false_negative" - the human found a quote but the LLM found none.
    "false_positive" - the human found none but the LLM returned one quote.
    "_unsure_"       - the quote texts need comparing (human quote present and
                       one LLM quote returned, or several distinct LLM quotes).
  '''
  # make sure quotes have no duplicates
  llm_quotes = list(set(llm_quotes))
  human_found_nothing = str(human_quote) == human_noQuoteCode

  # LLM returned an empty list: it found nothing
  if len(llm_quotes) == 0:
    return 'true_negative' if human_found_nothing else 'false_negative'

  if len(llm_quotes) == 1:
    # A single entry may just be the "no quote" marker
    if str(llm_quotes[0]) == llm_noQuoteCode:
      return "true_negative" if human_found_nothing else "false_negative"
    # if human finds nothing but llm finds something
    if human_found_nothing:
      return "false_positive"
    return "_unsure_"

  return "_unsure_"

def get_weakMatches(human_quote, llm_quotes, threshold = 85, debug = False):
  '''
  Weaker subset match: is the human quote contained in an LLM quote or vice versa?

  Only quotes longer than 10 characters are compared, so that a very short
  string (eg 'a') cannot count as a subset - raise the limit to be more cautious.
  Candidates are scored with fuzz.partial_ratio and the first score above
  the threshold gives "true_positive"; if none qualifies the result is
  "false_positive". With debug=True a (label, message) tuple is returned,
  the message being empty when nothing matched.
  '''
  min_length = 10
  no_match = ("false_positive", "") if debug else "false_positive"

  if len(human_quote) <= min_length:
    return no_match

  for candidate in map(str, llm_quotes):
    if len(candidate) <= min_length:
      continue
    overlap = fuzz.partial_ratio(human_quote, candidate)
    if overlap > threshold:
      if not debug:
        return 'true_positive'
      return 'true_positive', f'Subset match = {overlap}: \nHuman = "{human_quote}"\nGPT = "{candidate}"\n{"-"*40}'

  return no_match

# DEFINE THE MAIN FUNCTION
def isQuoteInList(human_quote, llm_quotes, human_noQuoteCode = "", llm_noQuoteCode ="",
                     threshold=85, debug=False):

  '''
  Check whether the human_quote is in the list of llm_quotes.

  Three matchers are tried in turn and the first decisive one wins:
  get_strongMatches ("strong"), get_fuzzScoreForList ("fuzzy") and
  get_weakMatches ("subset"). Returns an (accuracy_score, matchStrength) tuple.
  Raise the 'threshold' to be more cautious; with debug=True the fuzzy and
  subset matchers' messages are printed.
  '''

  def _attempt(matcher):
    # In debug mode the matchers return (label, message); print the message
    # and keep only the label.
    if debug:
      label, message = matcher(human_quote, llm_quotes, threshold, debug)
      print(message)
      return label
    return matcher(human_quote, llm_quotes, threshold, debug)

  # 1) cases decidable without comparing texts
  label = get_strongMatches(human_quote, llm_quotes, human_noQuoteCode, llm_noQuoteCode)
  if label != "_unsure_":
    return label, "strong"

  # 2) fuzzy match on whole quotes
  label = _attempt(get_fuzzScoreForList)
  if label != "_unsure_":
    return label, "fuzzy"

  # 3) subset match - always gives a final answer
  return _attempt(get_weakMatches), "subset"

# Get classification report and display confusion matrix for reported speech
def classification_report_RS(df, scoreColumn):
  '''
  Print a classification report and show a confusion matrix for reported speech.

  Each value in df[scoreColumn] is translated into a (true label, predicted
  label) pair; rows holding any other value are left out of the report.
  '''
  # outcome -> (true label, predicted label)
  outcome_to_pair = {
    "true_positive": (1, 1),
    "true_negative": (0, 0),
    "false_positive": (0, 1),
    "false_negative": (1, 0),
  }
  true_labels, predicted_labels = [], []
  for outcome in df[scoreColumn].tolist():
    if isinstance(outcome, str) and outcome in outcome_to_pair:
      actual, predicted = outcome_to_pair[outcome]
      true_labels.append(actual)
      predicted_labels.append(predicted)
  print(classification_report(true_labels, predicted_labels))
  display_confusion_matrix(true_labels, predicted_labels)

# 3. Classifying reported speech

## Prompts

In [8]:
# System role message for the reported-speech task.
role_RS = "You are a helpful research assistant that identifies reported speech."

# Minimal definition of reported speech (used by the 'min' prompt variant).
min_definition_RS ="""
DEFINITION:
Reported speech can be direct or indirect.

"""

# Full definition of reported speech (used by the 'max' prompt variants).
max_definition_RS = """
DEFINITION:
I want to teach you how to do a 'reported speech analysis' on diary entries. Reported speech is how we represent the speech of other people or what we ourselves say. There are two main types of reported speech: direct speech and indirect speech.
Direct speech repeats the exact words the person used, or how we remember their words.
Indirect speech reports the original speaker’s words are changed. Indirect speech focuses more on the content of what someone said rather than their exact words.
Speech reports consist of two parts: the reporting clause and the reported clause. The reporting clause includes a verb such as 'say', 'tell', 'ask', 'reply', 'shout', 'assert', 'whisper', 'write', 'type'. The reported clause includes what the original speaker said.
Sometimes, objects are metaphorically used when someone says or has said something. This is metonymy and should also be considered 'reported speech'.

"""

# Worked examples (only included in the 'Few' prompt variant).
examples_RS = """
EXAMPLES:
Example of direct speech: “I’m sorry,” said Mark.
Example of indirect speech: Mark apologised.
Example of direct speech: Barbara said, “I didn’t realise it was midnight.”
Example of indirect speech: Barbara said she hadn’t realised it was midnight.
Example of metonymy: I was contacted by the appliance store, the blower wheel had been delivered.
Explanation: Someone from the appliance store called the narrator saying that the blower wheel had been delivered.
Example of metonymy: Pecans are in great demand this year according to a NPR story.
Explanation: The NPR story, even though is not a person, reports that pecans are in great demand this year.
Example of metonymy: A private note asked for my perspective on certain aspects of the gay community.
Explanation: The private note 'asked' for the narrator's perspective on certain aspects of the gay community.

"""

# Task instruction. The '{}' placeholder is filled with the diary entry by
# define_messages(); the numbered-list / 'N/A' output format requested here is
# what split_string_to_list() and process_RS_scores() parse.
suffix_RS = """
INSTRUCTION:
Is there reported speech in the diary entry? If there is reported speech, provide the exact quote from the text containing the reporting clause and reported clause.
Include all the instances of reported speech and number them.
Only respond with the instances of reported speech.
If there is no reported speech in the input text, return 'N/A'.

DIARY ENTRY:
```{}```
"""

# Prompt variants: min/max = short/full definition, Zero/Few = without/with examples.
prompt_RS_minZero = "\n".join([min_definition_RS, suffix_RS])
prompt_RS_maxZero = "\n".join([max_definition_RS, suffix_RS])
prompt_RS_maxFew = "\n".join([max_definition_RS, examples_RS, suffix_RS])


## Load data

Data available upon request. 

In [10]:
# Convert input texts to a list
# NOTE: df_RS is not created in this notebook and must be loaded first (data
# available upon request). The 'Text' column is read here and the 'Manual_span'
# column is read further below.
input_texts_RS = df_RS["Text"].to_list()

## Extract reported speech

In [None]:
# Run the full-definition, few-shot prompt over every diary entry.
# This sends one request per text, so the cell can take a while to finish.
llm_scores_RS = run_classifier(prompt_RS_maxFew, role_RS, input_texts_RS, model = "gpt-4o")

In [None]:
# Turn each raw response into a list of quote strings (one list per diary entry).
llm_scores_RS2 = process_RS_scores(llm_scores_RS)

In [13]:
# Attach the parsed LLM quotes (a list of strings per diary entry) to the dataframe
df_RS['llm_hits'] = llm_scores_RS2
# split strings from Manual_span into a list of human-coded quotes
df_RS['human'] = df_RS['Manual_span'].apply(split_string_to_list)
# explode the human quotes - so there is one true human quote per row
df_RS_expl = df_RS.explode('human').reset_index(drop=True)
# Replace the values '-' with empty value
# (the empty string is the default "no quote" code used by isQuoteInList)
df_RS_expl = df_RS_expl.replace('-', "")
# categorise the matches: each row gives an (accuracy label, match strength) tuple
results_RS = df_RS_expl.apply(lambda x: isQuoteInList(x['human'], x['llm_hits']), axis=1)
# unpack the tuples into two new columns
df_RS_expl[['llm_match', 'matchStrength']] = pd.DataFrame(results_RS.tolist(), index=df_RS_expl.index)

## Predictive validity test

In [None]:
# True/False - has reported speech
def decision_function(x, cutoff = 6):
  '''
  Decide from the string form of a list of quotes whether it holds reported speech.

  The strings "['']" and "['-']" always give False; any other string gives
  True only when it is longer than `cutoff` characters.
  '''
  if x == "['']" or x == "['-']":
    return False
  return len(x) > cutoff

# Entry-level presence/absence of reported speech, taken from the un-exploded
# dataframe: each list of quotes is converted to its string form and passed
# through decision_function.
llm_trueFalse_RS = [decision_function(x) for x in df_RS["llm_hits"].astype(str).to_list() ]
human_trueFalse_RS = [decision_function(x) for x in df_RS["human"].astype(str).to_list() ]


# Compare the LLM decisions against the human coding
print(classification_report(human_trueFalse_RS, llm_trueFalse_RS))
display_confusion_matrix(human_trueFalse_RS, llm_trueFalse_RS)

## Qualitative content validity checks
The tables below are built on the exploded dataframe (one human-coded quote per row), whereas the confusion matrix above uses the original dataframe (one diary entry per row). As a result, the tables below may contain more false positives and false negatives than the confusion matrix.

In [None]:
# Keep only the columns needed for the qualitative tables (copy, so the
# exploded dataframe itself is left untouched)
df_RS_out = df_RS_expl[ ['Text', 'human', 'llm_hits', 'llm_match']].copy()
# Join each list of LLM quotes into one string, separated by HTML line breaks
df_RS_out['llm_hits'] = df_RS_out['llm_hits'].apply(lambda x: '<br><br>'.join(x))

In [None]:
# investigate false negatives
# (held in df_fn, matching the naming used for the clarification-request tables)
df_fn = df_RS_out[df_RS_out['llm_match']=='false_negative'].copy()
# add title - the Styler gets its own name so df_fn stays a DataFrame
df_fn_styled = df_fn.style.set_caption(f"Reported Speech: False Negatives (n={len(df_fn)})")
# display
with pd.option_context('display.max_colwidth', 0):
    display(HTML(df_fn_styled.to_html(escape=False)))

In [None]:
# investigate false positives
# (held in df_fp, matching the naming used for the clarification-request tables)
df_fp = df_RS_out[df_RS_out['llm_match']=='false_positive'].copy()
# add title - the Styler gets its own name so df_fp stays a DataFrame
df_fp_styled = df_fp.style.set_caption(f"Reported Speech: False Positives (n={len(df_fp)})")
# display
with pd.option_context('display.max_colwidth', 0):
    display(HTML(df_fp_styled.to_html(escape=False)))

# 4. Classifying other-initiated repairs

## Prompts

In [8]:
# System role message for the clarification-request (other-initiated repair) task.
role_CR = """
You are a helpful research assistant that identifies clarification requests.

"""

# Minimal definition (used by the 'min' prompt variant).
min_definition_CR = """
DEFINITION:
Clarification requests are questions or statements designed to address problems of miscommunication and misunderstanding.

"""

# Full definition with the four request types (used by the 'max' prompt variants).
max_definition_CR = """
DEFINITION:
Clarification requests occur in response to unclear or ambiguous communication and aim to enhance comprehension and avoid misunderstandings.
They are essentially questions that help individuals gather more information or confirm their assumptions, promoting precise understanding and reducing the risk of misinterpretation.
There are four types of clarification requests:
Type 1: Direct expressions of confusion, where the individual seeks a repetition or rephrasing of the original statement.
Type 2: Targeted inquiries aimed at extracting specific elements or details within the provided information.
Type 3: Proposing a suggestion or hypothesis to confirm understanding, essentially narrowing down possibilities to affirm comprehension.
Type 4: A rhetorical question (of types 1-3) that can be answered as if it were a clarification request.

"""

# Worked examples (only included in the 'Few' prompt variant).
examples_CR = """
EXAMPLES
----
SPEAKER A:
"I think I’ve seen this plotline somewhere before”

SPEAKER B:
"Are you thinking of 'Cryptonomicon' by Neal Stephenson? The main character cracks codes during World War II and there's this intricate connection to modern cryptography and technology. I remember reading it a while back and being hooked."

Expected response:
YES
----
SPEAKER A:
“Yeah this really makes me think of the Star Wars robot”

SPEAKER B:
"R2-D2 or C-3PO?"

Expected response:
YES
----
SPEAKER A:
"Title: Marvel's Multiverse Mayhem Teaser Body: "

SPEAKER B:
"Did you also catch the new DC multiverse trailer? I'm honestly on the fence about it. The multiverse concept is intriguing, but I'm curious to hear what others think. Is it living up to the hype or falling flat for you?"

Expected response:
YES
----

"""

# Task instruction. The '{}' placeholder is filled with the interaction text by
# define_messages(); the YES/NO answer requested here is what
# convert_llm_scores_binary() converts to 1/0.
instruction_CR = """
INSTRUCTION:
In the interaction below, does speaker B use a clarification request in their response to speaker A?
Respond with 'YES' if there is a clarification request. Respond with 'NO' if not.
Answer 'YES' if unsure.
Answer 'YES' if the clarification request is rhetorical.

INTERACTION:
```{}```
Expected response:
"""

# Prompt variants: min/max = short/full definition, Zero/Few = without/with examples.
prompt_minZero_CR = "\n".join([min_definition_CR, instruction_CR])
prompt_maxZero_CR = "\n".join([max_definition_CR, instruction_CR])
prompt_maxFew_CR = "\n".join([max_definition_CR, examples_CR, instruction_CR])

## Load data

Data is available upon request.

In [53]:
# Create the input text lists for the classifier
# NOTE: CR_0 is not created in this notebook and must be loaded first (data
# available upon request). Its 'text' and 'labels' columns are read here.
input_texts_CR = CR_0["text"].to_list() 
true_labels_CR = CR_0["labels"].to_list()

## Classify the interactions

In [None]:
# Choose a prompt type and get the GPT scores
# (swap prompt_maxFew_CR for prompt_minZero_CR or prompt_maxZero_CR to test
# the other prompt variants; one request is sent per interaction)
llm_scores_CR = run_classifier(prompt_maxFew_CR, role_CR, input_texts_CR, model = "gpt-4o")
# Convert GPT scores to binary (1 if the response contains "yes", otherwise 0)
llm_binary_CR = convert_llm_scores_binary(llm_scores_CR)

## Predictive validity test

In [None]:
# Compare the binary LLM classifications against the human labels
print(classification_report(true_labels_CR, llm_binary_CR))
display_confusion_matrix(true_labels_CR, llm_binary_CR)

## Qualitative content validity checks

In [63]:
# Newlines become <br> so the interactions keep their line breaks when the
# tables below are rendered as HTML
texts_CR = [x.replace('\n', '<br>') for x in input_texts_CR]
# Labels as booleans, so the tables can be filtered on True/False
human_labels_CR = [bool(x) for x in true_labels_CR]
llm_labels_CR = [bool(x) for x in llm_binary_CR]
# One row per interaction: text, human label, LLM label
df_CR = pd.DataFrame({'texts': texts_CR, 'human_labels': human_labels_CR, 'llm_labels': llm_labels_CR})

In [55]:
# examine false negatives
with pd.option_context('display.max_colwidth', 0):
    # select rows
    df_fn = df_CR[(df_CR['human_labels']==True)&(df_CR['llm_labels']==False)]
    # add title
    # (len(df_fn) is evaluated while df_fn is still the DataFrame, i.e. before
    # the name is rebound to the Styler returned by set_caption)
    df_fn = df_fn.style.set_caption(f"Clarification Requests: False Negatives (Human=True, LLM=False, n={len(df_fn)})")
    # display as html with line breaks
    display(HTML(df_fn.to_html(escape=False)))

In [56]:
# examine false positives
with pd.option_context('display.max_colwidth', 0):
    # select rows
    df_fp = df_CR[(df_CR['human_labels']==False)&(df_CR['llm_labels']==True)]
    # add title
    # (len(df_fp) is evaluated while df_fp is still the DataFrame, i.e. before
    # the name is rebound to the Styler returned by set_caption)
    df_fp = df_fp.style.set_caption(f"Clarification Requests: False Positives. (Human=False, LLM=True, n={len(df_fp)})")
    # display as html with line breaks
    display(HTML(df_fp.to_html(escape=False)))

# 5. Classifying harm

## Complaint summary prompt

In [8]:
# System role message for the complaint-cleaning step.
role_Summary = "You are an AI designed to clean text data."

# Prompt asking the model to rewrite a redacted complaint into legible text.
# The '{}' placeholder is filled with the complaint text by define_messages().
prompt_Summary = """
INSTRUCTIONS:
Below is a hospital complaint.  Any identifiable information has been redacted. As a result, the complaints are difficult to read. 
Please: (1) fill in the gaps in the text with the most appropriate named entity placeholders, (2) clean the text so that it is legible, and (3) return only new text. 
COMPLAINT:
```{}```
NEW COMPLAINT:
"""

## Prompts

In [8]:
# System role message for the harm-severity task.
role_Harm = "Your role is to methodically evaluate the level of severity in patient complaints concerning healthcare services. Your assessment should be based solely on the direct outcomes of accidents or negligence by healthcare staff, impacting the patient's health and daily life."

# Minimal definition of severity (used by the 'min' prompt variant).
min_definition_Harm = """
DEFINITION:
Severity in healthcare incidents is measured by the direct negative impact on a patient's health and daily activities, originating solely from healthcare staff's accidents or negligence. 
This ranges from no impact ('Category 1') to severe, life-altering impacts ('Category 6').

"""

# Full definition with all six severity levels (used by the 'max' prompt variants).
max_definition_Harm = """
DEFINITION:
Severity in healthcare incidents is measured by the direct negative impact on a patient's health and daily activities, originating solely from healthcare staff's accidents or negligence. 
This ranges from no impact ('Category 1') to severe, life-altering impacts ('Category 6').

Severity Levels:
Category 1: No impact - no treatment or intervention needed.
Category 2: Minimal impact - minor treatment, no work absence.
Category 3: Minor impact - some treatment, <3 days off work, <4 days extra hospital stay.
Category 4: Moderate impact - significant treatment, 4-14 days off work, 4-15 days extra hospital stay, reportable incidents.
Category 5: Major impact - long-term health consequences, >14 days off work, >15 days extra hospital stay.
Category 6: Catastrophic impact - death, irreversible harm, large-scale patient impact.

"""

# Examples per category (only included in the 'Few' prompt variant).
examples_Harm = """
EXAMPLES:
Category 1: No reported health impact.
Category 2: Non-consumed medication errors, minor injuries like bruises.
Category 3: Non-harmful medication errors, minor conditions like grade 1 pressure ulcers, minor physical or mental health issues without work absence.
Category 4: Harmful medication errors, moderate conditions like grade 2/3 pressure ulcers, infections, misinformation impacting patient transfer, injuries requiring some medical attention.
Category 5: Serious medication errors with severe effects, grade 4 pressure ulcers, long-term infections, surgical errors requiring additional surgery, serious injuries like loss of a limb.
Category 6: Fatal incidents, suicide, severe surgical errors like wrongful amputation, paralysis, serious assaults.

"""

# Task instruction. The '{}' placeholder is filled with the complaint text by
# define_messages(); the 'Category N' answer requested here is what
# convert_llm_scores_Harm() converts to a number.
instructions_Harm = """
INSTRUCTIONS:
Analyse a healthcare service complaint using these steps:
Step 1: Read the patient complaint.
Step 2: Evaluate the severity of direct negative impacts on the patient's health and daily life caused solely by hospital staff. Consider only the facts presented in the complaint.
Step 3: Categorize the severity level using the provided six categories. Only clear and unambiguous evidence should lead to categorizing an incident. 
Step 4: Respond only with the relevant severity category (e.g., 'Category 1', 'Category 2', etc.).

COMPLAINT:
```{}```
CATEGORY:

"""

# Prompt variants: min/max = short/full definition, Zero/Few = without/with examples.
prompt_Harm_minZero = "\n".join([min_definition_Harm, instructions_Harm])
prompt_Harm_maxZero = "\n".join([max_definition_Harm, instructions_Harm])
prompt_Harm_maxFew = "\n".join([max_definition_Harm, examples_Harm, instructions_Harm])

## Load harm data

In [31]:
# convert Harm to number
# (human harm labels mapped onto the 1-6 scale; the same numbering is produced
# by convert_llm_scores_Harm for the LLM responses)
Harm_dic = {'NoHarm':1, 'Minimal':2, 'Minor':3, 'Moderate':4, 'Major':5, 'Catastrophic':6}
# NOTE(review): the column is written to df_Harm0, while the cells below read
# 'Harm_num' from df_Harm. Neither dataframe is created in this notebook -
# confirm how df_Harm is derived from df_Harm0 when loading the data.
df_Harm0['Harm_num'] = df_Harm0['Harm'].replace(Harm_dic)

## Summarise harm data

In [None]:
# Summarising harm:
# each complaint in the 'Text' column is sent through the cleaning prompt and
# the rewritten text is stored in a new 'summaries' column (one request per complaint)
input_texts_Harm = df_Harm["Text"].to_list()
df_Harm["summaries"] = run_classifier(prompt_Summary, role_Summary, input_texts_Harm, model = "gpt-4o")

## Classifying harm with LLM

In [None]:
# Classify the cleaned complaints (the 'summaries' column, not the raw 'Text').
# Note that input_texts_Harm is re-bound here to the cleaned texts.
input_texts_Harm = df_Harm["summaries"].to_list() 
llm_scores_Harm = run_classifier(prompt_Harm_maxFew, role_Harm, input_texts_Harm, model = "gpt-4o")

In [38]:
# Add the new columns to the existing df DataFrame
# 'llm_harm' keeps the raw response text; 'llm_harm_num' is its 1-6 score
df_Harm['llm_harm'] = llm_scores_Harm
df_Harm['llm_harm_num'] = convert_llm_scores_Harm(llm_scores_Harm)

## Predictive validity test

In [None]:
# Numeric (1-6) harm ratings from the human coders and from the LLM
human_harm = df_Harm['Harm_num'].to_list()
llm_harm = df_Harm['llm_harm_num'].to_list()
# Share of complaints where both ratings are exactly equal
percent_agreement = sum([1 for i, j in zip(human_harm, llm_harm) if i == j])/len(human_harm)
print(f'Percent agreement: {percent_agreement:.2f}')
# Quadratic weights penalise disagreements by the squared distance between
# the two ratings, which suits the ordinal harm scale
kappa = cohen_kappa_score(human_harm, llm_harm, weights='quadratic')
print(f'Cohens Weighted Kappa for ordinal classifications: {kappa:.2f}')

## Qualitative content validity checks

In [41]:
# Work on a copy so df_Harm keeps its numeric columns
df_Harm2 = df_Harm.copy()
# Signed difference: positive when the human rating is higher than the LLM rating
df_Harm2['discrepancy'] = df_Harm2['Harm_num'] - df_Harm2['llm_harm_num']
# The numeric columns are no longer needed once the discrepancy is stored
df_Harm2 = df_Harm2.drop(columns=['Harm_num', 'llm_harm_num'])
# Tidy the raw LLM response for display
df_Harm2['llm_harm'] = df_Harm2['llm_harm'].str.replace('"', '').str.strip()
# Make line breaks in the complaint text visible as ' ----- ' separators
df_Harm2['Text'] = df_Harm2['Text'].str.replace('\n', ' ----- ')


In [None]:
# investigate disagreements between the human and LLM harm ratings
# A non-zero discrepancy means the numeric ratings differ. Comparing the raw
# 'Harm' labels (e.g. 'Minor') with the raw LLM text (e.g. 'Category 3') would
# flag rows even when the two ratings agree.
df_disagree = df_Harm2[df_Harm2['discrepancy'] != 0].copy()
# sort by discrepancy
df_disagree = df_disagree.sort_values(by=['discrepancy'], ascending=False)
# add title - n counts the disagreements shown, not all complaints
df_disagree_styled = df_disagree.style.set_caption(f"Harm: Disagreements (n={len(df_disagree)})")
# display
with pd.option_context('display.max_colwidth', 0):
    display(df_disagree_styled)