# What impact does emotional tone (anger, sadness, and anxiety) in prompts have on the Large Language Model responses?

Which emotion has the highest impact on the quality of LLM responses? We believe that these emotions (anger, sadness, and anxiety) may have varying effects on LLM output quality. For example, prompts with high anxiety may produce 'better' responses because the model could sense the urgency in the situation.

How can we do this? We must curate public datasets of real conversations between users and LLMs. In this project we will primarily be working with WildChat and ShareGPT52k, which hold around 1 million conversations.

To analyze emotional tone in prompts, we utilize a proprietary service, LIWC, that calculates the percentage of words in a sentence that relate to specific categories. To measure Large Language Model responses, we will use either BigBench or HELM, whichever we get to work. Additionally, we use matplotlib and seaborn to visualize our findings.

In step 1, we load the data from HuggingFace and filter it based on keywords we believe pertain to the outputs we're measuring (in the code below, this means keeping short English prompts of fewer than 50 words).

In [17]:
# Load the two public conversation datasets from the Hugging Face Hub.
from datasets import load_dataset
# WildChat: the full 'train' split is loaded as a regular (non-streaming) dataset.
wildchat = load_dataset("allenai/WildChat-1M", split='train')
# ShareGPT: streaming=True is requested, so records are consumed by iterating over `sharegpt`.
sharegpt = load_dataset('RyokoAI/ShareGPT52K', split='train', streaming=True)

In [18]:
# Collect up to 150 short English (prompt, response) pairs from WildChat,
# using only the first user turn and the turn that immediately follows it.
wildchat_convo = []

MAX_WORD_COUNT = 50  # Exclusive upper bound on prompt length, in whitespace-separated words

# NOTE(review): depending on the installed `datasets` version,
# `wildchat['conversation']` may materialise the whole column before the loop
# starts; consider iterating lazily if memory becomes a problem.
for conversation in wildchat['conversation']:
    # Guard against records with no turns; indexing [0] would raise IndexError.
    if not conversation:
        continue

    # Collect only the first turn of each conversation
    user_turn = conversation[0]
    if user_turn.get('language') != "English":
        continue

    # `content` may be absent or None; normalise to a stripped string either way.
    prompt = (user_turn.get('content') or '').strip()

    # Keep prompts that are non-empty and not too large.
    word_count = len(prompt.split())
    if not 0 < word_count < MAX_WORD_COUNT:
        continue

    # The reply is optional: single-turn conversations get an empty response.
    response = ''
    if len(conversation) > 1:
        response = (conversation[1].get('content') or '').strip()

    wildchat_convo.append({'prompt': prompt, 'response': response})

    # Stop as soon as the sample is large enough.
    if len(wildchat_convo) >= 150:
        break

In [19]:
from langdetect import detect

sharegpt_convo = []

# Helper function
def is_english(text):
    """Return True when langdetect classifies *text* as English ('en').

    Detection failures reported by langdetect (it raises
    ``LangDetectException`` when it cannot classify the text) are treated as
    "not English" and yield False. Other exceptions, in particular
    ``KeyboardInterrupt``, are deliberately NOT swallowed so that a
    long-running filtering loop can still be interrupted.
    """
    # Local import: keeps the helper self-contained without touching the
    # file-level import block.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

MAX_WORD_COUNT = 50  # Exclusive upper bound on prompt length, in whitespace-separated words

# Collect up to 150 short English (prompt, response) pairs from the ShareGPT stream.
for example in sharegpt:
    messages = example.get("conversations", [])

    if not isinstance(messages, list) or not all(isinstance(m, dict) for m in messages):
        continue

    if len(messages) < 2:
        continue  # Need at least one user and one assistant message

    # NOTE(review): the first two messages are taken as (user, assistant)
    # without checking their "from" field - confirm against the dataset schema.
    user, bot = messages[0], messages[1]

    # "value" may be missing or None; normalise to a stripped string either way.
    prompt = (user.get("value") or "").strip()
    response = (bot.get("value") or "").strip()

    # Cheap word-count test first, so over-long prompts never reach the
    # comparatively expensive language detection.
    if (
        len(prompt.split()) < MAX_WORD_COUNT
        and is_english(prompt)
        and is_english(response)
    ):
        sharegpt_convo.append({"prompt": prompt, "response": response})

    if len(sharegpt_convo) >= 150:
        break

In [20]:
# Pool the filtered WildChat and ShareGPT samples into a single list of
# {'prompt': ..., 'response': ...} dicts (WildChat entries first).
convos = wildchat_convo + sharegpt_convo

In step 2, we will run our natural prompts through LangChain, using the supported gemma2-9b-it and llama-3.3-70b-versatile models to vary our prompts and produce the corresponding model responses.

In [22]:
!pip install --quiet langchain langchain-groq  langchain-core

GROQ_API_KEY=""

from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

chat = ChatGroq(temperature=2, groq_api_key=GROQ_API_KEY, model_name="gemma2-9b-it") # And 'llama-3.3-70b-versatile'

In [23]:
# Minimal chat template: a fixed system message plus one human message whose
# entire content is supplied at call time through the `text` variable.
system = "You are an assistant."
human = "{text}"
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])
# Pipe the template into the chat model; the result is invoked with {"text": ...}.
chain = prompt | chat

def emo_gen(text, emotion):
    """Ask the model for five rephrasings of *text* with rising *emotion*.

    Args:
        text: The original prompt to rephrase.
        emotion: Emotion word interpolated into the instructions
            (e.g. "anger", "sadness").

    Returns:
        The raw text content of the model's reply. The instructions request a
        numbered list (1-5), but the reply is returned unparsed and is not
        validated here.

    Any exception raised by ``chain.invoke`` propagates to the caller.
    """
    # All five list items are spelled out so the requested format is
    # unambiguous (no literal "..." placeholder, no skipped number).
    response = chain.invoke({
        "text": f"""Generate 5 rephrasings of the following prompt that reflect increasing levels of {emotion}, from 1 (very mild) to 5 (extremely intense). 
Number each version on a new line like this (keep your answer between 150-200 words with no emojis):

1. [mild {emotion}]
2. [slightly more intense {emotion}]
3. [moderately intense {emotion}]
4. [very intense {emotion}]
5. [most intense {emotion}]

Prompt: "{text}"
"""
    })
    return response.content

In [25]:
import re

# One match per numbered line: group 1 is the number, group 2 the rest of the line.
pattern = r"(?m)^(\d+)\.\s+(.*)"

anger_prompts = []
sad_prompts = []
anxious_prompts = []

joyful_prompts = []
gratitude_prompts = []
hopeful_prompts = []

# Emotion word sent to the model -> list that collects its 5-item rephrasing groups.
# Dict order fixes the order of the API calls made for each source prompt.
emotion_targets = {
    "anger": anger_prompts,
    "sadness": sad_prompts,
    "anxiety": anxious_prompts,
    "Joyfulness": joyful_prompts,
    "Gratitude": gratitude_prompts,
    "Hopefulness": hopeful_prompts,
}

# A reply is usable only if it contains exactly the items 1..5 in order,
# because the position in the group is later used as the intensity level.
EXPECTED_NUMBERS = [1, 2, 3, 4, 5]

for row in convos:
    # Named `base_prompt` so the module-level `prompt` template is not overwritten.
    base_prompt = row['prompt']

    for emotion, collected in emotion_targets.items():
        try:
            raw_reply = emo_gen(base_prompt, emotion)
        except Exception as e:
            # Best effort: one failed API call must not abort the whole run.
            print(f"[{emotion}] Error rephrasing prompt: {base_prompt}\n{e}")
            continue

        matches = re.findall(pattern, raw_reply)
        numbers = [int(number) for number, _ in matches]

        # Replies that do not follow the requested numbering are skipped.
        if numbers == EXPECTED_NUMBERS:
            collected.append([sentence for _, sentence in matches])

In [None]:
from tqdm import tqdm

def generate_emotion_responses(prompt_groups, label="Emotion"):
    """Query the model once per rephrased prompt and collect the replies.

    Args:
        prompt_groups: Iterable of prompt groups; the position of a prompt
            inside its group (starting at 1) is recorded as its intensity.
        label: Name shown in the progress bar and in error messages.

    Returns:
        A list of dicts with keys 'intensity', 'prompt' and 'response'.
        Prompts whose request raised an exception are reported via ``print``
        and left out of the list.
    """
    length_hint = "\n\n(Keep your answer between 300-400 words with no emojis.)"
    records = []
    progress = tqdm(prompt_groups, desc=f"Generating responses for {label}")

    for group in progress:
        # Levels 1–5
        for level, prompt_text in enumerate(group, start=1):
            try:
                reply = chain.invoke({"text": f"{prompt_text}{length_hint}"})
                records.append({
                    "intensity": level,
                    "prompt": prompt_text,
                    "response": reply.content
                })
            except Exception as e:
                print(f"[{label}] Error with prompt: {prompt_text}\n{e}")

    return records

# Generate model responses for every rephrased prompt, one emotion at a time.
# Negative-tone emotions:
anger_convos = generate_emotion_responses(anger_prompts, label="Anger")
sad_convos = generate_emotion_responses(sad_prompts, label="Sad")
anxious_convos = generate_emotion_responses(anxious_prompts, label="Anxious")

# Positive-tone emotions:
joyful_convos = generate_emotion_responses(joyful_prompts, label="Joyfulness")
gratitude_convos = generate_emotion_responses(gratitude_prompts, label="Gratitude")
hopeful_convos = generate_emotion_responses(hopeful_prompts, label="Hopefulness")

In [27]:
import json

# Bundle the per-emotion response records under short emotion keys.
# A separate name is used so the `convos` list of source conversations is not
# overwritten; otherwise re-running the rephrasing cell would iterate over
# these dict keys instead of conversation records.
emotion_convos = {
    "anger": anger_convos,
    "sad": sad_convos,
    "anxious": anxious_convos,
    "joyful": joyful_convos,
    "gratitude": gratitude_convos,
    "hopeful": hopeful_convos
}

# NOTE(review): the file name is fixed to the gemma run; change it when the
# chat client is switched to another model so results are not overwritten.
with open("gemma_convos.json", "w", encoding="utf-8") as f:
    json.dump(emotion_convos, f, indent=2)

In step 3, we measure our outputs using metrics of perplexity, lexical diversity, and readability.

In [None]:
import os
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import textstat
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK 'punkt' and 'punkt_tab' resources without progress output
# (presumably required by `word_tokenize`; verify for the installed NLTK version).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# --- Perplexity Calculator ---
class PerplexityCalculator:
    """Score text with a causal language model and report its perplexity.

    Args:
        model_name: Name passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.
        max_length: Maximum number of tokens scored per text; longer inputs
            are truncated by the tokenizer. Defaults to 1024, the value
            previously hard-coded for 'gpt2'.
    """

    def __init__(self, model_name='gpt2', max_length=1024):
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        # Inference only: make sure the model is in evaluation mode.
        self.model.eval()

    def perplexity(self, text):
        """Return exp(mean token loss) of *text* under the model.

        Returns ``float('nan')`` when the text tokenizes to fewer than two
        tokens: there is then no next-token target to score, so a perplexity
        is undefined (and the model call may fail on an empty input).
        """
        enc = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
        ).to(self.model.device)

        if enc['input_ids'].shape[-1] < 2:
            return float('nan')

        with torch.no_grad():
            # Passing the inputs as labels makes the model return its own
            # language-modelling loss for this text.
            loss = self.model(**enc, labels=enc['input_ids']).loss
        return torch.exp(loss).item()

# --- Lexical diversity ---
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in *text*.

    The text is lower-cased before tokenization with ``word_tokenize``.
    A text that yields no tokens scores 0.0.
    """
    words = word_tokenize(text.lower())
    if not words:
        return 0.0
    distinct = set(words)
    return len(distinct) / len(words)

# --- Evaluate a list of responses ---
def evaluate_data(data, pp_calc):
    """Compute the three text metrics for every response record.

    Args:
        data: Iterable of dicts; 'response' (default '') is scored and
            'intensity' (default None) is carried over unchanged.
        pp_calc: Object exposing ``perplexity(text)``.

    Returns:
        A DataFrame with one row per record and the columns 'intensity',
        'readability', 'perplexity' and 'lex_diversity'.
    """
    def score(item):
        # Metrics are evaluated in the same order as the resulting columns.
        reply = item.get('response', '')
        return {
            'intensity': item.get('intensity', None),
            'readability': textstat.flesch_reading_ease(reply),
            'perplexity': pp_calc.perplexity(reply),
            'lex_diversity': lexical_diversity(reply)
        }

    return pd.DataFrame([score(item) for item in data])

def plot_grouped_metric(all_metrics, metric_name):
    """Draw and save one grouped bar chart per model for *metric_name*.

    Args:
        all_metrics: Mapping ``model -> {emotion -> DataFrame}`` where each
            DataFrame has an 'intensity' column and a *metric_name* column.
        metric_name: Column to plot on the y-axis.

    For every model with data, the figure is written to
    ``"{model}_{metric_name}_grouped.png"`` and shown. Models without any
    plottable rows are reported and skipped.
    """
    intensity_levels = [1, 2, 3, 4, 5]
    metric_label = metric_name.replace('_', ' ').title()

    for model, emotion_frames in all_metrics.items():
        # Build the long-form table column-wise instead of row by row.
        frames = [
            df[['intensity', metric_name]]
            .rename(columns={metric_name: 'value'})
            .assign(emotion=emotion)
            for emotion, df in emotion_frames.items()
            if not df.empty
        ]
        if frames:
            # Rows without an intensity cannot be placed on the x-axis.
            plot_df = pd.concat(frames, ignore_index=True).dropna(subset=['intensity'])
        else:
            plot_df = pd.DataFrame()

        if plot_df.empty:
            print(f"⚠ Nothing to plot for {model} / {metric_name}")
            continue

        plot_df = plot_df.assign(intensity=plot_df['intensity'].astype(int))  # ensure numeric

        plt.figure(figsize=(10, 6))
        sns.barplot(
            data=plot_df,
            x='intensity',
            y='value',
            hue='emotion',
            # Fixing the category order keeps every bar under its real
            # intensity label even when a level is missing from the data;
            # overriding the tick labels afterwards would mislabel them.
            order=intensity_levels,
            palette='Set2'
        )
        plt.title(f"{model.upper()} – {metric_label} by Emotion and Intensity")
        plt.xlabel("Intensity (1–5)")
        plt.ylabel(metric_label)
        plt.legend(title="Emotion", bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        filename = f"{model}_{metric_name}_grouped.png"
        plt.savefig(filename)
        print(f"📊 Saved grouped plot → {filename}")
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close()

# --- Main ---
def main(model_files=None):
    """Score the saved conversations of each model and plot the metrics.

    Args:
        model_files: Optional mapping ``model name -> path`` of JSON files,
            each holding a dict of ``emotion -> list of response records``.
            Defaults to the gemma and llama files in the working directory.

    For every file that exists, one CSV per emotion is written to
    ``"{model}_{emotion}_metrics.csv"``; afterwards one grouped plot per
    metric is produced through ``plot_grouped_metric``.
    """
    if model_files is None:
        model_files = {
            'gemma': 'gemma_convos.json',
            'llama': 'llama_convos.json'
        }

    # Find the usable inputs first so the language model is not loaded
    # when there is nothing to evaluate.
    available = {}
    for model_name, filepath in model_files.items():
        if os.path.exists(filepath):
            available[model_name] = filepath
        else:
            print(f"❌ File not found: {filepath}")
    if not available:
        return

    # Initialize perplexity calculator
    pp_calc = PerplexityCalculator('gpt2')

    # Storage for all results
    all_metrics = {}

    for model_name, filepath in available.items():
        with open(filepath, 'r', encoding='utf-8') as f:
            model_data = json.load(f)  # This is a dict of emotion → list of dicts

        all_metrics[model_name] = {}

        for emotion, responses in model_data.items():
            print(f"▶ Evaluating {model_name} / {emotion}...")
            df = evaluate_data(responses, pp_calc)
            all_metrics[model_name][emotion] = df

            out_file = f"{model_name}_{emotion}_metrics.csv"
            df.to_csv(out_file, index=False)
            print(f"✔ Saved → {out_file}")

    # --- Plot comparison across models ---
    for metric in ['readability', 'perplexity', 'lex_diversity']:
        plot_grouped_metric(all_metrics, metric)

if __name__ == "__main__":
    main()