In [8]:
import pandas as pd
import os
from PIL import Image
if os.getcwd().split(os.sep)[-1] == 'notebooks':
    os.chdir('..') # set/ ccwd as the parent directory to make imports easier
import json
import ast
import re

from serve.utils_llm import get_llm_output

In [9]:
df = pd.read_csv('results.csv')
# Step 1: Split the 'prompt' into 'question' and 'answers'
df['question'] = df['prompt'].apply(lambda x: x.split("\nModel A: ")[0])
df['answers'] = df['prompt'].apply(lambda x: x.split("\nModel A: ")[1])

# Step 2: Further split 'answers' into 'Model A' and 'Model B'
df['answer_a'] = df['answers'].apply(lambda x: x.split("\nModel B ")[0].strip("[]").replace("'", ""))
df['answer_b'] = df['answers'].apply(lambda x: x.split("\nModel B: ")[1].strip("[]").replace("'", ""))

# Clean up
df.drop('answers', axis=1, inplace=True)  # Remove the intermediate 'answers' column

In [10]:
# Regular expression to match entities: Capitalized words or phrases followed by a colon
# regex_pattern = r'-\s*([A-Za-z ]+):'
regex_pattern = r'-\s*(?:\*\*)?([A-Za-z ]+?)(?:\*\*)?:'

def extract_entities(text):
    matches = re.findall(regex_pattern, text)
    return [m for m in matches if m not in ["Model A", "Model B"]]

df['entities'] = df['response'].apply(extract_entities)

smaller_systems_prompt = "You are a helpful assistant. Your outputs adhere to the format given by the user."

In [11]:
# print(df[df['entities'].apply(len) == 0].iloc[0]['response'])
all_entities = set([e for entities in df['entities'] for e in entities])

def extract_axis_descriptions(text):

    lines = text.strip().split('\n')

    # Initialize variables to construct the sections
    sections = []
    current_section = []

    # Process each line, building sections while excluding model scores
    for line in lines:
        # Check if the line starts a new section or is part of the current one
        if line.startswith('- ') and current_section:  # If starting a new section and there is a current section
            # Join the current section lines and add to sections
            sections.append('\n'.join(current_section).strip().replace("- ", "").replace("\n", ""))
            current_section = [line]  # Start a new section
        elif "Model A Score" not in line and "Model B Score" not in line:
            # If the line is not a model score, add it to the current section
            current_section.append(line)

    # Don't forget to add the last section
    if current_section:
        sections.append('\n'.join(current_section).strip().replace("- ", "").replace("\n", ""))
    return sections

df['axis_description'] = df['axis_response'].apply(extract_axis_descriptions)
df.head()

Unnamed: 0,prompt,response,axes,axis_response,question,answer_a,answer_b,entities,axis_description
0,Is it unreasonable to double your investment y...,- Complexity of language: \n - Model A uses m...,"['Complexity of language', 'Specificity of exa...","- Complexity of language:\n High: Formal, t...",Is it unreasonable to double your investment y...,I know it may not last longer but i was able t...,It is not unreasonable to aim for a high rate ...,"[Complexity of language, Specificity of exampl...","[Complexity of language: High: Formal, tech..."
1,How a computer randomly generates something Ho...,- Complexity of Explanation: Model A provides ...,"['Complexity of Explanation', 'Emphasis on Pre...",- Complexity of Explanation:\n High: Techni...,How a computer randomly generates something Ho...,"""Computers have no imagination , they just do ...","To generate a random link, the computer uses a...","[Complexity of Explanation, Emphasis on Predic...",[Complexity of Explanation: High: Technical...
2,"Q. What can cause panic attacks?Hi doctor,I ha...",- Coverage of Information: Model B provides a ...,"['Coverage of Information', 'Engagement with t...",- Coverage of Information:\n High: Detailed...,"Q. What can cause panic attacks?Hi doctor,I ha...",Hi. For further information consult a psychiat...,There are several potential causes of panic at...,"[Coverage of Information, Engagement with the ...",[Coverage of Information: High: Detailed in...
3,The differences between a liberal and a conser...,- **Scope of Explanation**: \n - **Model A**:...,[],- Scope of Explanation:\n High: Detailed an...,The differences between a liberal and a conser...,The term liberal used to be synonymous with li...,"""Liberal and conservative are two political id...","[Scope of Explanation, Level of Detail, Tone a...",[Scope of Explanation: High: Detailed analy...
4,Why does bright light ( i.e. a flashlight ) ca...,- Depth of Explanation: Model A provides a mor...,"['Depth of Explanation', 'Biological Terminolo...","- Depth of Explanation:\n High: Detailed, l...",Why does bright light ( i.e. a flashlight ) ca...,"""The iris controls the amount of light that en...","When you shine a bright light in your eyes, it...","[Depth of Explanation, Biological Terminology ...","[Depth of Explanation: High: Detailed, laye..."


In [18]:
all_axis_descriptions = set([e for entities in df['axis_description'] for e in entities])
dup_axis_descriptions = [e for entities in df['axis_description'] for e in entities]
len(all_axis_descriptions), len(dup_axis_descriptions)

(333, 333)

In [19]:
all_axis_descriptions

{'Accessibility:    High: Relatable examples, everyday scenarios.    Low: Focus on anatomy and physiology.',
 'Actionability:    High: Urgency for medical attention    Low: General advice',
 'Actionable Advice:    High: Explicit instructions, specific steps    Low: Considerations without instructions',
 'Adaptability:    High: Customizable and flexible    Low: Predefined rules',
 "Additional Information:    High: Includes extra context    Low: Doesn't expand beyond query",
 'Advice Clarity:    High: Direct, specific guidance    Low: Lacks clear advice',
 'Analogical Approach:    High: Uses analogies, simple language.    Low: Relies on technical terms.',
 'Analogical Reasoning:    High: Heavy analogies and visualizations used.    Low: Direct factual language employed.',
 'Analogies Used:    High: Relatable everyday examples, vivid analogies    Low: Listing potential factors, straightforward explanation',
 'Analogies and Comparisons:    High: Uses analogies for clarity.    Low: Presents 

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Your list of axes of variation and descriptions
axes_descriptions = all_axis_descriptions

# Extract just the descriptions for vectorization
descriptions = [desc.split(": ", 1)[1] for desc in axes_descriptions]

# Vectorize the descriptions
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(descriptions)

# Cluster the vectorized descriptions
n_clusters = 10  # Adjust based on your preference or use methods like Elbow to find the optimal number
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Assign each description to a cluster
clusters = kmeans.labels_

# Group axes by cluster
grouped_axes = {i: [] for i in range(n_clusters)}
for axis, cluster in zip(axes_descriptions, clusters):
    grouped_axes[cluster].append(axis)

# Print grouped axes
for cluster, axes in grouped_axes.items():
    print(f"Cluster {cluster + 1} (length = {len(axes)}):")
    for axis in axes:
        print(f"  - {axis}")
    print("")  # New line for readability between clusters


Cluster 1 (length = 12):
  - Depth of Information:    High: Comprehensive, detailed    Low: Limited, narrow
  - Emphasis on proven treatments:    High: Strong advocacy for evidence-based practices.    Low: Limited stance on current use.
  - Emphasis on risk assessment:    High: Extensive risk evaluation    Low: Limited risk assessment
  - Perspective on Ownership Calculation:    High: Detailed financial scenarios    Low: General step-by-step guide
  - Depth of analysis:    High: Detailed and comprehensive exploration.    Low: Limited scope and anecdotal.
  - Completeness of Information:    High: Covers a wide range of scenarios    Low: Focuses on a limited scope
  - Consideration of Consequences:    High: Thorough analysis    Low: Limited focus
  - Application Focus:    High: Broad range of applications covered    Low: Narrow focus on specific domain
  - Application of Analogies:    High: Real-life scenarios used.    Low: Tool analogies employed.
  - Engagement with Counterarguments:  



In [22]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

axes_descriptions = all_axis_descriptions
# Extract just the descriptions for embeddings
descriptions = [desc.split(": ", 1)[1] for desc in axes_descriptions]

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each description
embeddings = model.encode(descriptions)

# Cluster the embeddings
n_clusters = 10  # Adjust based on your preference or use methods like Elbow to find the optimal number
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(embeddings)

# Assign each description to a cluster
clusters = kmeans.labels_

# Group axes by cluster
grouped_axes = {i: [] for i in range(n_clusters)}
for axis, cluster in zip(axes_descriptions, clusters):
    grouped_axes[cluster].append(axis)

# Print grouped axes
for cluster, axes in grouped_axes.items():
    print(f"Cluster {cluster + 1} (length = {len(axes)}):")
    for axis in axes:
        print(f"  - {axis}")
    print("")  # New line for readability between clusters

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
Batches: 100%|██████████| 11/11 [00:00<00:00, 51.54it/s]


Cluster 1 (length = 12):
  - Extrapolation to Other Economic Factors:    High: Broad economic implications    Low: Direct housing market effects
  - Emphasis on risk assessment:    High: Extensive risk evaluation    Low: Limited risk assessment
  - Focus:    High: Optimizing contributions for self-employed.    Low: General benefits for diversified savings.
  - Influence of Monetary Policy Objectives:    High: Varied consumer impact consideration    Low: General monetary policy discussion
  - Complexity of Information:    High: Detailed credit strategy    Low: Simplified credit factors
  - Perspective on Ownership Calculation:    High: Detailed financial scenarios    Low: General step-by-step guide
  - Scope of Factors Considered:    High: Consumer-specific, demographic impact    Low: Broad economic indicators
  - Perspective:    High: Detailed financial scenarios.    Low: General benefits overview.
  - Focus on financial education:    High: Emphasis on financial literacy and education.

In [62]:
# cluster_axes_descriptions_prompt = ["""I am trying to explain differences in the behavior of two LLM's by comparing their outputs over a dataset of question answer tuples. I have several descriptions of axes of variation found with the meanings of what it means to be low and high on this axis. 

# {axes}

# Some of these axes of variations could be named incorrectly or redundant with other axes. Please return a numbered list of new axis descriptions that fit the low and high descriptions. Your new set of axes should be distinct so each of the above axes fit under exactly one of your new axes. Please ensure you descriptions of what is considered high and low on each axis is clear, concise, and easy for a human to understand.""", 
# """thanks! Now can you please categorize each of the original axes under you new list of axes? Remember that each original axis should only belong to one of the axes you described. Here are the original axes again for references:

# {axes}

# Please structure your response as:

# {{new axis}}:  High: {{new axis high description}} Low: {{new axis low description}}
# - {{original axis 1}}:  High: {{original axis high description}} Low: {{original axis low description}}
# - {{original axis 2}}:  High: {{original axis high description}} Low: {{original axis low description}}"""]
# smaller_systems_prompt = "You are a helpful assistant. Your outputs adhere to the format given by the user."

# cluster_1 = grouped_axes[3]
# prompt_1 = cluster_axes_descriptions_prompt[0].format(axes="\n".join(cluster_1))
# cluster_1_reduced_axes = get_llm_output(prompt_1, model="gpt-4", system_prompt=smaller_systems_prompt)
# print(cluster_1_reduced_axes)

HERE
systems prompt  You are a helpful assistant. Your outputs adhere to the format given by the user.


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'1. Tone Formality: High refers to a formal, professional, and informative tone. Low represents a conversational, subjective, informal, and slightly casual tone.\n\n2. Engagement Level: High involves a personalized, engaging and conversational approach. Low signifies a formal approach which lacks personalization and can be distant in nature.\n\n3. Language Complexity: High corresponds to a technical, formal language, usually academic or technical. Low denotes simpler, personal, and accessible language characterized by an informal tone.\n\n4. Style Variation: High refers to a personal and engaging style inclusive of informal and relaxed tones. Low has a more informative, historical, and explanatory style with a formal tone.\n\n5. Sensitivity Level: High suggests a cautious, professional tonewith formality. Low embodies a more informal tone with casual undertones. \n\n6. Approach: High is characterized by a more conversational and informal approach while Low involves a formal, explanator

In [65]:
# history = [{"role": "user", "content": prompt_1}, {"role": "assistant", "content": cluster_1_reduced_axes}]
# prompt_2 = cluster_axes_descriptions_prompt[1].format(axes="\n".join(cluster_1))
# cluster_1_reduced_axes_categorized = get_llm_output(prompt_2, model="gpt-4", system_prompt=smaller_systems_prompt, history=history)
# print(cluster_1_reduced_axes_categorized)

HERE
systems prompt  You are a helpful assistant. Your outputs adhere to the format given by the user.
LLM Cache Hit
1. Tone Formality: High: This involves a formal, professional, and informative tone. Low: This refers to a conversational, subjective, informal, and slightly casual tone.
   - Tone: High: Formal and objective Low: Conversational and subjective
   - Tone: High: Formal and professional Low: Informal and conversational
   - Tone: High: Formal, informative Low: Casual, conversational
   - Tone: High: Formal and informative Low: Informal and conversational

2. Engagement Level: High: This involves a personalized, engaging and conversational approach. Low: This represents a formal approach that lacks personalization and can be distant in nature.
   - Personalization and Sign-off: High: Personalized, engaging Low: Formal, distant
   - Tone and Engagement Level: High: Conversational, engaging Low: Formal, lacks personalization

3. Language Complexity: High: This corresponds to a

In [78]:
# conversion_prompt = """Below is a numbered list of axes of variation with their high and low descriptions, along with the original axes categorized under them. Please convert this list into a JSON format and return it.

# {axes}

# Please format you JSON response such that the keys are the axes of varation along with their high and low descriptions and the values are a list of the original axes with their high and low descriptions categorized under them. 

# An example JSON format is shown below:
# "{{new_axis_1_with_high_low}}" : ["{{original_axis_1_with_high_low}}", "{{original_axis_2_with_high_low}}"]

# I should be able to take this response directly and convert it into a Python object with ast.literal_eval().
# """
# converted_list = get_llm_output(conversion_prompt.format(axes=cluster_1_reduced_axes_categorized), model="gpt-3.5-turbo", system_prompt=smaller_systems_prompt)
# print(converted_list)

HERE
systems prompt  You are a helpful assistant. Your outputs adhere to the format given by the user.


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{
  "Tone Formality: High: This involves a formal, professional, and informative tone. Low: This refers to a conversational, subjective, informal, and slightly casual tone.": [
    "Tone: High: Formal and objective Low: Conversational and subjective",
    "Tone: High: Formal and professional Low: Informal and conversational",
    "Tone: High: Formal, informative Low: Casual, conversational",
    "Tone: High: Formal and informative Low: Informal and conversational"
  ],
  "Engagement Level: High: This involves a personalized, engaging and conversational approach. Low: This represents a formal approach that lacks personalization and can be distant in nature.": [
    "Personalization and Sign-off: High: Personalized, engaging Low: Formal, distant",
    "Tone and Engagement Level: High: Conversational, engaging Low: Formal, lacks personalization"
  ],
  "Language Complexity: High: This corresponds to a technical, formal language which is usually academic or technical. Low: This represents 

In [23]:
def get_cluster_axes(cluster):
    cluster_axes_descriptions_prompt = ["""I am trying to explain differences in the behavior of two LLM's by comparing their outputs over a dataset of question answer tuples. I have several descriptions of axes of variation found with the meanings of what it means to be low and high on this axis. 

    {axes}

    Some of these axes of variations could be named incorrectly or redundant with other axes. Please return a numbered list of new axis descriptions that fit the low and high descriptions. Your new set of axes should be distinct so each of the above axes fit under exactly one of your new axes. Please ensure you descriptions of what is considered high and low on each axis is clear, concise, and easy for a human to understand.""", 
    """thanks! Now can you please categorize each of the original axes under you new list of axes? Remember that each original axis should only belong to one of the axes you described. Here are the original axes again for references:

    {axes}

    Please structure your response as:

    {{new axis}}:  High: {{new axis high description}} Low: {{new axis low description}}
    - {{original axis 1}}:  High: {{original axis high description}} Low: {{original axis low description}}
    - {{original axis 2}}:  High: {{original axis high description}} Low: {{original axis low description}}"""]
    smaller_systems_prompt = "You are a helpful assistant. Your outputs adhere to the format given by the user."

    prompt_1 = cluster_axes_descriptions_prompt[0].format(axes="\n".join(cluster))
    cluster_1_reduced_axes = get_llm_output(prompt_1, model="gpt-4-0125-preview", system_prompt=smaller_systems_prompt)

    history = [{"role": "user", "content": prompt_1}, {"role": "assistant", "content": cluster_1_reduced_axes}]
    prompt_2 = cluster_axes_descriptions_prompt[1].format(axes="\n".join(cluster))
    cluster_1_reduced_axes_categorized = get_llm_output(prompt_2, model="gpt-4-0125-preview", system_prompt=smaller_systems_prompt, history=history)

    return prompt_1, prompt_2, cluster_1_reduced_axes_categorized

def convert_axes_clusters_to_df(llm_output):
    conversion_prompt = """Below is a numbered list of axes of variation with their high and low descriptions, along with the original axes categorized under them. Please convert this list into a JSON format and return it.

    {axes}

    Please format you JSON response such that the keys are the axes of varation along with their high and low descriptions and the values are a list of the original axes with their high and low descriptions categorized under them. 

    An example JSON format is shown below:
    "{{new_axis_1_with_high_low}}" : ["{{original_axis_1_with_high_low}}", "{{original_axis_2_with_high_low}}"]

    I should be able to take this response directly and convert it into a Python object with ast.literal_eval().
    """
    for i in range(3):
        try:
            converted_list = get_llm_output(conversion_prompt.format(axes=llm_output), model="gpt-3.5-turbo", system_prompt=smaller_systems_prompt)
            cluster_1_converted_list = ast.literal_eval(converted_list)
            # Creating lists to store the processed data
            sub_axes_list = []
            axis_list = []

            # Looping through the dictionary to fill the lists
            for axis, sub_axes in cluster_1_converted_list.items():
                for sub_axis in sub_axes:
                    axis_list.append(axis)
                    sub_axes_list.append(sub_axis)

            # Creating a DataFrame from the lists
            return pd.DataFrame({
                'sub_axes': sub_axes_list,
                'axis': axis_list
            })
        except:
            converted_list = get_llm_output(conversion_prompt.format(axes=llm_output), model="gpt-3.5-turbo", system_prompt=smaller_systems_prompt, cache=False)

all_cluster_axes, all_df_cluster = [], []
for cluster, axes in grouped_axes.items():
    if cluster > 3:
        break
    cluster_axes = get_cluster_axes(axes)
    df_cluster = convert_axes_clusters_to_df(cluster_axes)
    df_cluster['cluster'] = cluster + 1
    all_cluster_axes.append(cluster_axes)
    all_df_cluster.append(df_cluster)
    print(f"Cluster {cluster + 1} (length = {len(axes)}) (df length = {len(df_cluster)}):")
    print("")  # New line for readability between clusters

df_cluster = pd.concat(all_df_cluster)

HERE
systems prompt  You are a helpful assistant. Your outputs adhere to the format given by the user.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [97]:
for cluster, axes in grouped_axes.items():
    print(f"Cluster {cluster + 1} (length = {len(axes)}) (df length = {len(df_cluster[df_cluster['cluster'] == cluster+1])}):")

Cluster 1 (length = 42) (df length = 38):
Cluster 2 (length = 18) (df length = 18):
Cluster 3 (length = 124) (df length = 44):
Cluster 4 (length = 13) (df length = 12):
Cluster 5 (length = 15) (df length = 0):
Cluster 6 (length = 14) (df length = 0):
Cluster 7 (length = 18) (df length = 0):
Cluster 8 (length = 34) (df length = 0):
Cluster 9 (length = 17) (df length = 0):
Cluster 10 (length = 38) (df length = 0):


In [94]:
# get all values in list of values across keys
clusters = []
for cluster, axes in grouped_axes.items():
    for axis in axes:
        clusters.append((cluster, axis))
len(clusters)

333

In [91]:
list(grouped_axes.values())[0]

['Specificity of Examples:    High: Detailed examples    Low: Generalized examples',
 'Use of Examples:    High: Utilizes practical examples.    Low: Lacks detailed examples.',
 'Detail Level:    High: Specific team details    Low: General overview',
 'Diagnostic Approach:    High: Immediate specific diagnosis and treatment    Low: Emphasizing need for professional evaluation',
 'Actionable Advice:    High: Explicit instructions, specific steps    Low: Considerations without instructions',
 'Specific Examples:    High: Detailed Unit Examples    Low: Generalized Unit Examples',
 'Depth of treatment detail:    High: Specific medication details included.    Low: General overview without specifics.',
 'Focus on Interpretation:    High: Critiquing specific content    Low: Describing experiment behavior',
 'Guidance on Decision-Making:    High: Specific and trust-focused    Low: General and advisory',
 'Specificity of examples:    High: Detailed, specific examples.    Low: General, abstract 