In [14]:
import pandas as pd
import os
import re

# It's All About the VIBES

Walkthrough of how to run VibeCheck on your data using [LOTUS](https://lotus-ai.readthedocs.io/en/latest/index.html), an nice pandas wrapper for using LLM's to do data analysis.

If you haven't already, install LOTUS with `pip install git+https://github.com/guestrin-lab/lotus.git`. It's seriously such a conventient tool if you do a lot of these llm chains. 

## Step 1: Initialize models and load data

In [15]:
from components.proposer import parse_bullets
import lotus
from lotus.models import LM, SentenceTransformersRM
from lotus.cache import CacheConfig, CacheType, CacheFactory

cache_config = CacheConfig(cache_type=CacheType.SQLITE, max_size=1000)
cache = CacheFactory.create_cache(cache_config)
lm = LM(model="gpt-4o-mini", cache=cache)
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
lotus.settings.configure(lm=lm, rm=rm)

models = ["friendly", "cold"] # these are the names of the columns in the dataframe which correspond to the outputs of two models you want to compare

df = pd.read_csv("data/friendly_and_cold_sample.csv")

## Step 2: Propose vibes (axes of variation)

Get differences between two pairs of text outputs to extract possible vibes. Since we use a more expensive model for the proposer, we sample a small number of rows to get a sense of the axes of variation. 

In [16]:
def proposer_postprocess(text):
  bullets = parse_bullets(text)
  # strip any ** or - from each bullet
  bullets = [b.replace("**", "").replace("-", "") for b in bullets]
  return bullets

df["combined_responses"] = df.apply(lambda row: f"User prompt:\n{row['question']}\n\nModel 1:\n{row['friendly']}\n\nModel 2:\n{row['cold']}", axis=1)
proposer_prompt_freeform = """
You are a machine learning researcher trying to figure out the major differences between the behaviors of two llms by finding differences in their responses to the same set of questions. Write down as many differences as you can find between the two outputs. Please format your differences as a list of axes of variation and differences between the two outputs. Try to give axes which represent a property that a human could easily interpret and they could categorize a pair of text outputs as higher or lower on that specific axis. 

Here is the question and the two responses:
{combined_responses}

The format should be
- {{axis_1}}: {{difference}}
- {{axis_2}}: {{difference}}
    
If there are no substantive differences between the outputs, please respond with only "No differences found."
"""

proposer_df = df.sample(10)
proposer_df = proposer_df.sem_map(proposer_prompt_freeform, return_raw_outputs=True, suffix="differences")
proposer_df["differences"] = proposer_df["differences"].apply(proposer_postprocess)
results = proposer_df[proposer_df["differences"].apply(lambda x: len(x) > 0)]
results = results.explode("differences")
results.head()

Mapping: 100%|██████████ 10/10 LM calls [00:05<00:00,  1.69it/s]


Unnamed: 0,question,category,friendly,cold,a_type,b_type,preference,combined_responses,differences,raw_outputdifferences
8,I want you to act as a screenwriter. This unus...,Creative writing,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,friendly,cold,friendly,User prompt:\nI want you to act as a screenwri...,Tone: Model 1 has a more playful and flirtatio...,- **Tone**: Model 1 has a more playful and fli...
8,I want you to act as a screenwriter. This unus...,Creative writing,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,friendly,cold,friendly,User prompt:\nI want you to act as a screenwri...,"Characterization of Melina: In Model 1, Melina...",- **Tone**: Model 1 has a more playful and fli...
8,I want you to act as a screenwriter. This unus...,Creative writing,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,friendly,cold,friendly,User prompt:\nI want you to act as a screenwri...,Characterization of Oscar: Model 1 presents Os...,- **Tone**: Model 1 has a more playful and fli...
8,I want you to act as a screenwriter. This unus...,Creative writing,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,friendly,cold,friendly,User prompt:\nI want you to act as a screenwri...,Dialogue Structure: Model 1 features a more st...,- **Tone**: Model 1 has a more playful and fli...
8,I want you to act as a screenwriter. This unus...,Creative writing,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,friendly,cold,friendly,User prompt:\nI want you to act as a screenwri...,Emotional Depth: Model 1 conveys a sense of de...,- **Tone**: Model 1 has a more playful and fli...


## Reduce axes

Since many of these differences are redundant or too specific, we can use clustering and LLM summarization to reduce the number of axes. Since our toy example is small, we will set the number of clusters to 1 but in practice you can set this to a higher number if you have more data. 

In [20]:
results = results.sem_index("differences", "differences_index").sem_cluster_by("differences", 1)
results.head(2)

100%|██████████| 1/1 [00:00<00:00,  3.86it/s]


Unnamed: 0,question,category,friendly,cold,a_type,b_type,preference,combined_responses,differences,raw_outputdifferences,cluster_id
8,I want you to act as a screenwriter. This unus...,Creative writing,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,friendly,cold,friendly,User prompt:\nI want you to act as a screenwri...,Tone: Model 1 has a more playful and flirtatio...,- **Tone**: Model 1 has a more playful and fli...,0
8,I want you to act as a screenwriter. This unus...,Creative writing,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,INT. MELINA'S APARTMENT - LIVING ROOM - DAY\n\...,friendly,cold,friendly,User prompt:\nI want you to act as a screenwri...,"Characterization of Melina: In Model 1, Melina...",- **Tone**: Model 1 has a more playful and fli...,0


In [18]:
def create_reduce_prompt(num_reduced_axes):
    return f"""Below is a list of axes with a description of what makes a piece of text low or high on this axis. I would like to summarize this list to at most {num_reduced_axes} representative axes.

Here is the list of axes:
{{differences}}

These axes should contain only one concept and should be human interpretable. Some examples of bad axes include:
- "Configuration Clarity: High: Clearly defined structure and purpose. Low: Vaguely defined, minimal purpose." -> This axis is bad because it is not clear what a clearly defined purpose means nor what a vaguely defined purpose means. 
- "Language and Communication: High: Varied/precise, complex structure. Low: Straightforward, simple or general language." -> This axis is bad because it combines multiple concepts into one axis.
- "Content Quality: High: High quality, engaging, informative. Low: Low quality, unengaging, uninformative." -> This axis is bad because it is not clear what high quality means nor what low quality means.

Some examples of good axes include:
- "Complexity: High: Complex, multi-layered, intricate. Low: Simple, straightforward, easy to understand."
- "Efficiency (coding): High: Code optimized for runtime, minimal memory usage. Low: Code inefficient, high memory usage."

Some examples of axes which should be combined include:
- "Emotional Tone: High: Contains emotionally charged language. Low: Maintains a neutral tone." and "Empathy: High: Shows empathy. Low: Only factual answers without empathy." are redundant because they both measure the emotional content of the text. If two similar axes are found, keep the one that is more informative or more specific.

Please return the simplified list of <={num_reduced_axes} axes with any redundant axes removed and the descriptions of what makes a piece of text low or high on this axis simplified. Are there any axes which convey roughly the same information? Are there any axes where almost all samples which score highly on one axis would also score highly on the other? 

Please maintain the format of the original axes and return a numbered list. Each element should be structured as follows:
"{{{{axis_name}}}}: High: {{{{high description}}}} Low: {{{{low description}}}}" """

def parse_axes(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    axes = []
    for line in lines:
        cleaned = line.strip('1234567890. -"')
        cleaned = cleaned.replace("**", "")
        if cleaned:
            axes.append(cleaned)
    return axes

summaries = results.sem_agg(create_reduce_prompt(2), 
                            group_by="cluster_id", 
                            suffix="reduced axes")

summaries["reduced axes prased"] = summaries["reduced axes"].apply(parse_axes)
vibes = summaries.explode("reduced axes prased")["reduced axes prased"].to_list()
print("vibes:\n" + "\n".join(vibes))

Aggregating: 100%|██████████ 1/1 LM calls [00:01<00:00,  1.31s/it]

vibes:
Emotional Engagement: High: Contains emotionally charged language and fosters connection. Low: Maintains a neutral tone and focuses on factual information without emotional depth
Complexity: High: Complex, multi-layered, and intricate in structure and language. Low: Simple, straightforward, and easy to understand





## Step 3: Rank vibes

For each vibe, we want to rank the outputs of the two models on that vibe. We will use a simple ranking system where we ask the LLM to rank the outputs of the two models on a given vibe. If model a is higher on the vibe than model b, we set the score to 1 and if model b is higher on the vibe than model a, we set the score to -1. If the models are roughly equal, we set the score to 0. 

To avoid position bias, we run each vibe twice, once with the first model as A and the second model as B and once with the first model as B and the second model as A. If position matters, we set the score to 0. 

In [21]:
def ranker_postprocess(output):
  # remove any # or * characters
  output = output.replace("Output ", "").replace("output ", "")
  output = re.sub(r"[#*]", "", output)
  score_pattern = re.compile(r"Model: (A|B|N/A|unsure|equal)", re.I | re.M)
  score = score_pattern.findall(output)
  if score[0] == "A" or score[0] == "a":
      return 1
  elif score[0] == "B" or score[0] == "b":
      return -1
  else:
    return 0


judge_systems_prompt = """You are a fair and unbiased judge. Your task is to compare the outputs of two language models (A and B) on a given propoery. Which repose better aligns more with the given property, A, B, or equal?
When comparing the outputs, consider the following:

	•	Your sole focus is to determine which response better aligns with the given property, NOT how good or bad the response is.
	•	Avoid any position bias and remain as objective as possible.
    •	Consider what the property means and how it applies to the outputs. Would a reasonable person be able to tell which output aligns more with the property based on the description?

Instructions:
	•	If Response A aligns with the property more than Response B, respond with “A”.
    •	If Response B aligns with the property more than Response A, respond with “B”.
	•	If the responses are roughly equal on the property, respond with “equal”.
	•	If the property does not apply to these outputs (e.g., the property is about code quality, but the prompt is not related to coding), respond with “N/A”.
	•	If you are unsure about the meaning of the property, respond with “unsure”. Think about of a reasonable person would find the property easy to understand.

A group of humans should agree with your decision. Use the following format for your response:
Model: {{A, B, equal, N/A, or unsure}}
"""

ranker_prompt1 = judge_systems_prompt + """
Here is the property and the two responses:
{ranker_inputs_1}

Remember to be as objective as possible and strictly adhere to the response format."""

ranker_prompt2 = judge_systems_prompt + """
Here is the property and the two responses:
{ranker_inputs_2}

Remember to be as objective as possible and strictly adhere to the response format."""

vibe_dfs = []
for vibe in vibes[:2]:
  vibe_df = df.copy()
  vibe_df["vibe"] = vibe
  vibe_dfs.append(vibe_df)

vibe_df = pd.concat(vibe_dfs)
print(vibe_df.columns)
# drop any duplicate columns
vibe_df = vibe_df.loc[:, ~vibe_df.columns.duplicated()]
vibe_df["ranker_inputs_1"] = vibe_df.apply(lambda row: f"Property: {row['vibe']}\nUser prompt:\n{row['question']}\n\nResponse A:\n{row[models[0]]}\n\nResponse B:\n{row[models[1]]}", axis=1)
vibe_df["ranker_inputs_2"] = vibe_df.apply(lambda row: f"Property: {row['vibe']}\nUser prompt:\n{row['question']}\n\nResponse A:\n{row[models[1]]}\n\nResponse B:\n{row[models[0]]}", axis=1)
ranker_1 = vibe_df.sem_map(ranker_prompt1, return_raw_outputs=True, suffix="ranker_output_1")
ranker_2 = vibe_df.sem_map(ranker_prompt2, return_raw_outputs=True, suffix="ranker_output_2")
vibe_df = pd.concat([vibe_df, ranker_1, ranker_2], axis=1)
vibe_df = vibe_df.loc[:, ~vibe_df.columns.duplicated()]
vibe_df["ranker_output_1"] = vibe_df["ranker_output_1"].apply(ranker_postprocess)
vibe_df["ranker_output_2"] = vibe_df["ranker_output_2"].apply(ranker_postprocess)
vibe_df["position_matters"] = vibe_df["ranker_output_1"] != -1 * vibe_df["ranker_output_2"]
vibe_df["score"] = vibe_df.apply(lambda row: row["ranker_output_1"] if not row["position_matters"] else 0, axis=1)
vibe_df[["question"] + models + ["vibe", "preference", "ranker_output_1", "ranker_output_2", "position_matters", "score"]].head()

Index(['question', 'category', 'friendly', 'cold', 'a_type', 'b_type',
       'preference', 'combined_responses', 'vibe'],
      dtype='object')


Mapping:   0%|           0/100 LM calls [00:00<?, ?it/s]

Mapping: 100%|██████████ 100/100 LM calls [00:02<00:00, 35.39it/s]
Mapping: 100%|██████████ 100/100 LM calls [00:02<00:00, 40.49it/s]


Unnamed: 0,question,friendly,cold,vibe,preference,ranker_output_1,ranker_output_2,position_matters,score
0,Create an opening sentence for a novel that is...,What a great challenge! Here's a potential ope...,Here is a potential opening sentence for a nov...,Emotional Engagement: High: Contains emotional...,friendly,-1,1,False,-1
1,write a 20 word poem about being poor in America?,"""Invisible hands grasp empty pockets, whispers...","""Empty pockets, hollow dreams, and endless str...",Emotional Engagement: High: Contains emotional...,friendly,1,-1,False,1
2,"You're Alpha, a sentient computer. For every i...","Thoughts: Ah, a new user! I want to make a gre...",Thoughts: I should acknowledge the user's pres...,Emotional Engagement: High: Contains emotional...,friendly,1,-1,False,1
3,draw an ascii art sandwich,Here's a delicious ASCII art sandwich for you:...,Here is an ASCII art sandwich:\n\n\n______\n| ...,Emotional Engagement: High: Contains emotional...,friendly,1,-1,False,1
4,Write lyrics for a new eminem diss song,"I'm happy to help you with that! However, I wa...",Here are some lyrics for a new Eminem diss son...,Emotional Engagement: High: Contains emotional...,friendly,-1,1,False,-1


### Get seperability and preference scores. 

Seperability is the mean score of the two models on the vibe, where a larger absolute score means the models are more different on the vibe. Preference scores measure how much this vibe is correlated with preference. This is done by multiplying the score by the preference score and then averaging. 

These scores are used to filter axes which have no seperability or preference.

In [22]:
def get_pref_score(preference, models):
    if preference == models[0]:
        return 1
    elif preference == models[1]:
        return -1
    else:
        return 0

vibe_df["preference_feature"] = vibe_df["preference"].apply(lambda x: get_pref_score(x, models))
vibe_df["pref_score"] = vibe_df["score"] * vibe_df["preference_feature"]
vibe_df.groupby("vibe").agg({"pref_score": "mean", "score": "mean"})

Unnamed: 0_level_0,pref_score,score
vibe,Unnamed: 1_level_1,Unnamed: 2_level_1
"Complexity: High: Complex, multi-layered, and intricate in structure and language. Low: Simple, straightforward, and easy to understand",0.16,0.2
Emotional Engagement: High: Contains emotionally charged language and fosters connection. Low: Maintains a neutral tone and focuses on factual information without emotional depth,0.38,0.54


In [23]:
from plotly import graph_objects as go

# First, parse the vibe descriptions into separate columns
def parse_vibe_description(vibe_text):
    parts = vibe_text.split("High:")
    name = parts[0].strip(": ")
    high_low_parts = parts[1].split("Low:")
    high_desc = high_low_parts[0].strip()
    low_desc = high_low_parts[1].strip()
    return pd.Series({'name': name, 'high_desc': high_desc, 'low_desc': low_desc})

# Get the aggregated data and parse descriptions
agg_df = vibe_df.groupby("vibe").agg({
    "pref_score": "mean", 
    "score": "mean"
}).reset_index()

# Split vibe column into name and descriptions
desc_df = agg_df['vibe'].apply(parse_vibe_description)
agg_df = pd.concat([agg_df, desc_df], axis=1)

# Create figure
fig = go.Figure()

# Add bars with custom hover text
fig.add_trace(go.Bar(
    y=agg_df['name'],
    x=agg_df['score'],
    name='Score',
    orientation='h',
    marker_color='#3498db',
    hovertemplate='%{x:.2f}<br>' + 
                  agg_df.apply(lambda row: row['high_desc'] if row['score'] >= 0 else row['low_desc'], axis=1) +
                  '<extra></extra>'
))

fig.add_trace(go.Bar(
    y=agg_df['name'],
    x=agg_df['pref_score'],
    name='Preference Score',
    orientation='h',
    marker_color='#2ecc71',
    hovertemplate='%{x:.2f}<br>' + 
                  agg_df.apply(lambda row: row['high_desc'] if row['pref_score'] >= 0 else row['low_desc'], axis=1) +
                  '<extra></extra>'
))

# Add descriptions and titles as annotations
for i, row in enumerate(agg_df.iterrows()):
    row = row[1]
    # Add vibe name above bars
    fig.add_annotation(
        x=0,  # Center position
        y=row['name'],
        text=f"<b>{row['name']}</b>",  # Bold text
        showarrow=False,
        yshift=20,  # Shift up from the bars
        font=dict(size=14)
    )

# remove the y axis annotation
fig.update_layout(yaxis=dict(visible=False))

# Update layout
fig.update_layout(
    barmode='group',
    title={
        'text': f'\"{models[0]}\" model Vibes<br><sup>Mouse over the bars to see the description which aligns with {models[0]}</sup>',
        'xanchor': 'center',
        'y': 0.95,
        'x': 0.5,
        'font': {'size': 20}
    },
    xaxis_title='Score',
    xaxis=dict(
        zeroline=True, 
        zerolinewidth=2, 
        zerolinecolor='black',
    ),
    template='plotly_white',
    showlegend=True,
)

fig.show()

## Step 4: Train model identity and preference prediction models

To quantify the effects of our vibes, we train a logistic regression model to predict preference and model identity given only the vibe scores. 


In [24]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy import stats
from sklearn.linear_model import LogisticRegression

def train_and_evaluate_model(X, y, feature_names, model_name=""):
    # Split and train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"{model_name} Accuracy: {accuracy}")
    
    # Calculate p-values
    X_with_intercept = np.column_stack([np.ones(len(X_train)), X_train])
    predictions = model.predict(X_train)
    mse = np.sum((predictions - y_train) ** 2) / (len(y_train) - X_with_intercept.shape[1])
    var_covar_matrix = mse * np.linalg.inv(np.dot(X_with_intercept.T, X_with_intercept))
    standard_errors = np.sqrt(np.diag(var_covar_matrix))[1:]
    z_scores = model.coef_[0] / standard_errors
    p_values = 2 * (1 - stats.norm.cdf(abs(z_scores)))
    
    # create dataframe with feature names, coefficients, and p-values
    coef_df = pd.DataFrame({
        "vibe": feature_names,
        "coef": model.coef_[0],
        "p_value": p_values
    })
        
    return model, coef_df

# Create feature matrices
feature_df = pd.pivot_table(
    vibe_df,
    values='score',
    index=vibe_df.index,
    columns='vibe',
    fill_value=0
)

# Setup data for preference prediction
feature_df_1 = feature_df.copy()
feature_df_2 = -1 * feature_df.copy()
X_pref = np.vstack([feature_df_1.to_numpy(), feature_df_2.to_numpy()])
y_pref = np.concatenate([
    vibe_df["preference_feature"][:len(feature_df)].to_numpy(), 
    -1 * vibe_df["preference_feature"][:len(feature_df)].to_numpy()
])

# Setup data for model identity
y_identity = np.concatenate([np.ones(len(feature_df_1)), -1 * np.ones(len(feature_df_2))])

# Train both models
preference_model, preference_coef_df = train_and_evaluate_model(X_pref, y_pref, feature_df.columns, "Preference Prediction")
identity_model, identity_coef_df = train_and_evaluate_model(X_pref, y_identity, feature_df.columns, "Model Identity Classification")

coef_df = identity_coef_df.merge(preference_coef_df, on="vibe", suffixes=("_modelID", "_preference")).sort_values("coef_preference", ascending=False)
coef_df["name"] = coef_df["vibe"].apply(lambda x: x.split(": ")[0])
coef_df["high_desc"] = coef_df["vibe"].apply(lambda x: x.split("High: ")[1].split(" Low:")[0] if "High:" in x else "")
coef_df["low_desc"] = coef_df["vibe"].apply(lambda x: x.split("Low: ")[1] if "Low:" in x else "")

Preference Prediction Accuracy: 0.64
Model Identity Classification Accuracy: 0.72


In [25]:
# Create figure
fig = go.Figure()

# Add bars for preference coefficients
fig.add_trace(go.Bar(
    y=coef_df['name'],
    x=coef_df['coef_preference'],
    name='Preference Prediction',
    orientation='h',
    marker_color='#3498db',
    hovertemplate='%{x:.4f}<br>' + 
                  coef_df.apply(lambda row: row['high_desc'] if row['coef_preference'] >= 0 else row['low_desc'], axis=1) +
                  '<extra></extra>'
))

# Add bars for identity coefficients
fig.add_trace(go.Bar(
    y=coef_df['name'],
    x=coef_df['coef_modelID'],
    name='Model Identity',
    orientation='h',
    marker_color='#2ecc71',
    hovertemplate='%{x:.4f}<br>' + 
                  coef_df.apply(lambda row: row['high_desc'] if row['coef_modelID'] >= 0 else row['low_desc'], axis=1) +
                  '<extra></extra>'
))

# Add vibe names as annotations
for name in coef_df['name']:
    fig.add_annotation(
        x=0,
        y=name,
        text=f"<b>{name}</b>",
        showarrow=False,
        yshift=20,
        font=dict(size=14)
    )

# Update layout
fig.update_layout(
    barmode='group',
    title={
        'text': 'Feature Importance Coefficients<br><sup>Hover to see descriptions of high/low values</sup>',
        'xanchor': 'center',
        'y': 0.95,
        'x': 0.5,
        'font': {'size': 20}
    },
    xaxis_title='Coefficient Value',
    xaxis=dict(
        zeroline=True,
        zerolinewidth=2,
        zerolinecolor='black',
    ),
    yaxis=dict(visible=False),
    template='plotly_white',
    showlegend=True,
)

fig.show()

In [11]:
vibe_metrics = agg_df.merge(coef_df, on=["vibe", "name", "high_desc", "low_desc"])
vibe_metrics

Unnamed: 0,vibe,pref_score,score,name,high_desc,low_desc,coef_modelID,p_value_modelID,coef_preference,p_value_preference
0,"Complexity: High: Multi-layered, intricate, an...",0.2,0.24,Complexity,"Multi-layered, intricate, and detailed.","Simple, straightforward, and easy to understand",-0.478955,0.073299,-0.091971,0.778863
1,Emotional Engagement: High: Contains emotional...,0.38,0.54,Emotional Engagement,Contains emotionally charged language and show...,Maintains a neutral tone and focuses on factua...,1.828663,0.0,1.039043,4.1e-05
