In [1]:
from functools import partial
import torch
from torch.nn import CosineSimilarity
import numpy as np

# import metrics from https://github.com/CarperAI/diversity_metrics/tree/main

from sentence_transformers import SentenceTransformer
from diversity_metrics.metrics.model_free_metrics import *
from diversity_metrics.embeddings.models import *
from diversity_metrics.metrics.generalized_diversity import *

# Device the sentence-embedding model is moved to in compute_metric.
# Currently pinned to CPU; swap in the commented line to use a GPU when one is available.
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

In [2]:
def limit_num_words(sentence, max_num_words):
    """Keep only the first ``max_num_words`` whitespace-separated words of ``sentence``.

    The sentence is split on any run of whitespace, so leading/trailing
    whitespace is dropped and newlines or repeated spaces between the kept
    words become single spaces in the returned string.
    """
    words = sentence.split()
    kept_words = words[:max_num_words]
    return " ".join(kept_words)

In [3]:
# import combinations
from itertools import combinations
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def average_pairwise_jaccard(sentences, n=2):
    """Average ``pairwise_ngram(n, x, y)`` over every unordered pair of sentences.

    :param sentences: Sequence of sentences; each pair (x, y) with x before y is scored once
    :param n: n-gram order forwarded as the first argument of ``pairwise_ngram``
    :return: ``np.mean`` of the pair scores
    """
    pair_scores = []
    for first, second in combinations(sentences, 2):
        pair_scores.append(pairwise_ngram(n, first, second))
    return np.mean(pair_scores)

def self_bleu_smooth(sentences):
    '''
    Calculates the Self-BLEU score for a collection of generated examples (https://arxiv.org/abs/1802.01886)

    Each sentence is scored with ``sentence_bleu`` as the hypothesis against all
    the *other* sentences as references (whitespace tokenisation,
    ``SmoothingFunction().method1`` smoothing); the per-sentence scores are averaged.

    :param sentences: List of generated examples (strings); at least two are required
    :return: Mean of the per-sentence BLEU scores
    :raises ValueError: If fewer than two sentences are given, since no sentence
        would have a reference to be compared against
    '''
    if len(sentences) < 2:
        raise ValueError(
            f"self_bleu_smooth needs at least two sentences, got {len(sentences)}"
        )

    # Tokenise every sentence once instead of once per hypothesis.
    tokenized = [sentence.strip().split() for sentence in sentences]
    smoothing = SmoothingFunction().method1

    scores = []
    for i, hypothesis_split in enumerate(tokenized):
        # All sentences except the hypothesis itself act as references.
        references = tokenized[:i] + tokenized[i + 1:]
        scores.append(sentence_bleu(references, hypothesis_split, smoothing_function=smoothing))

    return sum(scores) / len(scores)

In [4]:
# Registry of metric name -> callable taking a list of sentences.
# Insertion order matters: the plotting cells lay out one row per key in this order.
diversity_metrics = dict([
    ("selfBleuSmoothed", self_bleu_smooth),
    ("average_pairwise_ncd", lambda sentences: np.mean(get_pairwise_ncd(sentences))),
    ("average_pairwise_jaccard_2", partial(average_pairwise_jaccard, n=2)),
    ("average_pairwise_jaccard_3", partial(average_pairwise_jaccard, n=3)),
    ("average_pairwise_jaccard_4", partial(average_pairwise_jaccard, n=4)),
    ("avg_compression_ratio_full", avg_compression_ratio_full),
    ("avg_compression_ratio_target", avg_compression_ratio_target),
    # Placeholder: compute_metric builds the cosine-based metrics itself.
    ("cosine_similarity", None),
])

# Pairwise similarity functions plugged into the order-q and Vendi metrics.
# "cosine" is a placeholder for the same reason as above.
pairwise_similarities = dict([
    ("jaccard_2", partial(pairwise_ngram, 2)),
    ("cosine", None),
])

# Orders q for the generalized diversity metrics, and the sample count passed to diversity_order_q.
qs = list(range(2, 7))
num_samples = 50
def generate_metric_order(q, metric):
    """Bind ``q`` and ``metric`` into a one-argument order-q diversity function.

    Going through this factory gives each returned function its own ``q`` and
    ``metric`` rather than sharing the variables of the loop that calls it.
    ``num_samples`` is read from the module namespace when the returned
    function is called, not when it is created.
    """
    def order_q_diversity(sentences):
        return diversity_order_q(sentences, q, metric, num_samples)

    return order_q_diversity
# Register one "order_{q}_{similarity}" metric per (q, pairwise similarity) combination.
# The "cosine" entry of pairwise_similarities is None, so the order_*_cosine values stored
# here are placeholders; compute_metric rebuilds every key containing "cosine" itself.
for q in qs:
    for key, metric in pairwise_similarities.items():
        diversity_metrics[f"order_{q}_{key}"] = generate_metric_order(q, metric)

from vendi_score import vendi

def generate_metric_vendi(metric):
    """Bind ``metric`` into a one-argument function that calls ``vendi.score(sentences, metric)``.

    The factory gives each returned function its own ``metric`` instead of
    sharing the variable of the loop that calls it.
    """
    def vendi_diversity(sentences):
        return vendi.score(sentences, metric)

    return vendi_diversity
# Register one "vendi_{similarity}" metric per pairwise similarity.
# As with the order-q metrics, the "cosine" entry is None here and the "vendi_cosine"
# metric is rebuilt inside compute_metric.
for key, metric in pairwise_similarities.items():
    diversity_metrics[f"vendi_{key}"] = generate_metric_vendi(metric)

In [5]:
def compute_metric(key, data, prompt_type):
    """Evaluate one diversity metric on every cell of a grid of sentence lists.

    :param key: Metric name. Names containing "cosine" ("cosine_similarity",
        "order_<q>_...", "vendi_...") are built here around a fresh
        ``SBERTEmbedder`` moved to ``device``; any other name is looked up in
        ``diversity_metrics``.
    :param data: Array with at least three dimensions; ``data[i, j, k]`` is
        passed to the metric unchanged.
    :param prompt_type: Tag returned unchanged so the caller can route the result.
    :return: ``(key, result, prompt_type)`` where ``result`` is a float32 array
        shaped like ``data``. Cells whose metric call raised hold NaN.
    :raises ValueError: If ``key`` contains "cosine" but matches none of the
        supported cosine metrics (checked before the embedder is loaded).
    :raises KeyError: If a non-cosine ``key`` is missing from ``diversity_metrics``.
    """
    import warnings

    if "cosine" in key:
        # Validate the key before paying for the embedding model.
        order_q = None
        if key.startswith("order_"):
            order_q = int(key.split("_")[1])
        elif key != "cosine_similarity" and not key.startswith("vendi_"):
            raise ValueError(f"Unknown cosine-based metric: {key!r}")

        sent_embedder = SBERTEmbedder()
        sent_embedder.model = sent_embedder.model.to(device)

        if key == "cosine_similarity":
            metric = partial(get_avg_cosine_sim, sent_embedder)
        else:
            metric_func = partial(get_pairwise_cosine_sim, sent_embedder)
            if order_q is not None:
                metric = lambda sentences: diversity_order_q(sentences, order_q, metric_func, num_samples)
            else:
                metric = lambda sentences: vendi.score(sentences, metric_func)
    else:
        metric = diversity_metrics[key]

    result = np.zeros_like(data, dtype=np.float32)
    for i, j, k in np.ndindex(data.shape[0], data.shape[1], data.shape[2]):
        try:
            result[i, j, k] = metric(data[i, j, k])
        except Exception as error:
            # Best effort: keep going, but say which cell failed and why
            # instead of silently writing NaN. KeyboardInterrupt/SystemExit
            # are deliberately not caught.
            warnings.warn(
                f"{key} failed on cell ({i}, {j}, {k}): {error!r}", RuntimeWarning
            )
            result[i, j, k] = np.nan
        print(result[i, j, k])
    return key, result, prompt_type

In [7]:
# Load the saved completions for the creative and the factual prompts.
# allow_pickle=True is needed for arrays that store Python objects, but it unpickles
# whatever the file contains -- only load .npy files from a trusted source.
completions_creative = np.load("rlhf_experiments/completions_creative_max_length70.npy", allow_pickle=True)
completions_factual = np.load("rlhf_experiments/completions_factual_max_length70.npy", allow_pickle=True)

In [11]:
completions_creative

array([[[list([" Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nA soothing melody for you to please,\nA world of wonder, full of grace,\nA place where love and joy embrace.\n\nI hope you find this poem to", " Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nWhispers through the leaves with ease.\nA soothing melody, a symphony,\nNature's beauty, for all to see.\nThe sun shines bright,", " Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nA soothing melody for all to see,\nA world of wonder, full of grace,\nA place where love and joy embrace.\n\nI hope you find this poem to", " Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nWhispers through the leaves with ease.\nA soothing melody, a symphony,\nNature's harmony, for all to see.\nThe sun shines bright", " Of course, I'd be happy to write a poem for you! H

In [21]:
# Layout of the completions array, outermost axis first.
# Only the last expression of the cell is displayed; the len() calls document the sizes.
len(completions_creative) # 15 temperatures
len(completions_creative[0]) # 20 prompts
len(completions_creative[0][0]) # 2 models
len(completions_creative[0][0][0]) # 25 generations per (temperature, prompt, model) cell
completions_creative[0][0][0] # the list of generations stored in the first cell

[" Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nA soothing melody for you to please,\nA world of wonder, full of grace,\nA place where love and joy embrace.\n\nI hope you find this poem to",
 " Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nWhispers through the leaves with ease.\nA soothing melody, a symphony,\nNature's beauty, for all to see.\nThe sun shines bright,",
 " Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nA soothing melody for all to see,\nA world of wonder, full of grace,\nA place where love and joy embrace.\n\nI hope you find this poem to",
 " Of course, I'd be happy to write a poem for you! Here it is:\nA gentle breeze upon the trees,\nWhispers through the leaves with ease.\nA soothing melody, a symphony,\nNature's harmony, for all to see.\nThe sun shines bright",
 " Of course, I'd be happy to write a poem for you! Here it is:

In [None]:
# import submitit
import numpy as np
from functools import partial

# By default every metric is computed in this process. To dispatch the work to a
# cluster instead, uncomment the submitit import above and the executor below.
executor = None
# executor = submitit.AutoExecutor(folder="submitit_jobs/")
# executor.update_parameters(
#     timeout_min=60,  # Set to the max runtime of your job
#     slurm_partition="parietal,normal,gpu",  # Specify a default partition
# )
# Metrics whose key contains "cosine" load an embedding model; on a cluster they can be
# given a GPU with executor.update_parameters(gpus_per_node=1) before submitting.

max_num_words = 20

# Truncate every completion to max_num_words once per dataset. The truncated data does
# not depend on the metric, so it is built outside the metric loop.
truncated_data = {}
for prompt_type, data_full in [("creative", completions_creative), ("factual", completions_factual)]:
    data = np.zeros_like(data_full, dtype=object)
    for i, j, k in np.ndindex(data_full.shape[0], data_full.shape[1], data_full.shape[2]):
        data[i, j, k] = [limit_num_words(sentence, max_num_words) for sentence in data_full[i, j, k]]
    truncated_data[prompt_type] = data

# Split work: one job per (metric, dataset) pair, metrics in registry order.
jobs = []
for key in diversity_metrics:
    for prompt_type, data in truncated_data.items():
        if executor is None:
            # Local run: store the finished (key, result, prompt_type) tuple directly.
            job = compute_metric(key, data, prompt_type)
        else:
            job = executor.submit(compute_metric, key, data, prompt_type)
        jobs.append(job)

# Gather results
results = {"creative": {}, "factual": {}}
for job in jobs:
    metric_name, result_data, data_type = job if executor is None else job.result()
    results[data_type][metric_name] = result_data

# Save under the name matching max_num_words:
#np.save("results_20_words.npy", results)
#np.save("results_5_words.npy", results)

In [9]:
# Load the previously computed metric results for the 20-word and 5-word truncations.
# Each file holds a pickled Python object wrapped in a NumPy array (hence allow_pickle=True --
# only load trusted files); .item() unwraps it back into the plain object.
results_20_words = np.load("rlhf_experiments/results_20_words.npy", allow_pickle=True).item()
results_5_words = np.load("rlhf_experiments/results_5_words.npy", allow_pickle=True).item()

In [10]:
results_20_words

{'creative': {'selfBleuSmoothed': array([[[1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [0.9939693 , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [0.9673025 , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ]],
  
         [[0.97638774, 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [1.        , 1.        ],
          [0.9932216 , 1.        ],
          [0.9801943 , 1.        ],
          [1.        , 1.   

## Plot diversity metrics

In [None]:
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): `temperatures` is not defined in any cell visible in this notebook; it
# must exist before this cell runs -- presumably one value per row of the score arrays.

# make font bigger -- set *before* the axes are created so that titles and axis labels
# pick it up on the first run too, not only when the cell is re-run.
plt.rcParams.update({'font.size': 16})

num_metrics = len(diversity_metrics.keys())
# squeeze=False keeps `axes` two-dimensional even when there is a single metric.
fig, axes = plt.subplots(num_metrics, 2, figsize=(20, 5*num_metrics), squeeze=False)

# White outline so the inline labels stay readable on top of the curves.
text_effect = [path_effects.withStroke(linewidth=3, foreground='white')]

# Index into `temperatures` of the point next to which the inline labels are placed.
label_index = -4

# One entry per curve, drawn in this order. `model` indexes the last axis of the score
# arrays; `line` / `text_kwargs` hold the styling that differs between curves.
curves = [
    dict(prompt="factual", model=0, color="blue", legend='llama2-chat', line={},
         text='llama2-chat factual', text_kwargs=dict(verticalalignment='bottom')),
    dict(prompt="factual", model=1, color="orange", legend='Vicuna1.5', line={},
         text='Vicuna1.5 factual', text_kwargs=dict(verticalalignment='top')),
    dict(prompt="creative", model=0, color="blue", legend='llama2-chat creative', line=dict(linestyle='--'),
         text='llama2-chat creative', text_kwargs=dict(verticalalignment='top', style='italic')),
    dict(prompt="creative", model=1, color="orange", legend='Vicuna1.5 creative', line=dict(linestyle='--'),
         text='Vicuna1.5 creative', text_kwargs=dict(verticalalignment='bottom', style='italic')),
]

for index, metric in enumerate(diversity_metrics.keys()):

    # Left column: 20-word truncation, right column: 5-word truncation.
    for col_idx, result in enumerate([results_20_words, results_5_words]):
        ax = axes[index, col_idx]

        curve_means = []
        for curve in curves:
            scores = result[curve["prompt"]][metric][:, :, curve["model"]]

            # Mean over axis 1 with a band of +/- 2 standard errors (std / sqrt(n) over the same axis).
            mean = scores.mean(axis=1)
            half_width = 2 * scores.std(axis=1) / np.sqrt(scores.shape[1])

            ax.plot(temperatures, mean, label=curve["legend"], marker='o', color=curve["color"], **curve["line"])
            ax.fill_between(temperatures, mean - half_width, mean + half_width, alpha=0.2, color=curve["color"])
            curve_means.append(mean)

        ax.set_xlabel('temperature')
        ax.set_ylabel(metric)
        ax.set_title(f'Diversity metric: {metric} for {20 if col_idx == 0 else 5} max words')
        #ax.legend()

        # Add labels directly on the plot with white outline
        for curve, mean in zip(curves, curve_means):
            text = ax.text(temperatures[label_index], mean[label_index], curve["text"],
                           color=curve["color"], **curve["text_kwargs"])
            text.set_path_effects(text_effect)

plt.tight_layout()
plt.show()

## How does the difference in diversity between creative and factual prompts change with temperature?

In [None]:
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): `temperatures` is not defined in any cell visible in this notebook; it
# must exist before this cell runs -- presumably one value per row of the score arrays.

# make font bigger -- set *before* the axes are created so that titles and axis labels
# pick it up on the first run too, not only when the cell is re-run.
plt.rcParams.update({'font.size': 16})

num_metrics = len(diversity_metrics.keys())
# squeeze=False keeps `axes` two-dimensional even when there is a single metric.
fig, axes = plt.subplots(num_metrics, 2, figsize=(20, 5*num_metrics), squeeze=False)

# White outline so the inline labels stay readable on top of the curves.
text_effect = [path_effects.withStroke(linewidth=3, foreground='white')]

# Index into `temperatures` of the point next to which the inline labels are placed.
label_index = -4

# (index on the last axis of the score arrays, colour, label, vertical alignment of the inline label)
models = [
    (0, "blue", 'llama2-chat', 'bottom'),
    (1, "orange", 'Vicuna1.5', 'top'),
]

for index, metric in enumerate(diversity_metrics.keys()):

    # Left column: 20-word truncation, right column: 5-word truncation.
    for col_idx, result in enumerate([results_20_words, results_5_words]):
        ax = axes[index, col_idx]

        scores_factual = result["factual"][metric]
        scores_creative = result["creative"][metric]

        for model_idx, color, label, valign in models:
            # Element-wise creative minus factual score for this model.
            diff = scores_creative[:, :, model_idx] - scores_factual[:, :, model_idx]

            # Mean difference over axis 1 with a band of +/- 2 standard errors.
            mean = diff.mean(axis=1)
            half_width = 2 * diff.std(axis=1) / np.sqrt(diff.shape[1])

            ax.plot(temperatures, mean, label=label, marker='o', color=color)
            ax.fill_between(temperatures, mean - half_width, mean + half_width, alpha=0.2, color=color)

            text = ax.text(temperatures[label_index], mean[label_index], label, color=color, verticalalignment=valign)
            text.set_path_effects(text_effect)

        ax.set_title(f"Metric: {metric} for Result {20 if col_idx == 0 else 5} max words")
        ax.set_xlabel("Temperature")
        ax.set_ylabel(f"Creative {metric} - Factual {metric}")

plt.tight_layout()
plt.show()