In [96]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
import torch
import numpy as np
import json
from tqdm import tqdm

In [97]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [98]:
from scipy.spatial import ConvexHull
from sklearn.decomposition import PCA

def convexhull(x, n_components):
    pca = PCA(n_components = n_components)
    reduced_data = pca.fit_transform(x)
    explained_ratio = pca.explained_variance_ratio_
    hull = ConvexHull(reduced_data)
    volume = hull.volume
    return volume, explained_ratio

In [99]:
def percent_change(values):
    lst = []
    for i in range(len(values)):
        percent_change = ((values[i] - values[6])/values[6])*100
        lst.append(round(percent_change))
    return lst

In [100]:
import tensorflow as tf
import numpy as np

def DPP_diversity(x, lambda0=0.1):
    x = tf.convert_to_tensor(x, dtype='float32')
    
    r = tf.reduce_sum(tf.math.square(x), axis =1, keepdims = True)
    D = r-2*tf.matmul(x, tf.transpose(x))+tf.transpose(r)
    S = tf.exp(-0.5*tf.math.square(D))
    y = tf.ones(np.shape(x)[0])
    Q = tf.tensordot(tf.expand_dims(y, 1), tf.expand_dims(y, 0), 1)
    if lambda0 == 0:
        L = S
    else:
        L= S*tf.math.pow(Q, lambda0)
    try:
        eig_val, _  = tf.linalg.eigh(L)
    except:
        eig_val = tf.ones_like(y)
    loss = -tf.reduce_mean(tf.math.log(tf.math.maximum(eig_val, 1e-7)))
    return loss.numpy()

In [101]:
def distance_to_centroid(embeddings):
    distances = []
    for i in range(embeddings.shape[0]):
        pca = PCA(n_components = 20)
        embeddings = pca.fit_transform(embeddings)
        mean = np.mean(embeddings[i])
        dist = np.sqrt(np.sum(np.square(np.subtract(embeddings[i], mean))))
        distances.append(dist)
    return np.mean(np.array(distances))

In [102]:
def L2_vectorized(X, Y):
    #Vectorize L2 calculation using x^2+y^2-2xy
    X_sq = np.sum(np.square(X), axis=1)
    Y_sq = np.sum(np.square(Y), axis=1)
    sq = np.add(np.expand_dims(X_sq, axis=-1), np.transpose(Y_sq)) - 2*np.matmul(X,np.transpose(Y))
    sq = np.clip(sq, 0.0, 1e12)
    return np.sqrt(sq)

In [103]:
def calc_distance(X, Y, distance="Euclidean"):
    if distance=="Euclidean":
        return L2_vectorized(X,Y)
    else:
        raise Exception("Unknown distance metric specified")

In [104]:
def gen_gen_distance(embeddings, reduction):
    x = embeddings
    res = calc_distance(x, x, distance = "Euclidean")
    # this sets the diagonal of the matrix to the maximum of elements across the column dimension (axis = 1)
    res = tf.linalg.set_diag(res, tf.reduce_max(res, axis=1))
    # pick the smallest values along the columns
    if reduction == "min":
        scores = tf.reduce_min(res, axis=1)
    # pick the average value along the columns
    elif reduction == "ave":
        scores = tf.reduce_mean(res, axis=1)
    else:
        raise Exception("Unknown reduction method")
    return np.mean(scores.numpy())

## Using critique method

## RQ2:

(1b) How does styling of the input prompt impact the output quality and diversity of the prompt?

In [105]:
# List of CSV files
csv_files = [
    'data/critique_towels.csv',
    'data/critique_powder.csv',
    'data/critique_time.csv',
    'data/critique_exercise.csv',
    'data/critique_froth.csv',
]

In [106]:
# this only needs to be run once.
dict_1_DPP = {}
dict_1_convex = {}
dict_1_centroid = {}
dict_1_nearest = {}
for csv_file in tqdm(csv_files):
    df = pd.read_csv(csv_file)
    count = 0
    for column in tqdm(df.columns):
        # encode the column text data into embeddings
        embeddings = model.encode(df[column].astype(str).tolist())
        # calculate the DPP
        dict_1_DPP[(csv_file, count)] = DPP_diversity(embeddings, lambda0=0.1)
        # calculate the convex hull
        dict_1_convex[(csv_file, count)] = convexhull(embeddings, n_components = 13)
        # calculate the distance to centroid
        dict_1_centroid[(csv_file, count)] = distance_to_centroid(embeddings)
        # calculate the nearest generated distance (average)
        dict_1_nearest[(csv_file, count)] = gen_gen_distance(embeddings, reduction = "ave")
        count += 1

  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:28<00:00, 28.28s/it][A
 20%|██        | 1/5 [00:28<01:53, 28.28s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:23<00:00, 23.14s/it][A
 40%|████      | 2/5 [00:51<01:15, 25.26s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:22<00:00, 22.00s/it][A
 60%|██████    | 3/5 [01:13<00:47, 23.77s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:27<00:00, 27.20s/it][A
 80%|████████  | 4/5 [01:40<00:25, 25.13s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:23<00:00, 23.37s/it][A
100%|██████████| 5/5 [02:04<00:00, 24.80s/it]


In [107]:
# DPP

# convert tuples in dictionary to strings
dict_1_DPP_str = {str(key): value for key, value in dict_1_DPP.items()}

# convert float32 values to float
dict_1_DPP_str = {key:float(value) for key, value in dict_1_DPP_str.items()}

with open("data/DPP_Topics.json", "w") as file:
    json.dump(dict_1_DPP_str, file)
    
# Centroid Distance
dict_1_centroid_str = {str(key): value for key, value in dict_1_centroid.items()}

dict_1_centroid_str = {key:float(value) for key, value in dict_1_centroid_str.items()}

with open("data/centroid_Topics.json", "w") as file:
    json.dump(dict_1_centroid_str, file)
    
# Nearest Generated Sample
dict_1_nearest_str = {str(key): value for key, value in dict_1_nearest.items()}

dict_1_nearest_str = {key:float(value) for key, value in dict_1_nearest_str.items()}

with open("data/nearest_Topics.json", "w") as file:
    json.dump(dict_1_nearest_str, file)

In [108]:
# Convex Hull
def convert_to_json(obj):
    if isinstance(obj, tuple):
        return {'__tuple__': True, 'items': list(obj)}
    elif isinstance(obj, np.ndarray):
        return {'__ndarray__': True, 'n_component': obj.tolist()}
    return obj


dict_1_convex_str = {str(key): value for key, value in dict_1_convex.items()}

with open("data/convex_Topics.json", "w") as file:
    json.dump(dict_1_convex_str, file, default = convert_to_json)

In [109]:
# to retrieve the json file
with open("data/DPP_Topics.json", "r") as file:
    dict_1_DPP_json = json.load(file)
dict_1_DPP_json = {eval(key): value for key, value in dict_1_DPP_json.items()}

with open("data/centroid_Topics.json", "r") as file:
    dict_1_centroid_json = json.load(file)
dict_1_centroid_json = {eval(key): value for key, value in dict_1_centroid_json.items()}

with open("data/nearest_Topics.json", "r") as file:
    dict_1_nearest_json = json.load(file)
dict_1_nearest_json = {eval(key): value for key, value in dict_1_nearest_json.items()}

In [110]:
# to convert convex hull back to the original dictionary with tuples and numpy array, we can use a custom decoder function

def custom_decoder(obj):
    if '__tuple__' in obj:
        return tuple(obj['items'])
    elif '__ndarray__' in obj:
        return np.array(obj['n_component'])
    return obj

with open("data/convex_Topics.json", "r") as file:
    dict_1_convex_json = json.load(file)

# Convert tuples and NumPy arrays back to original format
dict_1_convex_json = {key: (value[0], value[1]['n_component']) for key, value in dict_1_convex_json.items()}

# we can clean it up further by summing all the values in the n_components to get the sum of information retained
dict_1_convex_json = {key: (value[0], sum(value[1])) for key, value in dict_1_convex_json.items()}

In [125]:
# load the previous json files as dict
with open("../tempTopP-Adj/data/DPP_Topics.json", "r") as file:
    dict_2_DPP_json = json.load(file)
dict_2_DPP_json = {eval(key): value for key, value in dict_2_DPP_json.items()}

with open("../tempTopP-Adj/data/centroid_Topics.json", "r") as file:
    dict_2_centroid_json = json.load(file)
dict_2_centroid_json = {eval(key): value for key, value in dict_2_centroid_json.items()}

with open("../tempTopP-Adj/data/nearest_Topics.json", "r") as file:
    dict_2_nearest_json = json.load(file)
dict_2_nearest_json = {eval(key): value for key, value in dict_2_nearest_json.items()}

# to convert convex hull back to the original dictionary with tuples and numpy array, we can use a custom decoder function

def custom_decoder(obj):
    if '__tuple__' in obj:
        return tuple(obj['items'])
    elif '__ndarray__' in obj:
        return np.array(obj['n_component'])
    return obj

with open("../tempTopP-Adj/data/convex_Topics.json", "r") as file:
    dict_2_convex_json = json.load(file)

# Convert tuples and NumPy arrays back to original format
dict_2_convex_json = {key: (value[0], value[1]['n_component']) for key, value in dict_2_convex_json.items()}

# we can clean it up further by summing all the values in the n_components to get the sum of information retained
dict_2_convex_json = {key: (value[0], sum(value[1])) for key, value in dict_2_convex_json.items()}


In [115]:
# DPP percent difference calculation
percent_diff = {}
for key, value in dict_2_DPP_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

In [117]:
# append values
percent_diff['data/ablation_topic_towels.csv'].append(dict_1_DPP_json[('data/critique_towels.csv', 0)])
percent_diff['data/ablation_topic_powder.csv'].append(dict_1_DPP_json[('data/critique_powder.csv', 0)])
percent_diff['data/ablation_topic_time.csv'].append(dict_1_DPP_json[('data/critique_time.csv', 0)])
percent_diff['data/ablation_topic_exercise.csv'].append(dict_1_DPP_json[('data/critique_exercise.csv', 0)])
percent_diff['data/ablation_topic_froth.csv'].append(dict_1_DPP_json[('data/critique_froth.csv', 0)])

In [119]:

print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique

Percent difference for data/ablation_topic_towels.csv:[17, -1, 7, 23, -8, -3, 0, -8]
Percent difference for data/ablation_topic_powder.csv:[10, 37, 32, 32, 10, 6, 0, -16]
Percent difference for data/ablation_topic_time.csv:[51, 16, 31, 58, 71, -2, 0, 31]
Percent difference for data/ablation_topic_exercise.csv:[30, 21, 5, 41, 13, 7, 0, -15]
Percent difference for data/ablation_topic_froth.csv:[8, 11, 1, -13, -1, 1, 0, -19]


In [123]:
# Nearest percent difference calculation
percent_diff = {}
for key, value in dict_2_nearest_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

# append values
percent_diff['data/ablation_topic_towels.csv'].append(dict_1_nearest_json[('data/critique_towels.csv', 0)])
percent_diff['data/ablation_topic_powder.csv'].append(dict_1_nearest_json[('data/critique_powder.csv', 0)])
percent_diff['data/ablation_topic_time.csv'].append(dict_1_nearest_json[('data/critique_time.csv', 0)])
percent_diff['data/ablation_topic_exercise.csv'].append(dict_1_nearest_json[('data/critique_exercise.csv', 0)])
percent_diff['data/ablation_topic_froth.csv'].append(dict_1_nearest_json[('data/critique_froth.csv', 0)])


print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique

Percent difference for data/ablation_topic_towels.csv:[-24, -15, -17, -24, -10, 3, 0, -8]
Percent difference for data/ablation_topic_powder.csv:[-7, -11, -10, -10, -7, 0, 0, 1]
Percent difference for data/ablation_topic_time.csv:[-16, -9, -11, -11, -16, 1, 0, -13]
Percent difference for data/ablation_topic_exercise.csv:[-4, -4, -2, -5, -4, -2, 0, -2]
Percent difference for data/ablation_topic_froth.csv:[-7, -8, -7, -9, -7, -2, 0, -14]


In [131]:
# Convex Hull percent difference calculation
percent_diff = {}
for key, value in dict_2_convex_json.items():
    key = eval(key)
    csv_file = key[0]
    value1 = value[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value1)

# append values
percent_diff['data/ablation_topic_towels.csv'].append(dict_1_convex_json["('data/critique_towels.csv', 0)"][0])
percent_diff['data/ablation_topic_powder.csv'].append(dict_1_convex_json["('data/critique_powder.csv', 0)"][0])
percent_diff['data/ablation_topic_time.csv'].append(dict_1_convex_json["('data/critique_time.csv', 0)"][0])
percent_diff['data/ablation_topic_exercise.csv'].append(dict_1_convex_json["('data/critique_exercise.csv', 0)"][0])
percent_diff['data/ablation_topic_froth.csv'].append(dict_1_convex_json["('data/critique_froth.csv', 0)"][0])


print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique

Percent difference for data/ablation_topic_towels.csv:[-98, -94, -95, -98, -84, 26, 0, -82]
Percent difference for data/ablation_topic_powder.csv:[-72, -84, -77, -80, -77, 32, 0, -27]
Percent difference for data/ablation_topic_time.csv:[-95, -84, -86, -86, -95, -12, 0, -93]
Percent difference for data/ablation_topic_exercise.csv:[-46, -46, -45, -55, -54, -19, 0, -54]
Percent difference for data/ablation_topic_froth.csv:[-86, -83, -86, -65, -88, -33, 0, -83]


In [134]:
# Centroid Distance percent difference calculation
percent_diff = {}
for key, value in dict_2_centroid_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

# append values
percent_diff['data/ablation_topic_towels.csv'].append(dict_1_centroid_json[('data/critique_towels.csv', 0)])
percent_diff['data/ablation_topic_powder.csv'].append(dict_1_centroid_json[('data/critique_powder.csv', 0)])
percent_diff['data/ablation_topic_time.csv'].append(dict_1_centroid_json[('data/critique_time.csv', 0)])
percent_diff['data/ablation_topic_exercise.csv'].append(dict_1_centroid_json[('data/critique_exercise.csv', 0)])
percent_diff['data/ablation_topic_froth.csv'].append(dict_1_centroid_json[('data/critique_froth.csv', 0)])


print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2, critique

Percent difference for data/ablation_topic_towels.csv:[-27, -19, -20, -27, -13, 2, 0, -11]
Percent difference for data/ablation_topic_powder.csv:[-8, -10, -10, -11, -8, 1, 0, 1]
Percent difference for data/ablation_topic_time.csv:[-17, -10, -11, -11, -16, 1, 0, -15]
Percent difference for data/ablation_topic_exercise.csv:[-3, -4, -2, -4, -3, -1, 0, -3]
Percent difference for data/ablation_topic_froth.csv:[-7, -8, -8, -15, -8, -2, 0, -18]
