In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [7]:
from scipy.spatial import ConvexHull
from sklearn.decomposition import PCA
def convexhull(x, n_components):
    pca = PCA(n_components = n_components)
    reduced_data = pca.fit_transform(x)
    explained_ratio = pca.explained_variance_ratio_
    hull = ConvexHull(reduced_data)
    volume = hull.volume
    return volume, explained_ratio

In [8]:
def percent_change(values):
    lst = []
    for i in range(len(values)):
        percent_change = ((values[i] - values[0])/values[0])*100
        lst.append(round(percent_change))
    return lst

In [9]:
import tensorflow as tf
import numpy as np
def DPP_diversity(x, lambda0=0.1):
    x = tf.convert_to_tensor(x, dtype='float32')
    
    r = tf.reduce_sum(tf.math.square(x), axis =1, keepdims = True)
    D = r-2*tf.matmul(x, tf.transpose(x))+tf.transpose(r)
    S = tf.exp(-0.5*tf.math.square(D))
    y = tf.ones(np.shape(x)[0])
    Q = tf.tensordot(tf.expand_dims(y, 1), tf.expand_dims(y, 0), 1)
    if lambda0 == 0:
        L = S
    else:
        L= S*tf.math.pow(Q, lambda0)
    try:
        eig_val, _  = tf.linalg.eigh(L)
    except:
        eig_val = tf.ones_like(y)
    loss = -tf.reduce_mean(tf.math.log(tf.math.maximum(eig_val, 1e-7)))
    return loss.numpy()

In [10]:
def distance_to_centroid(embeddings):
    distances = []
    for i in range(embeddings.shape[0]):
        pca = PCA(n_components = 20)
        embeddings = pca.fit_transform(embeddings)
        mean = np.mean(embeddings[i])
        dist = np.sqrt(np.sum(np.square(np.subtract(embeddings[i], mean))))
        distances.append(dist)
    return np.mean(np.array(distances))

In [11]:
def L2_vectorized(X, Y):
    #Vectorize L2 calculation using x^2+y^2-2xy
    X_sq = np.sum(np.square(X), axis=1)
    Y_sq = np.sum(np.square(Y), axis=1)
    sq = np.add(np.expand_dims(X_sq, axis=-1), np.transpose(Y_sq)) - 2*np.matmul(X,np.transpose(Y))
    sq = np.clip(sq, 0.0, 1e12)
    return np.sqrt(sq)

In [12]:
def calc_distance(X, Y, distance="Euclidean"):
    if distance=="Euclidean":
        return L2_vectorized(X,Y)
    else:
        raise Exception("Unknown distance metric specified")

In [13]:
def gen_gen_distance(embeddings, reduction):
    x = embeddings
    res = calc_distance(x, x, distance = "Euclidean")
    # this sets the diagonal of the matrix to the maximum of elements across the column dimension (axis = 1)
    res = tf.linalg.set_diag(res, tf.reduce_max(res, axis=1))
    # pick the smallest values along the columns
    if reduction == "min":
        scores = tf.reduce_min(res, axis=1)
    # pick the average value along the columns
    elif reduction == "ave":
        scores = tf.reduce_mean(res, axis=1)
    else:
        raise Exception("Unknown reduction method")
    return np.mean(scores.numpy())

## RQ1:

(1a) How do parameters such as temperature and Top P affect the quality and diversity of the generated text output?

In [16]:
# List of CSV files
csv_files = [
    'data/ablation_tempTopP_froth.csv',
    'data/ablation_tempTopP_towels.csv',
    'data/ablation_tempTopP_time.csv',
    'data/ablation_tempTopP_powder.csv',
    'data/ablation_tempTopP_exercise.csv'
]

In [19]:
# this only needs to be ran once.
dict_1_DPP = {}
dict_1_convex = {}
dict_1_centroid = {}
dict_1_nearest = {}
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    count = 0
    for column in df.columns:
        # encode the column text data into embeddings
        embeddings = model.encode(df[column].astype(str).tolist())
        # calculate the DPP
        dict_1_DPP[(csv_file, count)] = DPP_diversity(embeddings, lambda0=0.1)
        # calculate the convex hull
        dict_1_convex[(csv_file, count)] = convexhull(embeddings, n_components = 13)
        # calculate the distance to centroid
        dict_1_centroid[(csv_file, count)] = distance_to_centroid(embeddings)
        # calculate the nearest generated distance (average)
        dict_1_nearest[(csv_file, count)] = gen_gen_distance(embeddings, reduction = "ave")
        count += 1
        

In [41]:
# save the dictionary results into a json file
import json
# DPP
# convert tuples in dictionary to strings
dict_1_DPP_str = {str(key): value for key, value in dict_1_DPP.items()}

# convert float32 values to float
dict_1_DPP_str = {key:float(value) for key, value in dict_1_DPP_str.items()}

with open("DPP_TempTopP.json", "w") as file:
    json.dump(dict_1_DPP_str, file)
    
# Centroid Distance
dict_1_centroid_str = {str(key): value for key, value in dict_1_centroid.items()}

dict_1_centroid_str = {key:float(value) for key, value in dict_1_centroid_str.items()}

with open("centroid_TempTopP.json", "w") as file:
    json.dump(dict_1_centroid_str, file)
    
# Nearest Generated Sample
dict_1_nearest_str = {str(key): value for key, value in dict_1_nearest.items()}

dict_1_nearest_str = {key:float(value) for key, value in dict_1_nearest_str.items()}

with open("nearest_TempTopP.json", "w") as file:
    json.dump(dict_1_nearest_str, file)

In [27]:
# to retrieve the json file
with open("DPP_TempTopP.json", "r") as file:
    dict_1_DPP_json = json.load(file)
dict_1_DPP_json = {eval(key): value for key, value in dict_1_DPP_json.items()}

with open("centroid_TempTopP.json", "r") as file:
    dict_1_centroid_json = json.load(file)
dict_1_centroid_json = {eval(key): value for key, value in dict_1_centroid_json.items()}

with open("nearest_TempTopP.json", "r") as file:
    dict_1_nearest_json = json.load(file)
dict_1_nearest_json = {eval(key): value for key, value in dict_1_nearest_json.items()}

In [60]:
# Convex hull is a bit trickier...

# Convert tuples and arrays to compatible format
# custom conversion function that converts tuples to a dictionary with a special key and arrays
# to a dictionary with a special key
def convert_to_json(obj):
    if isinstance(obj, tuple):
        return {'__tuple__': True, 'items': list(obj)}
    elif isinstance(obj, np.ndarray):
        return {'__ndarray__': True, 'n_component': obj.tolist()}
    return obj


dict_1_convex_str = {str(key): value for key, value in dict_1_convex.items()}

with open("convex_TempTopP.json", "w") as file:
    json.dump(dict_1_convex_str, file, default = convert_to_json)

In [28]:
# to convert convex hull back to the original dictionary with tuples and numpy array, we can use a custom decoder function

def custom_decoder(obj):
    if '__tuple__' in obj:
        return tuple(obj['items'])
    elif '__ndarray__' in obj:
        return np.array(obj['n_component'])
    return obj

with open("convex_TempTopP.json", "r") as file:
    dict_1_convex_json = json.load(file)

# Convert tuples and NumPy arrays back to original format
dict_1_convex_json = {key: (value[0], value[1]['n_component']) for key, value in dict_1_convex_json.items()}

# we can clean it up further by summing all the values in the n_components to get the sum of information retained
dict_1_convex_json = {key: (value[0], sum(value[1])) for key, value in dict_1_convex_json.items()}

In [94]:
# DPP percent difference calculation
percent_diff = {}
for key, value in dict_1_DPP_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

Percent difference for ablation_tempTopP_froth.csv:[0, 6, 3, 3, -14, -17, 7, -4, -25, -26]
Percent difference for ablation_tempTopP_towels.csv:[0, -9, 8, -1, -1, -17, -2, -11, -31, -29]
Percent difference for ablation_tempTopP_time.csv:[0, -5, 13, -3, 6, -21, 19, 10, -49, -48]
Percent difference for ablation_tempTopP_powder.csv:[0, -5, 5, -10, 0, -31, -13, 6, -33, -37]
Percent difference for ablation_tempTopP_exercise.csv:[0, 5, -6, -12, -18, -26, -6, -17, -39, -43]


In [96]:
# Nearest Generated Difference percent difference calculation
percent_diff = {}
for key, value in dict_1_nearest_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

Percent difference for ablation_tempTopP_froth.csv:[0, -1, -1, -4, -2, 8, 4, -4, 20, 23]
Percent difference for ablation_tempTopP_towels.csv:[0, -1, -16, 3, 8, 23, 5, 11, 65, 61]
Percent difference for ablation_tempTopP_time.csv:[0, 0, -6, -5, -11, -2, -19, -9, 18, 16]
Percent difference for ablation_tempTopP_powder.csv:[0, 10, -2, 12, 8, 25, 13, -11, 35, 35]
Percent difference for ablation_tempTopP_exercise.csv:[0, 1, 2, 5, 4, 5, 2, 4, 7, 10]


In [114]:
# Convex Hull percent difference calculation
percent_diff = {}
for key, value in dict_1_convex_json.items():
    key = eval(key)
    csv_file = key[0]
    value1 = value[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value1)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

Percent difference for ablation_tempTopP_froth.csv:[0, 12, 1, -34, -43, 109, 141, -45, 1097, 1704]
Percent difference for ablation_tempTopP_towels.csv:[0, 13, -84, 105, 234, 1438, 71, 280, 124959, 99300]
Percent difference for ablation_tempTopP_time.csv:[0, 21, -50, -45, -73, -35, -90, -60, 1110, 1280]
Percent difference for ablation_tempTopP_powder.csv:[0, 221, -1, 465, 278, 2396, 482, -69, 11808, 8956]
Percent difference for ablation_tempTopP_exercise.csv:[0, 25, 28, 146, 66, 111, 32, 46, 215, 290]


In [97]:
# Centroid Distance percent difference calculation
percent_diff = {}
for key, value in dict_1_centroid_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

Percent difference for ablation_tempTopP_froth.csv:[0, -1, -1, -4, -5, 6, 6, -5, 17, 21]
Percent difference for ablation_tempTopP_towels.csv:[0, -4, -16, 3, 8, 19, 6, 9, 65, 61]
Percent difference for ablation_tempTopP_time.csv:[0, -1, -5, -6, -11, -6, -20, -10, 15, 14]
Percent difference for ablation_tempTopP_powder.csv:[0, 11, -2, 14, 11, 25, 13, -10, 36, 35]
Percent difference for ablation_tempTopP_exercise.csv:[0, 2, 2, 5, 2, 2, 2, 3, 5, 6]


## RQ2:

(1b) How does styling of the input prompt impact the output quality and diversity of the prompt?

In [14]:
# List of CSV files
csv_files = [
    'data/ablation_topic_towels.csv',
    'data/ablation_topic_powder.csv',
    'data/ablation_topic_time.csv',
    'data/ablation_topic_exercise.csv',
    'data/ablation_topic_froth.csv'
]

In [15]:
# this only needs to be ran once.
dict_2_DPP = {}
dict_2_convex = {}
dict_2_centroid = {}
dict_2_nearest = {}
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    count = 0
    for column in df.columns:
        # encode the column text data into embeddings
        embeddings = model.encode(df[column].astype(str).tolist())
        # calculate the DPP
        dict_2_DPP[(csv_file, count)] = DPP_diversity(embeddings, lambda0=0.1)
        # calculate the convex hull
        dict_2_convex[(csv_file, count)] = convexhull(embeddings, n_components = 13)
        # calculate the distance to centroid
        dict_2_centroid[(csv_file, count)] = distance_to_centroid(embeddings)
        # calculate the nearest generated distance (average)
        dict_2_nearest[(csv_file, count)] = gen_gen_distance(embeddings, reduction = "ave")
        count += 1

In [18]:
# DPP
import json
# convert tuples in dictionary to strings
dict_2_DPP_str = {str(key): value for key, value in dict_2_DPP.items()}

# convert float32 values to float
dict_2_DPP_str = {key:float(value) for key, value in dict_2_DPP_str.items()}

with open("DPP_Topics.json", "w") as file:
    json.dump(dict_2_DPP_str, file)
    
# Centroid Distance
dict_2_centroid_str = {str(key): value for key, value in dict_2_centroid.items()}

dict_2_centroid_str = {key:float(value) for key, value in dict_2_centroid_str.items()}

with open("centroid_Topics.json", "w") as file:
    json.dump(dict_2_centroid_str, file)
    
# Nearest Generated Sample
dict_2_nearest_str = {str(key): value for key, value in dict_2_nearest.items()}

dict_2_nearest_str = {key:float(value) for key, value in dict_2_nearest_str.items()}

with open("nearest_Topics.json", "w") as file:
    json.dump(dict_2_nearest_str, file)

In [19]:
# Convex Hull
def convert_to_json(obj):
    if isinstance(obj, tuple):
        return {'__tuple__': True, 'items': list(obj)}
    elif isinstance(obj, np.ndarray):
        return {'__ndarray__': True, 'n_component': obj.tolist()}
    return obj


dict_2_convex_str = {str(key): value for key, value in dict_2_convex.items()}

with open("convex_Topics.json", "w") as file:
    json.dump(dict_2_convex_str, file, default = convert_to_json)

In [20]:
# to retrieve the json file
with open("DPP_Topics.json", "r") as file:
    dict_2_DPP_json = json.load(file)
dict_2_DPP_json = {eval(key): value for key, value in dict_2_DPP_json.items()}

with open("centroid_Topics.json", "r") as file:
    dict_2_centroid_json = json.load(file)
dict_2_centroid_json = {eval(key): value for key, value in dict_2_centroid_json.items()}

with open("nearest_Topics.json", "r") as file:
    dict_2_nearest_json = json.load(file)
dict_2_nearest_json = {eval(key): value for key, value in dict_2_nearest_json.items()}

In [21]:
# to convert convex hull back to the original dictionary with tuples and numpy array, we can use a custom decoder function

def custom_decoder(obj):
    if '__tuple__' in obj:
        return tuple(obj['items'])
    elif '__ndarray__' in obj:
        return np.array(obj['n_component'])
    return obj

with open("convex_Topics.json", "r") as file:
    dict_2_convex_json = json.load(file)

# Convert tuples and NumPy arrays back to original format
dict_2_convex_json = {key: (value[0], value[1]['n_component']) for key, value in dict_2_convex_json.items()}

# we can clean it up further by summing all the values in the n_components to get the sum of information retained
dict_2_convex_json = {key: (value[0], sum(value[1])) for key, value in dict_2_convex_json.items()}

In [22]:
# DPP percent difference calculation
percent_diff = {}
for key, value in dict_2_DPP_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2

Percent difference for data/ablation_topic_towels.csv:[0, -16, -9, 5, -22, -17, -15]
Percent difference for data/ablation_topic_powder.csv:[0, 25, 20, 20, 1, -4, -9]
Percent difference for data/ablation_topic_time.csv:[0, -23, -13, 5, 13, -35, -34]
Percent difference for data/ablation_topic_exercise.csv:[0, -7, -20, 8, -13, -18, -23]
Percent difference for data/ablation_topic_froth.csv:[0, 3, -6, -19, -8, -7, -7]


In [23]:
# Nearest percent difference calculation
percent_diff = {}
for key, value in dict_2_nearest_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2

Percent difference for data/ablation_topic_towels.csv:[0, 12, 8, -1, 18, 34, 31]
Percent difference for data/ablation_topic_powder.csv:[0, -5, -3, -4, -1, 8, 7]
Percent difference for data/ablation_topic_time.csv:[0, 8, 6, 6, 1, 21, 19]
Percent difference for data/ablation_topic_exercise.csv:[0, 0, 1, -1, 0, 1, 4]
Percent difference for data/ablation_topic_froth.csv:[0, -1, 0, -2, 0, 6, 7]


In [25]:
# Convex Hull percent difference calculation
percent_diff = {}
for key, value in dict_2_convex_json.items():
    key = eval(key)
    csv_file = key[0]
    value1 = value[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value1)

print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2

Percent difference for data/ablation_topic_towels.csv:[0, 291, 242, 27, 905, 8030, 6362]
Percent difference for data/ablation_topic_powder.csv:[0, -40, -17, -28, -16, 377, 263]
Percent difference for data/ablation_topic_time.csv:[0, 235, 205, 192, 14, 1769, 2032]
Percent difference for data/ablation_topic_exercise.csv:[0, 0, 2, -17, -15, 49, 85]
Percent difference for data/ablation_topic_froth.csv:[0, 19, 2, 147, -17, 375, 610]


In [26]:
# Centroid Distance percent difference calculation
percent_diff = {}
for key, value in dict_2_centroid_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, human-1, human-2

Percent difference for data/ablation_topic_towels.csv:[0, 11, 8, 0, 18, 39, 36]
Percent difference for data/ablation_topic_powder.csv:[0, -3, -3, -3, 0, 9, 8]
Percent difference for data/ablation_topic_time.csv:[0, 8, 7, 7, 1, 23, 21]
Percent difference for data/ablation_topic_exercise.csv:[0, -1, 2, -1, 0, 2, 3]
Percent difference for data/ablation_topic_froth.csv:[0, -1, -1, -8, -1, 5, 8]


## RQ3:

(1c) Do different design matters impact the output quality of the prompt?

In [29]:
# to retrieve the json file
with open("DPP_Topics.json", "r") as file:
    dict_2_DPP_json = json.load(file)
dict_2_DPP_json = {eval(key): value for key, value in dict_2_DPP_json.items()}

with open("centroid_Topics.json", "r") as file:
    dict_2_centroid_json = json.load(file)
dict_2_centroid_json = {eval(key): value for key, value in dict_2_centroid_json.items()}

with open("nearest_Topics.json", "r") as file:
    dict_2_nearest_json = json.load(file)
dict_2_nearest_json = {eval(key): value for key, value in dict_2_nearest_json.items()}

# to convert convex hull back to the original dictionary with tuples and numpy array, we can use a custom decoder function

def custom_decoder(obj):
    if '__tuple__' in obj:
        return tuple(obj['items'])
    elif '__ndarray__' in obj:
        return np.array(obj['n_component'])
    return obj

with open("convex_TempTopP.json", "r") as file:
    dict_1_convex_json = json.load(file)

# Convert tuples and NumPy arrays back to original format
dict_1_convex_json = {key: (value[0], value[1]['n_component']) for key, value in dict_1_convex_json.items()}

# we can clean it up further by summing all the values in the n_components to get the sum of information retained
dict_1_convex_json = {key: (value[0], sum(value[1])) for key, value in dict_1_convex_json.items()}

In [32]:
# work in progress to calculate it all.