In [16]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
import torch
import numpy as np
import json
from scipy.spatial import ConvexHull
from sklearn.decomposition import PCA
import tensorflow as tf

In [17]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [18]:
def convexhull(x, n_components):
    pca = PCA(n_components = n_components)
    reduced_data = pca.fit_transform(x)
    explained_ratio = pca.explained_variance_ratio_
    hull = ConvexHull(reduced_data)
    volume = hull.volume
    return volume, explained_ratio

In [34]:
def percent_change(values):
    lst = []
    for i in range(len(values)):
        percent_change = ((values[i] - values[len(values)-1])/abs(values[len(values)-1]))*100
        lst.append(round(percent_change))
    return lst

In [20]:
def DPP_diversity(x, lambda0=0):
    x = tf.convert_to_tensor(x, dtype='float32')
    
    # Normalize the rows of x to have unit norm, which is required for cosine similarity
    x_normalized = tf.linalg.normalize(x, axis=1)[0]
    
    # Compute the cosine similarity matrix
    S = tf.matmul(x_normalized, tf.transpose(x_normalized))
    
    # Transform cosine similarity values to be non-negative
    S_non_negative = (S + 1.0) / 2.0
    
    # Create a vector of ones with the same length as the number of points in x
    y = tf.ones(np.shape(x)[0])
    
    # Compute the outer product of y with itself, resulting in a matrix of Q where all elements are equal to 1
    Q = tf.tensordot(tf.expand_dims(y, 1), tf.expand_dims(y, 0), 1)
    if lambda0 == 0:
        L = S_non_negative
    else:
        L= S*tf.math.pow(Q, lambda0)
    # Compute the eigenvalues of L
    eig_val, _  = tf.linalg.eigh(L)
    # compute the log-determinant of L using the eigenvalues
    log_det_L = -tf.reduce_mean(tf.math.log(tf.math.maximum(eig_val, 1e-7)))
    return log_det_L.numpy()

In [6]:
def distance_to_centroid(embeddings):
    distances = []
    for i in range(embeddings.shape[0]):
        pca = PCA(n_components = 20)
        embeddings = pca.fit_transform(embeddings)
        mean = np.mean(embeddings[i])
        dist = np.sqrt(np.sum(np.square(np.subtract(embeddings[i], mean))))
        distances.append(dist)
    return np.mean(np.array(distances))

In [7]:
def L2_vectorized(X, Y):
    #Vectorize L2 calculation using x^2+y^2-2xy
    X_sq = np.sum(np.square(X), axis=1)
    Y_sq = np.sum(np.square(Y), axis=1)
    sq = np.add(np.expand_dims(X_sq, axis=-1), np.transpose(Y_sq)) - 2*np.matmul(X,np.transpose(Y))
    sq = np.clip(sq, 0.0, 1e12)
    return np.sqrt(sq)

In [8]:
def calc_distance(X, Y, distance="Euclidean"):
    if distance=="Euclidean":
        return L2_vectorized(X,Y)
    else:
        raise Exception("Unknown distance metric specified")

In [9]:
def gen_gen_distance(embeddings, reduction):
    x = embeddings
    res = calc_distance(x, x, distance = "Euclidean")
    # this sets the diagonal of the matrix to the maximum of elements across the column dimension (axis = 1)
    res = tf.linalg.set_diag(res, tf.reduce_max(res, axis=1))
    # pick the smallest values along the columns
    if reduction == "min":
        scores = tf.reduce_min(res, axis=1)
    # pick the average value along the columns
    elif reduction == "ave":
        scores = tf.reduce_mean(res, axis=1)
    else:
        raise Exception("Unknown reduction method")
    return np.mean(scores.numpy())

## RQ1:

(1a) How do parameters such as temperature and Top P affect the quality and diversity of the generated text output?

In [21]:
# List of CSV files
csv_files = [
    'data/ablation_tempTopP_froth.csv',
    'data/ablation_tempTopP_towels.csv',
    'data/ablation_tempTopP_time.csv',
    'data/ablation_tempTopP_powder.csv',
    'data/ablation_tempTopP_exercise.csv'
]

In [35]:
# this only needs to be ran once.
dict_1_DPP = {}
#dict_1_convex = {}
#dict_1_centroid = {}
#dict_1_nearest = {}
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    count = 0
    for column in df.columns:
        # encode the column text data into embeddings
        embeddings = model.encode(df[column].astype(str).tolist())
        # calculate the DPP
        dict_1_DPP[(csv_file, count)] = DPP_diversity(embeddings, lambda0=0)
        # calculate the convex hull
        #dict_1_convex[(csv_file, count)] = convexhull(embeddings, n_components = 13)
        # calculate the distance to centroid
        #dict_1_centroid[(csv_file, count)] = distance_to_centroid(embeddings)
        # calculate the nearest generated distance (average)
        #dict_1_nearest[(csv_file, count)] = gen_gen_distance(embeddings, reduction = "ave")
        count += 1
        

In [36]:
# DPP
# convert tuples in dictionary to strings
dict_1_DPP_str = {str(key): value for key, value in dict_1_DPP.items()}

# convert float32 values to float
dict_1_DPP_str = {key:float(value) for key, value in dict_1_DPP_str.items()}

with open("data/DPP_TempTopP.json", "w") as file:
    json.dump(dict_1_DPP_str, file)

In [34]:
# save the dictionary results into a json file
    
# Centroid Distance
dict_1_centroid_str = {str(key): value for key, value in dict_1_centroid.items()}

dict_1_centroid_str = {key:float(value) for key, value in dict_1_centroid_str.items()}

with open("data/centroid_TempTopP.json", "w") as file:
    json.dump(dict_1_centroid_str, file)
    
# Nearest Generated Sample
dict_1_nearest_str = {str(key): value for key, value in dict_1_nearest.items()}

dict_1_nearest_str = {key:float(value) for key, value in dict_1_nearest_str.items()}

with open("data/nearest_TempTopP.json", "w") as file:
    json.dump(dict_1_nearest_str, file)

In [28]:
# to retrieve the json file
with open("data/DPP_TempTopP.json", "r") as file:
    dict_1_DPP_json = json.load(file)
dict_1_DPP_json = {eval(key): value for key, value in dict_1_DPP_json.items()}

with open("data/centroid_TempTopP.json", "r") as file:
    dict_1_centroid_json = json.load(file)
dict_1_centroid_json = {eval(key): value for key, value in dict_1_centroid_json.items()}

with open("data/nearest_TempTopP.json", "r") as file:
    dict_1_nearest_json = json.load(file)
dict_1_nearest_json = {eval(key): value for key, value in dict_1_nearest_json.items()}

In [36]:
# Convex hull is a bit trickier...

# Convert tuples and arrays to compatible format
# custom conversion function that converts tuples to a dictionary with a special key and arrays
# to a dictionary with a special key
def convert_to_json(obj):
    if isinstance(obj, tuple):
        return {'__tuple__': True, 'items': list(obj)}
    elif isinstance(obj, np.ndarray):
        return {'__ndarray__': True, 'n_component': obj.tolist()}
    return obj


dict_1_convex_str = {str(key): value for key, value in dict_1_convex.items()}

with open("data/convex_TempTopP.json", "w") as file:
    json.dump(dict_1_convex_str, file, default = convert_to_json)

In [37]:
# to convert convex hull back to the original dictionary with tuples and numpy array, we can use a custom decoder function

def custom_decoder(obj):
    if '__tuple__' in obj:
        return tuple(obj['items'])
    elif '__ndarray__' in obj:
        return np.array(obj['n_component'])
    return obj

with open("data/convex_TempTopP.json", "r") as file:
    dict_1_convex_json = json.load(file)

# Convert tuples and NumPy arrays back to original format
dict_1_convex_json = {key: (value[0], value[1]['n_component']) for key, value in dict_1_convex_json.items()}

# we can clean it up further by summing all the values in the n_components to get the sum of information retained
dict_1_convex_json = {key: (value[0], sum(value[1])) for key, value in dict_1_convex_json.items()}

In [37]:
# DPP percent difference calculation
percent_diff = {}
for key, value in dict_1_DPP_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(csv_file)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

data/ablation_tempTopP_froth.csv
Percent difference for data/ablation_tempTopP_froth.csv:[-24, -25, -24, -25, -19, -8, -22, -24, 0, 0]
data/ablation_tempTopP_towels.csv
Percent difference for data/ablation_tempTopP_towels.csv:[-48, -42, -69, -48, -41, -20, -46, -33, 2, 0]
data/ablation_tempTopP_time.csv
Percent difference for data/ablation_tempTopP_time.csv:[-27, -27, -42, -30, -40, -20, -53, -43, 0, 0]
data/ablation_tempTopP_powder.csv
Percent difference for data/ablation_tempTopP_powder.csv:[-36, -25, -41, -22, -30, -5, -20, -50, 0, 0]
data/ablation_tempTopP_exercise.csv
Percent difference for data/ablation_tempTopP_exercise.csv:[-26, -28, -23, -16, -14, -6, -23, -14, -6, 0]


In [39]:
# Nearest Generated Difference percent difference calculation
percent_diff = {}
for key, value in dict_1_nearest_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

Percent difference for data/ablation_tempTopP_froth.csv:[-18, -19, -19, -21, -20, -12, -15, -21, -2, 0]
Percent difference for data/ablation_tempTopP_towels.csv:[-38, -38, -48, -36, -33, -24, -35, -31, 3, 0]
Percent difference for data/ablation_tempTopP_time.csv:[-14, -14, -20, -18, -23, -16, -30, -22, 1, 0]
Percent difference for data/ablation_tempTopP_powder.csv:[-26, -18, -27, -17, -19, -7, -16, -34, 0, 0]
Percent difference for data/ablation_tempTopP_exercise.csv:[-9, -8, -7, -4, -5, -4, -7, -5, -2, 0]


In [40]:
# Convex Hull percent difference calculation
percent_diff = {}
for key, value in dict_1_convex_json.items():
    key = eval(key)
    csv_file = key[0]
    value1 = value[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value1)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

Percent difference for data/ablation_tempTopP_froth.csv:[-94, -94, -94, -96, -97, -88, -87, -97, -34, 0]
Percent difference for data/ablation_tempTopP_towels.csv:[-100, -100, -100, -100, -100, -98, -100, -100, 26, 0]
Percent difference for data/ablation_tempTopP_time.csv:[-93, -91, -96, -96, -98, -95, -99, -97, -12, 0]
Percent difference for data/ablation_tempTopP_powder.csv:[-99, -96, -99, -94, -96, -72, -94, -100, 32, 0]
Percent difference for data/ablation_tempTopP_exercise.csv:[-74, -68, -67, -37, -57, -46, -66, -63, -19, 0]


In [41]:
# Centroid Distance percent difference calculation
percent_diff = {}
for key, value in dict_1_centroid_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)" )
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes (TopP=0|Temperature=0, TopP=0.5|Temperature=0, TopP=1|Temperature=0, TopP=0|Temperature=1, TopP=0.5|Temperature=1, TopP=1|Temperature=1, TopP=0|Temperature=2, TopP=0.5|Temperature=2, Human 50 v1, Human 50 v2)

Percent difference for data/ablation_tempTopP_froth.csv:[-17, -18, -18, -21, -22, -12, -12, -21, -3, 0]
Percent difference for data/ablation_tempTopP_towels.csv:[-38, -40, -48, -36, -33, -27, -34, -33, 2, 0]
Percent difference for data/ablation_tempTopP_time.csv:[-12, -13, -17, -18, -22, -17, -30, -21, 1, 0]
Percent difference for data/ablation_tempTopP_powder.csv:[-26, -18, -27, -16, -18, -8, -17, -34, 1, 0]
Percent difference for data/ablation_tempTopP_exercise.csv:[-5, -4, -3, -1, -3, -3, -3, -3, -1, 0]


## RQ2:

(1b) How does styling of the input prompt impact the output quality and diversity of the prompt?

In [70]:
# List of CSV files
csv_files = [
    'data/ablation_topic_towels.csv',
    'data/ablation_topic_powder.csv',
    'data/ablation_topic_time.csv',
    'data/ablation_topic_exercise.csv',
    'data/ablation_topic_froth.csv'
]

In [71]:
# this only needs to be ran once.
dict_2_DPP = {}
dict_2_convex = {}
dict_2_centroid = {}
dict_2_nearest = {}
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    count = 0
    for column in df.columns:
        # encode the column text data into embeddings
        embeddings = model.encode(df[column].astype(str).tolist())
        # calculate the DPP
        dict_2_DPP[(csv_file, count)] = DPP_diversity(embeddings, lambda0=0)
        # calculate the convex hull
        dict_2_convex[(csv_file, count)] = convexhull(embeddings, n_components = 13)
        # calculate the distance to centroid
        dict_2_centroid[(csv_file, count)] = distance_to_centroid(embeddings)
        # calculate the nearest generated distance (average)
        dict_2_nearest[(csv_file, count)] = gen_gen_distance(embeddings, reduction = "ave")
        count += 1

In [72]:
# DPP
import json
# convert tuples in dictionary to strings
dict_2_DPP_str = {str(key): value for key, value in dict_2_DPP.items()}

# convert float32 values to float
dict_2_DPP_str = {key:float(value) for key, value in dict_2_DPP_str.items()}

with open("data/DPP_Topics.json", "w") as file:
    json.dump(dict_2_DPP_str, file)

In [73]:
# Centroid Distance
dict_2_centroid_str = {str(key): value for key, value in dict_2_centroid.items()}

dict_2_centroid_str = {key:float(value) for key, value in dict_2_centroid_str.items()}

with open("data/centroid_Topics.json", "w") as file:
    json.dump(dict_2_centroid_str, file)
    
# Nearest Generated Sample
dict_2_nearest_str = {str(key): value for key, value in dict_2_nearest.items()}

dict_2_nearest_str = {key:float(value) for key, value in dict_2_nearest_str.items()}

with open("data/nearest_Topics.json", "w") as file:
    json.dump(dict_2_nearest_str, file)

In [74]:
# Convex Hull
def convert_to_json(obj):
    if isinstance(obj, tuple):
        return {'__tuple__': True, 'items': list(obj)}
    elif isinstance(obj, np.ndarray):
        return {'__ndarray__': True, 'n_component': obj.tolist()}
    return obj


dict_2_convex_str = {str(key): value for key, value in dict_2_convex.items()}

with open("data/convex_Topics.json", "w") as file:
    json.dump(dict_2_convex_str, file, default = convert_to_json)

In [75]:
# to retrieve the json file
with open("data/DPP_Topics.json", "r") as file:
    dict_2_DPP_json = json.load(file)
dict_2_DPP_json = {eval(key): value for key, value in dict_2_DPP_json.items()}

with open("data/centroid_Topics.json", "r") as file:
    dict_2_centroid_json = json.load(file)
dict_2_centroid_json = {eval(key): value for key, value in dict_2_centroid_json.items()}

with open("data/nearest_Topics.json", "r") as file:
    dict_2_nearest_json = json.load(file)
dict_2_nearest_json = {eval(key): value for key, value in dict_2_nearest_json.items()}

In [76]:
# to convert convex hull back to the original dictionary with tuples and numpy array, we can use a custom decoder function

def custom_decoder(obj):
    if '__tuple__' in obj:
        return tuple(obj['items'])
    elif '__ndarray__' in obj:
        return np.array(obj['n_component'])
    return obj

with open("data/convex_Topics.json", "r") as file:
    dict_2_convex_json = json.load(file)

# Convert tuples and NumPy arrays back to original format
dict_2_convex_json = {key: (value[0], value[1]['n_component']) for key, value in dict_2_convex_json.items()}

# we can clean it up further by summing all the values in the n_components to get the sum of information retained
dict_2_convex_json = {key: (value[0], sum(value[1])) for key, value in dict_2_convex_json.items()}

In [77]:
# DPP percent difference calculation
percent_diff = {}
for key, value in dict_2_DPP_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes zero-shot, few-shot, novel, unique, creative, critique-critique, design-expert, farfetched, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, critique-critique, design-expert, farfetched, human-1, human-2

Percent difference for data/ablation_topic_towels.csv:[20, 8, 13, 23, 1, 0, 21, 2, -2, 0]
Percent difference for data/ablation_topic_powder.csv:[5, 16, 10, 10, 5, -4, 14, 0, 0, 0]
Percent difference for data/ablation_topic_time.csv:[20, 8, 13, 16, 21, 14, 22, 25, 0, 0]
Percent difference for data/ablation_topic_exercise.csv:[6, 6, 3, 10, 6, -1, 2, 3, 6, 0]
Percent difference for data/ablation_topic_froth.csv:[8, 10, 6, 8, 7, 5, 8, 5, 0, 0]


In [78]:
# Nearest percent difference calculation
percent_diff = {}
for key, value in dict_2_nearest_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes zero-shot, few-shot, novel, unique, creative, critique-critique, design-expert, farfetched, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, critique-critique, design-expert, farfetched, human-1, human-2

Percent difference for data/ablation_topic_towels.csv:[-24, -15, -17, -24, -10, -8, -22, -9, 3, 0]
Percent difference for data/ablation_topic_powder.csv:[-7, -11, -10, -10, -7, 1, -12, -4, 0, 0]
Percent difference for data/ablation_topic_time.csv:[-16, -9, -11, -11, -16, -13, -17, -18, 1, 0]
Percent difference for data/ablation_topic_exercise.csv:[-4, -4, -2, -5, -4, -1, -1, -4, -2, 0]
Percent difference for data/ablation_topic_froth.csv:[-12, -13, -12, -11, -12, -9, -11, -9, -2, 0]


In [79]:
# Convex Hull percent difference calculation
percent_diff = {}
for key, value in dict_2_convex_json.items():
    key = eval(key)
    csv_file = key[0]
    value1 = value[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value1)

print("Note order goes zero-shot, few-shot, novel, unique, creative, critique-critique, design-expert, farfetched, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")

Note order goes zero-shot, few-shot, novel, unique, creative, critique-critique, design-expert, farfetched, human-1, human-2

Percent difference for data/ablation_topic_towels.csv:[-98, -94, -95, -98, -84, -81, -97, -87, 26, 0]
Percent difference for data/ablation_topic_powder.csv:[-72, -84, -77, -80, -77, -27, -85, -54, 32, 0]
Percent difference for data/ablation_topic_time.csv:[-95, -84, -86, -86, -95, -92, -95, -96, -12, 0]
Percent difference for data/ablation_topic_exercise.csv:[-46, -46, -45, -55, -54, -47, -37, -58, -19, 0]
Percent difference for data/ablation_topic_froth.csv:[-88, -87, -87, -86, -90, -79, -78, -80, -34, 0]


In [None]:
# Centroid Distance percent difference calculation
percent_diff = {}
for key, value in dict_2_centroid_json.items():
    csv_file = key[0]
    if csv_file not in percent_diff:
        percent_diff[csv_file] = []
    percent_diff[csv_file].append(value)

print("Note order goes zero-shot, few-shot, novel, unique, creative, critique-critique, design-expert, farfetched, human-1, human-2")
print("")
for csv_file, values in percent_diff.items():
    percent_changes = percent_change(values)
    print(f"Percent difference for {csv_file}:{percent_changes}")