In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained sentence-embedding model; later cells call model.encode()
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [8]:
# Input CSVs, grouped by ablation family. The order of the combined list is
# the order in which files are processed and reported further down.
temp_top_p_csvs = [
    'data/ablation_tempTopP_froth.csv',
    'data/ablation_tempTopP_towels.csv',
    'data/ablation_tempTopP_time.csv',
    'data/ablation_tempTopP_powder.csv',
    'data/ablation_tempTopP_exercise.csv',
]
topic_csvs = [
    'data/ablation_topic_towels.csv',
    'data/ablation_topic_powder.csv',
    'data/ablation_topic_time.csv',
    'data/ablation_topic_exercise.csv',
    'data/ablation_topic_froth.csv',
]
csv_files = temp_top_p_csvs + topic_csvs

In [9]:
def calculate_pca_information_loss(embeddings, dimension):
    """Return the fraction of variance discarded by a PCA projection.

    A scikit-learn ``PCA`` keeping ``dimension`` components is fitted on
    ``embeddings``; the result is one minus the sum of the explained-variance
    ratios of the retained components.

    Args:
        embeddings: Data handed directly to ``PCA.fit``.
        dimension: Value passed as ``n_components`` to ``PCA``.

    Returns:
        ``1 - sum(explained_variance_ratio_)`` for the fitted PCA.
    """
    fitted = PCA(n_components=dimension).fit(embeddings)
    retained_share = np.sum(fitted.explained_variance_ratio_)
    return 1 - retained_share


Calculate PCA information loss for dimension n = 13.

In [15]:
# Calculate PCA information loss for each CSV file and each column (n=13 dimensions).
# Results are keyed by (csv_file, column).
information_loss_results_13 = {}

for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    for column in df.columns:
        # Drop missing cells before stringifying: astype(str) would otherwise
        # turn NaN into the literal text 'nan', which would then be embedded
        # and fed to PCA as if it were a real entry.
        sentences = df[column].dropna().astype(str).tolist()
        embeddings = model.encode(sentences)
        loss = calculate_pca_information_loss(embeddings, 13)
        information_loss_results_13[(csv_file, column)] = loss

In [16]:
# One report line per (file, column) pair, loss shown to four decimals (n=13)
for file, column in information_loss_results_13:
    loss = information_loss_results_13[(file, column)]
    print(f'Information loss for {file}, column {column}: {loss:.4f}')

Information loss for data/ablation_tempTopP_froth.csv, column TopP=0|Temperature=0: 0.3510
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0.5|Temperature=0: 0.3602
Information loss for data/ablation_tempTopP_froth.csv, column TopP=1|Temperature=0: 0.3625
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0|Temperature=1: 0.3769
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0.5|Temperature=1: 0.4142
Information loss for data/ablation_tempTopP_froth.csv, column TopP=1|Temperature=1: 0.4036
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0|Temperature=2: 0.3210
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0.5|Temperature=2: 0.3810
Information loss for data/ablation_tempTopP_froth.csv, column Human-50-v1: 0.3796
Information loss for data/ablation_tempTopP_froth.csv, column Human-50-v2: 0.3555
Information loss for data/ablation_tempTopP_towels.csv, column TopP=0|Temperature=0: 0.3548
Informatio

Calculate PCA information loss for dimension n = 20.

In [17]:
# Calculate PCA information loss for each CSV file and each column (n=20 dimensions).
# Results are keyed by (csv_file, column).
information_loss_results_20 = {}

for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    for column in df.columns:
        # Drop missing cells before stringifying: astype(str) would otherwise
        # turn NaN into the literal text 'nan', which would then be embedded
        # and fed to PCA as if it were a real entry.
        sentences = df[column].dropna().astype(str).tolist()
        embeddings = model.encode(sentences)
        loss = calculate_pca_information_loss(embeddings, 20)
        information_loss_results_20[(csv_file, column)] = loss

In [18]:
# One report line per (file, column) pair, loss shown to four decimals (n=20)
for file, column in information_loss_results_20:
    loss = information_loss_results_20[(file, column)]
    print(f'Information loss for {file}, column {column}: {loss:.4f}')

Information loss for data/ablation_tempTopP_froth.csv, column TopP=0|Temperature=0: 0.2150
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0.5|Temperature=0: 0.2193
Information loss for data/ablation_tempTopP_froth.csv, column TopP=1|Temperature=0: 0.2228
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0|Temperature=1: 0.2348
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0.5|Temperature=1: 0.2664
Information loss for data/ablation_tempTopP_froth.csv, column TopP=1|Temperature=1: 0.2576
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0|Temperature=2: 0.2002
Information loss for data/ablation_tempTopP_froth.csv, column TopP=0.5|Temperature=2: 0.2402
Information loss for data/ablation_tempTopP_froth.csv, column Human-50-v1: 0.2386
Information loss for data/ablation_tempTopP_froth.csv, column Human-50-v2: 0.2194
Information loss for data/ablation_tempTopP_towels.csv, column TopP=0|Temperature=0: 0.2178
Informatio

In [19]:
# Mean and median of the information loss across all (file, column) pairs, n=13
loss_values = [*information_loss_results_13.values()]
average_loss, median_loss = np.mean(loss_values), np.median(loss_values)

print(f'\nAverage information loss for n=13: {average_loss:.4f}')
print(f'Median information loss for n=13: {median_loss:.4f}')


Average information loss for n=13: 0.3866
Median information loss for n=13: 0.3918


In [20]:
# Mean and median of the information loss across all (file, column) pairs, n=20
loss_values = [*information_loss_results_20.values()]
average_loss, median_loss = np.mean(loss_values), np.median(loss_values)

print(f'\nAverage information loss for n=20: {average_loss:.4f}')
print(f'Median information loss for n=20: {median_loss:.4f}')


Average information loss for n=20: 0.2424
Median information loss for n=20: 0.2469
