In [1]:
# Import required modules
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from wordcloud import WordCloud
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.special import rel_entr

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Use ggplot
matplotlib.style.use('ggplot')

features = ['target_material','target_thickness','pulse_width','energy','spot_size','intensity','power','cutoff_energy']
numeric_features = ['target_thickness', 'pulse_width', 'energy', 'spot_size', 'intensity', 'power', 'cutoff_energy']

# Load original dataset that was used to generate samples
df_original = pd.read_csv('../1_sample_preparation/source/d_clean_remove_small_samples.csv')
# Make sure required features are numeric
df_original[numeric_features] = df_original[numeric_features].astype(float)
df_original.reset_index(drop=True,inplace=True)

# Load synthetic data set
df_synthetic = pd.read_csv('../4_response_extraction/synthetic_data_rows.csv')
# Drop column
df_synthetic.drop(columns=['Unnamed: 0'],inplace=True)
# Make sure required features are numeric
df_synthetic[numeric_features] = df_synthetic[numeric_features].astype(float)
df_synthetic.reset_index(drop=True,inplace=True)

# Create a copy of the original DataFrame
df_synth_raw = df_synthetic.copy()

# Function to remove outliers for a specific feature
def remove_outliers(df, feature, lower_percentile, upper_percentile):
    Q1 = df[feature].quantile(lower_percentile)
    Q3 = df[feature].quantile(upper_percentile)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    return df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]

# Remove outliers for all numeric features
for nf in numeric_features:
    df_synth_raw = remove_outliers(df_synth_raw, nf, 0.005, 0.995)

print(f"Original DataFrame length: {len(df_synthetic)}")
print(f"DataFrame length after removing outliers 'RAW' for Inf values: {len(df_synth_raw)} - Rows lost: {len(df_synthetic) - len(df_synth_raw)}")

print("\ndf_original head:",df_original.head())
print("\ndf_original info:",df_original.info())
print("\ndf_original unique prompt_method: NA")
print("\ndf_original unique prompt_short: NA")
print("\ndf_original unique sample_size: NA")
print("\ndf_original unique target_material:",df_original['target_material'].unique())
print("\ndf_original unique model: NA")

print("\ndf_synth_raw head:",df_synth_raw.head())
print("\ndf_synth_raw info:",df_synth_raw.info())
print("\ndf_synth_raw unique prompt_method:",df_synth_raw['prompt_method'].unique())
print("\ndf_synth_raw unique prompt_short:",df_synth_raw['prompt_short'].unique())
print("\ndf_synth_raw unique sample_size:",df_synth_raw['sample_size'].unique())
print("\ndf_synth_raw unique target_material:",df_synth_raw['target_material'].unique())
print("\ndf_synth_raw unique model:",df_synth_raw['model'].unique())

df_synth_raw.groupby(['model','prompt_method','sample_size']).count()


Original DataFrame length: 163300
DataFrame length after removing outliers 'RAW' for Inf values: 159860 - Rows lost: 3440

df_original head:   target_material  target_thickness  pulse_width  energy  spot_size  \
0         plastic             0.537         30.0   2.427        3.3   
1         plastic             0.293         30.0   2.395        3.3   
2         plastic             0.610         30.0   2.425        3.3   
3         plastic             0.509         30.0   2.344        3.3   
4         plastic             0.527         30.0   2.351        3.3   

      intensity         power  cutoff_energy  
0  6.561000e+20  8.091000e+13            3.3  
1  6.473000e+20  7.983000e+13            3.4  
2  6.554000e+20  8.083000e+13            3.4  
3  6.335000e+20  7.813000e+13            3.4  
4  6.356000e+20  7.838000e+13            3.4  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prompt_short,target_material,target_thickness,pulse_width,energy,spot_size,intensity,power,cutoff_energy
model,prompt_method,sample_size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
claude-3-5-sonnet-20240620,chain_of_thought,rs_size_10,375,375,375,375,375,375,375,375,375
claude-3-5-sonnet-20240620,chain_of_thought,rs_size_100,375,375,375,375,375,375,375,375,375
claude-3-5-sonnet-20240620,chain_of_thought,rs_size_150,375,375,375,375,375,375,375,375,375
claude-3-5-sonnet-20240620,chain_of_thought,rs_size_25,375,375,375,375,375,375,375,375,375
claude-3-5-sonnet-20240620,chain_of_thought,rs_size_5,375,375,375,375,375,375,375,375,375
...,...,...,...,...,...,...,...,...,...,...,...
phi3:mini-128k,reverse_prompting,rs_size_25,2,2,2,2,2,2,2,2,2
phi3:mini-128k,self_consistency,rs_size_10,1,1,1,1,1,1,1,1,1
phi3:mini-128k,skeleton_of_thought,rs_size_5,1,1,1,1,1,1,1,1,1
phi3:mini-128k,step_back_prompting,rs_size_150,1,1,1,1,1,1,1,1,1


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import entropy
from scipy.stats import norm

# Selecting numerical features for KL Divergence
features = ['target_thickness', 'pulse_width', 'energy', 'spot_size', 'intensity', 'power', 'cutoff_energy']

def prepare_data(df_synthetic, df_original):
    # Ensure df_original has all columns from df_synthetic
    missing_cols = set(df_synthetic.columns) - set(df_original.columns)
    for col in missing_cols:
        df_original[col] = np.nan

    # Add 'source' column
    df_synthetic['source'] = 'synthetic'
    df_original['source'] = 'original'

    # Combine the synthetic DataFrame with the original DataFrame
    combined_df = pd.concat([df_synthetic, df_original], ignore_index=True)

    # Standardize the features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(combined_df[features])

    # Create a DataFrame with the scaled features
    scaled_df = pd.DataFrame(scaled_features, columns=features)

    # Add the source, model, prompt_method, and sample_size columns back to the scaled DataFrame
    scaled_df['source'] = combined_df['source'].values
    scaled_df['model'] = combined_df['model'].values
    scaled_df['prompt_method'] = combined_df['prompt_method'].values
    scaled_df['sample_size'] = combined_df['sample_size'].values

    return scaled_df


def calculate_multivariate_kl_divergence(p, q, epsilon=1e-10):
    p = p + epsilon
    q = q + epsilon
    p = p / np.sum(p)
    q = q / np.sum(q)
    return np.sum(p * (np.log(p) - np.log(q)))

def compute_multivariate_kl_divergence(df_synthetic, df_original):
    scaled_df = prepare_data(df_synthetic, df_original)

    kl_results = {}
    row_counts = {}

    for model_name in df_synthetic['model'].unique():
        for prompt_method in df_synthetic['prompt_method'].unique():
            for sample_size in df_synthetic['sample_size'].unique():
                key = (model_name, prompt_method, sample_size)
                synthetic_data = scaled_df[(scaled_df['model'] == model_name) & 
                                           (scaled_df['prompt_method'] == prompt_method) & 
                                           (scaled_df['sample_size'] == sample_size) & 
                                           (scaled_df['source'] == 'synthetic')]
                original_data = scaled_df[scaled_df['source'] == 'original']

                row_counts[key] = len(synthetic_data)

                # Extract feature values
                synthetic_values = synthetic_data[features].values
                original_values = original_data[features].values

                # Compute multivariate histograms
                bins = [10] * len(features)  # 10 bins for each feature
                hist_synthetic, _ = np.histogramdd(synthetic_values, bins=bins)
                hist_original, _ = np.histogramdd(original_values, bins=bins)

                # Calculate multivariate KL divergence
                kl_divergence = calculate_multivariate_kl_divergence(hist_synthetic, hist_original)
                kl_results[key] = kl_divergence

    kl_df = pd.DataFrame(list(kl_results.items()), columns=['key', 'kl_divergence'])
    kl_df[['model', 'prompt_method', 'sample_size']] = pd.DataFrame(kl_df['key'].tolist(), index=kl_df.index)
    kl_df['row_count'] = kl_df['key'].map(row_counts)
    kl_df = kl_df.drop('key', axis=1)

    # Add a penalty term based on sample size
    max_row_count = kl_df['row_count'].max()
    kl_df['sample_size_penalty'] = np.sqrt(max_row_count / kl_df['row_count'])

    # Calculate the adjusted KL divergence
    kl_df['adjusted_kl'] = kl_df['kl_divergence'] * kl_df['sample_size_penalty']

    kl_df_sorted = kl_df.sort_values(by='adjusted_kl', ascending=True)
    kl_df_sorted = kl_df_sorted.reset_index(drop=True)

    return kl_df_sorted

# Function to display top N results
def display_top_results(df, n=10):
    return df.head(n)

# Compute multivariate KL divergence for each synthetic dataset
kl_df_raw = compute_multivariate_kl_divergence(df_synth_raw, df_original)

# Print the results
print("Top 10 results for df_synth_raw:")
print(display_top_results(kl_df_raw))

Top 10 results for df_synth_raw:
   kl_divergence                       model  \
0       5.273548  claude-3-5-sonnet-20240620   
1       5.436909          gpt-3.5-turbo-0125   
2       5.510021  claude-3-5-sonnet-20240620   
3       5.673325  claude-3-5-sonnet-20240620   
4       5.866612    claude-3-sonnet-20240229   
5       6.943939  claude-3-5-sonnet-20240620   
6       6.842583    claude-3-sonnet-20240229   
7       7.070010          gpt-3.5-turbo-0125   
8       7.226937  claude-3-5-sonnet-20240620   
9       7.324370  claude-3-5-sonnet-20240620   

                         prompt_method  sample_size  row_count  \
0                        least_to_most    rs_size_5        375   
1  recursive_criticism_and_improvement   rs_size_10        373   
2                     self_consistency  rs_size_150        375   
3                  step_back_prompting  rs_size_150        374   
4                  generated_knowledge   rs_size_50        375   
5                  skeleton_of_thought  rs

In [6]:
kl_df_raw.head(30)

Unnamed: 0,kl_divergence,model,prompt_method,sample_size,row_count,sample_size_penalty,adjusted_kl
0,5.273548,claude-3-5-sonnet-20240620,least_to_most,rs_size_5,375,1.0,5.273548
1,5.436909,gpt-3.5-turbo-0125,recursive_criticism_and_improvement,rs_size_10,373,1.002677,5.451466
2,5.510021,claude-3-5-sonnet-20240620,self_consistency,rs_size_150,375,1.0,5.510021
3,5.673325,claude-3-5-sonnet-20240620,step_back_prompting,rs_size_150,374,1.001336,5.680905
4,5.866612,claude-3-sonnet-20240229,generated_knowledge,rs_size_50,375,1.0,5.866612
5,6.943939,claude-3-5-sonnet-20240620,skeleton_of_thought,rs_size_150,375,1.0,6.943939
6,6.842583,claude-3-sonnet-20240229,least_to_most,rs_size_5,362,1.017797,6.964363
7,7.07001,gpt-3.5-turbo-0125,step_back_prompting,rs_size_10,361,1.019206,7.205798
8,7.226937,claude-3-5-sonnet-20240620,generated_knowledge,rs_size_150,375,1.0,7.226937
9,7.32437,claude-3-5-sonnet-20240620,directional_stimuli,rs_size_10,374,1.001336,7.334155
