In [1]:
import scipy.stats  
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp  
import itertools  

In [2]:
def read_data(input_data_filepath):
    # load data to pandas dataframe
    input_df = pd.read_csv(input_data_filepath)

    # Find all column/features with categorical value
    original_categorical_columns = []
    categorical_len_count = 0
    for col in input_df:
        # Do not process the value
        if len(input_df[col].unique()) <= 10:
            original_categorical_columns.append(col)
            categorical_len_count += len(input_df[col].unique())

    original_continuous_columns = list(set(input_df.columns.values.tolist()) - set(original_categorical_columns))

    return input_df

In [3]:
original_data = pd.read_csv("data/toy_data/toy_syn_data.csv")
synthetic_data = read_data("syn_data/toy_syn_data.csv")

synthetic_data
# numeric_cols = synthetic_data.select_dtypes(include=[np.number]).columns  
# synthetic_data[numeric_cols] = synthetic_data[numeric_cols].astype(float)  

Unnamed: 0,categorical_0,continuous_0,continuous_1
0,9,101,99
1,9,100,100
2,9,100,102
3,9,98,100
4,9,99,99
...,...,...,...
995,10,101,100
996,9,101,99
997,10,101,98
998,10,99,99


In [4]:
original_data

Unnamed: 0,categorical_0,continuous_0,continuous_1
0,11,100,98
1,10,100,98
2,10,99,100
3,11,100,99
4,11,100,99
...,...,...,...
995,10,100,102
996,9,101,99
997,10,100,99
998,8,98,98


# Single dim query

## Entropy (lower the better, ideal <0.05 for each dim)

In [5]:
# Calculate entropy  
def get_entropy(df):  
    return df.apply(lambda x: scipy.stats.entropy(x.value_counts()))  
  
# Calculate entropy for both dataframes  
original_entropy = get_entropy(original_data)  
synthetic_entropy = get_entropy(synthetic_data)  
  
# Calculate the difference in entropy  
entropy_difference = np.abs(original_entropy - synthetic_entropy)/original_entropy  
  
# Convert to DataFrame and transpose  
entropy_difference_df = pd.DataFrame(entropy_difference).transpose()  
entropy_difference_df.index = ['entropy_difference'] 
entropy_difference_df

Unnamed: 0,categorical_0,continuous_0,continuous_1
entropy_difference,0.027262,0.216931,0.240456


## Top-K (lower the better, ideal<0.1 for each dim)

In [6]:
# Top-K query  
K = 1000 
  
# Define a function to get top-K values  
def get_top_k_values(df, k):  
    return df.apply(lambda x: x.value_counts().index[:k].to_list())  
  
# Get top-K values from both dataframes  
original_top_k = get_top_k_values(original_data, K)   
synthetic_top_k = get_top_k_values(synthetic_data, K)  

recall_scores = []  
  
for a, b in zip(original_top_k, synthetic_top_k):  
    a_set = set(a)  
    b_set = set(b)  
    intersection = a_set & b_set  
    recall = len(intersection) / len(a_set)  
    recall_scores.append(recall)  
    
recall_scores

# Convert to DataFrame and transpose  
top_k_difference_df = pd.DataFrame([recall_scores], columns=original_data.columns)  
top_k_difference_df.index = ['top_k_recall']
top_k_difference_df['mean'] = top_k_difference_df.mean(axis=1)
top_k_difference_df

Unnamed: 0,categorical_0,continuous_0,continuous_1,mean
top_k_recall,1.0,1.0,1.0,1.0


In [7]:
synthetic_top_k

categorical_0                      [10, 9, 8, 11]
continuous_0     [100, 99, 98, 101, 102, 97, 103]
continuous_1      [99, 100, 98, 101, 97, 102, 96]
dtype: object

## Quantile (lower the better, ideal <0.1 for each dim)

In [8]:
# Create an empty Series to store the results  
ks_results = pd.Series(name='Max-quantile-difference')  
  
# Iterate over columns in original_data  
for column in original_data.columns:  
    # If the same column exists in synthetic_data  
    if column in synthetic_data.columns:  
        # Perform KS test  
        statistic, pvalue = ks_2samp(original_data[column], synthetic_data[column])  
        # Append the results to the ks_results series  
        ks_results[column] = statistic  

# Convert the Series to a DataFrame  
ks_results = ks_results.to_frame().T  
# Print the resulting dataframe  
ks_results

  ks_results = pd.Series(name='Max-quantile-difference')


Unnamed: 0,categorical_0,continuous_0,continuous_1
Max-quantile-difference,0.026,0.099,0.1


In [9]:
# Concatenate all dataframes  
single_dim_result_df = pd.concat([ks_results, top_k_difference_df, entropy_difference_df])  
# Create a new column 'mean' and 'std' that contains the average value of each row  
single_dim_result_df['mean'] = single_dim_result_df.mean(axis=1) 
single_dim_result_df['std'] = single_dim_result_df.std(axis=1)
single_dim_result_df = single_dim_result_df.round(3)
single_dim_result_df
# all_values = single_dim_result_df.values.tolist()  
# print(all_values)

Unnamed: 0,categorical_0,continuous_0,continuous_1,mean,std
Max-quantile-difference,0.026,0.099,0.1,0.075,0.035
top_k_recall,1.0,1.0,1.0,1.0,0.0
entropy_difference,0.027,0.217,0.24,0.162,0.095


# Cross-dim query (Use 2-dim)

## Cardinality (lower the better, ideal <0.1 for each dim)

In [10]:
def calculate_2d_cardinality(df):  
    # Get all combinations of 2 dimensions  
    combinations = list(itertools.combinations(df.columns, 2))  
  
    cardinalities = {}  
    for combo in combinations:  
        # Calculate 2-dimension cardinality for each combination  
        cardinalities[combo] = df.groupby(list(combo)).size().reset_index().rename(columns={0:'count'}).shape[0]  
          
    return cardinalities  
  
# Calculate 2-dimension cardinality for each df  
original_cardinality = calculate_2d_cardinality(original_data)  
syn_cardinality = calculate_2d_cardinality(synthetic_data)  
original_cardinality

# Calculate error rate  
error_rate = {key: (abs(syn_cardinality[key] - original_cardinality[key]) / original_cardinality[key]) for key in original_cardinality.keys()}  
error_rate
# # Create a DataFrame for the accuracy  
two_dim_cardinality_error_rate_df = pd.DataFrame.from_dict(error_rate, orient='index', columns=['cardinality_err_rate']).T  
two_dim_cardinality_error_rate_df

Unnamed: 0,"(categorical_0, continuous_0)","(categorical_0, continuous_1)","(continuous_0, continuous_1)"
cardinality_err_rate,0.12,0.217391,0.382353


## L2 norm (lower the better, ideal <0.05 for each dim)

In [11]:
from sklearn.preprocessing import MinMaxScaler    
  
def calculate_l2_norm_error_2_dim(original_df, syn_df):    
    scaler = MinMaxScaler()    
  
    # Getting all 2-dimensional combinations of columns  
    column_combinations = list(itertools.combinations(original_df.columns, 2))  
  
    error_dict = {}    
    for column_comb in column_combinations:    
        original_data_norm = scaler.fit_transform(original_df[list(column_comb)])    
        syn_data_norm = scaler.transform(syn_df[list(column_comb)])    
  
        l2_norm_original = np.linalg.norm(original_data_norm, 2)    
        l2_norm_syn = np.linalg.norm(syn_data_norm, 2)    
  
        l2_norm_error = abs(l2_norm_syn - l2_norm_original) / l2_norm_original    
        error_dict[column_comb] = l2_norm_error    
  
    error_df = pd.DataFrame.from_dict(error_dict, orient='index', columns=['l2_norm_error_2d']).T     
  
    return error_df    
  
l2_error_df = calculate_l2_norm_error_2_dim(original_data, synthetic_data)   
l2_error_df


Unnamed: 0,"(categorical_0, continuous_0)","(categorical_0, continuous_1)","(continuous_0, continuous_1)"
l2_norm_error_2d,0.034336,0.001767,0.014733


## Frequency estimation (lower the better, ideal <0.1 for each dim)

In [12]:
def calculate_frequency_error(original_df, syn_df):    
    # Getting all 2-dimensional combinations of columns  
    column_combinations = list(itertools.combinations(original_df.columns, 2))  
  
    error_dict = {}    
    for column_comb in column_combinations:
        # calculate the frequencies of unique values in each pair of columns
        original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)  
        syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)  
  
        # Ensure both frequency distributions have the same index for comparison  
        all_index = original_freq.index.union(syn_freq.index)  
        original_freq = original_freq.reindex(all_index, fill_value=0)  
        syn_freq = syn_freq.reindex(all_index, fill_value=0)  
  
        # Calculate frequency estimation error, 
        # Divide by 2 is to account for over-counting, because the sum of differences in a distribution always sums up to 2  
        freq_error = np.abs(original_freq - syn_freq).sum() / 2  
        error_dict[column_comb] = freq_error  
  
    error_df =  pd.DataFrame.from_dict(error_dict, orient='index', columns=['freq_err_2d']).T  
  
    return error_df    
  
freq_error_df = calculate_frequency_error(original_data, synthetic_data)  
freq_error_df

Unnamed: 0,"(categorical_0, continuous_0)","(categorical_0, continuous_1)","(continuous_0, continuous_1)"
freq_err_2d,0.104,0.1035,0.1595


In [26]:
# Concatenate all dataframes  
cross_dim_result_df = pd.concat([two_dim_cardinality_error_rate_df, l2_error_df, freq_error_df])  
# Create a new column 'mean' and 'std' that contains the average value of each row  
cross_dim_result_df['mean'] = cross_dim_result_df.mean(axis=1) 
cross_dim_result_df['std'] = cross_dim_result_df.std(axis=1)  
cross_dim_result_df = cross_dim_result_df.round(2)
cross_dim_result_df

Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)",mean,std
cardinality_err_rate,0.09,0.52,0.03,0.55,0.07,0.59,0.07,0.27,0.47,0.06,...,0.46,0.6,0.08,0.54,0.07,0.34,0.09,0.56,0.31,0.2
l2_norm_error_2d,0.18,0.18,0.01,0.2,0.18,0.17,0.0,0.19,0.01,0.21,...,0.01,0.0,0.22,0.22,0.0,0.19,0.0,0.0,0.11,0.09
freq_err_2d,0.61,0.73,0.41,0.77,0.59,0.84,0.41,0.5,0.21,0.59,...,0.44,0.0,0.57,0.81,0.39,0.6,0.2,0.44,0.48,0.21


In [72]:
cross_all_values = cross_dim_result_df.values.tolist()  
print(cross_all_values)

[[0.08], [0.0], [0.44]]
