In [3]:
import scipy.stats  
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp  
import itertools  

In [8]:
def read_data(input_data_filepath):
    # load data to pandas dataframe
    input_df = pd.read_csv(input_data_filepath)

    # Find all column/features with categorical value
    original_categorical_columns = []
    categorical_len_count = 0
    for col in input_df:
        # Do not process the value
        if len(input_df[col].unique()) <= 10:
            original_categorical_columns.append(col)
            categorical_len_count += len(input_df[col].unique())

    original_continuous_columns = list(set(input_df.columns.values.tolist()) - set(original_categorical_columns))

    return input_df

In [9]:
original_data = pd.read_csv("data/all_data/vae_data_processed/network_traffic.csv")
synthetic_data = read_data("syn_data/network_traffic.csv")

synthetic_data
# numeric_cols = synthetic_data.select_dtypes(include=[np.number]).columns  
# synthetic_data[numeric_cols] = synthetic_data[numeric_cols].astype(float)  

Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability
0,0.0,9.0,225083.0,0,1901.0,8.0,0.0,255
1,14447822.0,0.0,220194.0,0,14390350.0,0.0,380996.0,255
2,3082624.0,6.0,285116.0,0,4098360.0,8.0,891137.0,255
3,3862951.0,0.0,226166.0,0,3243851.0,4.0,67.0,255
4,3873186.0,36.0,0.0,9,4068804.0,0.0,0.0,255
...,...,...,...,...,...,...,...,...
453684,878.0,9.0,0.0,0,792.0,8.0,285794.0,255
453685,2384375.0,6.0,289172.0,0,4207258.0,12.0,54.0,255
453686,0.0,26.0,289482.0,0,4369478.0,0.0,501564.0,255
453687,2463419.0,0.0,292431.0,0,16547952.0,8.0,102.0,255


In [10]:
original_data

Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability
0,2018880.0,5.0,124974.0,0.0,3075568.0,7.0,329097.0,255.0
1,3060451.0,7.0,193464.0,0.0,3041142.0,7.0,261292.0,255.0
2,1824393.0,4.0,177426.0,0.0,2989099.0,7.0,237856.0,255.0
3,2239501.0,5.0,132391.0,0.0,4233590.0,10.0,372600.0,255.0
4,3018871.0,7.0,158834.0,0.0,3501113.0,8.0,302182.0,255.0
...,...,...,...,...,...,...,...,...
453684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,255.0
453685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,255.0
453686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,255.0
453687,13710092.0,34.0,1061918.0,0.0,13092896.0,33.0,1139305.0,255.0


# Single dim query

## Entropy (lower the better, ideal <0.05 for each dim)

In [14]:
# Calculate entropy  
def get_entropy(df):  
    return df.apply(lambda x: scipy.stats.entropy(x.value_counts()))  
  
# Calculate entropy for both dataframes  
original_entropy = get_entropy(original_data)  
synthetic_entropy = get_entropy(synthetic_data)  
  
# Calculate the difference in entropy  
entropy_difference = np.abs(original_entropy - synthetic_entropy)/original_entropy  
  
# Convert to DataFrame and transpose  
entropy_difference_df = pd.DataFrame(entropy_difference).transpose()  
entropy_difference_df.index = ['entropy_difference'] 
entropy_difference_df

Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability
entropy_difference,0.293825,0.314673,0.079242,0.009235,0.246496,0.317701,0.160495,0.255585


## Top-K (lower the better, ideal<0.1 for each dim)

In [15]:
# Top-K query  
K = 1000 
  
# Define a function to get top-K values  
def get_top_k_values(df, k):  
    return df.apply(lambda x: x.value_counts().index[:k].to_list())  
  
# Get top-K values from both dataframes  
original_top_k = get_top_k_values(original_data, K)   
synthetic_top_k = get_top_k_values(synthetic_data, K)  

recall_scores = []  
  
for a, b in zip(original_top_k, synthetic_top_k):  
    a_set = set(a)  
    b_set = set(b)  
    intersection = a_set & b_set  
    recall = len(intersection) / len(a_set)  
    recall_scores.append(recall)  
    
recall_scores

# Convert to DataFrame and transpose  
top_k_difference_df = pd.DataFrame([recall_scores], columns=original_data.columns)  
top_k_difference_df.index = ['top_k_recall']
top_k_difference_df['mean'] = top_k_difference_df.mean(axis=1)
top_k_difference_df

Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability,mean
top_k_recall,0.025,0.319444,0.038,1.0,0.034,0.315789,0.022,1.0,0.344279


In [16]:
synthetic_top_k

input-data-rate       [0.0, 612.0, 914.0, 701.0, 718.0, 415.0, 755.0...
input-load            [0.0, 9.0, 6.0, 13.0, 36.0, 45.0, 35.0, 21.0, ...
input-packet-rate     [0.0, 4.0, 11.0, 14.0, 1.0, 21.0, 12.0, 7.0, 6...
load-interval                                                    [0, 9]
output-data-rate      [0.0, 546.0, 507.0, 106.0, 338.0, 310.0, 298.0...
output-load           [0.0, 8.0, 12.0, 35.0, 4.0, 42.0, 5.0, 26.0, 3...
output-packet-rate    [0.0, 81.0, 71.0, 86.0, 65.0, 84.0, 75.0, 66.0...
reliability                                        [255, 253, 254, 252]
dtype: object

## Quantile (lower the better, ideal <0.1 for each dim)

In [18]:
# Create an empty Series to store the results  
ks_results = pd.Series(name='Max-quantile-difference')  
  
# Iterate over columns in original_data  
for column in original_data.columns:  
    # If the same column exists in synthetic_data  
    if column in synthetic_data.columns:  
        # Perform KS test  
        statistic, pvalue = ks_2samp(original_data[column], synthetic_data[column])  
        # Append the results to the ks_results series  
        ks_results[column] = statistic  

# Convert the Series to a DataFrame  
ks_results = ks_results.to_frame().T  
# Print the resulting dataframe  
ks_results

  ks_results = pd.Series(name='Max-quantile-difference')


Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability
Max-quantile-difference,0.280507,0.101865,0.18935,0.003807,0.240574,0.123677,0.364474,0.00447


In [21]:
# Concatenate all dataframes  
single_dim_result_df = pd.concat([ks_results, top_k_difference_df, entropy_difference_df])  
# Create a new column 'mean' and 'std' that contains the average value of each row  
single_dim_result_df['mean'] = single_dim_result_df.mean(axis=1) 
single_dim_result_df['std'] = single_dim_result_df.std(axis=1)
single_dim_result_df = single_dim_result_df.round(3)
single_dim_result_df
# all_values = single_dim_result_df.values.tolist()  
# print(all_values)

Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability,mean,std
Max-quantile-difference,0.281,0.102,0.189,0.004,0.241,0.124,0.364,0.004,0.164,0.121
top_k_recall,0.025,0.319,0.038,1.0,0.034,0.316,0.022,1.0,0.344,0.396
entropy_difference,0.294,0.315,0.079,0.009,0.246,0.318,0.16,0.256,0.21,0.108


# Cross-dim query (Use 2-dim)

## Cardinality (lower the better, ideal <0.1 for each dim)

In [22]:
def calculate_2d_cardinality(df):  
    # Get all combinations of 2 dimensions  
    combinations = list(itertools.combinations(df.columns, 2))  
  
    cardinalities = {}  
    for combo in combinations:  
        # Calculate 2-dimension cardinality for each combination  
        cardinalities[combo] = df.groupby(list(combo)).size().reset_index().rename(columns={0:'count'}).shape[0]  
          
    return cardinalities  
  
# Calculate 2-dimension cardinality for each df  
original_cardinality = calculate_2d_cardinality(original_data)  
syn_cardinality = calculate_2d_cardinality(synthetic_data)  
original_cardinality

# Calculate error rate  
error_rate = {key: (abs(syn_cardinality[key] - original_cardinality[key]) / original_cardinality[key]) for key in original_cardinality.keys()}  
error_rate
# # Create a DataFrame for the accuracy  
two_dim_cardinality_error_rate_df = pd.DataFrame.from_dict(error_rate, orient='index', columns=['cardinality_err_rate']).T  
two_dim_cardinality_error_rate_df

Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-data-rate)","(load-interval, output-load)","(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)"
cardinality_err_rate,0.093445,0.523656,0.026207,0.551707,0.073034,0.590117,0.065664,0.269649,0.470588,0.055322,...,0.032377,0.483871,0.457657,0.6,0.078999,0.539889,0.072077,0.338136,0.088608,0.556998


## L2 norm (lower the better, ideal <0.05 for each dim)

In [24]:
from sklearn.preprocessing import MinMaxScaler    
  
def calculate_l2_norm_error_2_dim(original_df, syn_df):    
    scaler = MinMaxScaler()    
  
    # Getting all 2-dimensional combinations of columns  
    column_combinations = list(itertools.combinations(original_df.columns, 2))  
  
    error_dict = {}    
    for column_comb in column_combinations:    
        original_data_norm = scaler.fit_transform(original_df[list(column_comb)])    
        syn_data_norm = scaler.transform(syn_df[list(column_comb)])    
  
        l2_norm_original = np.linalg.norm(original_data_norm, 2)    
        l2_norm_syn = np.linalg.norm(syn_data_norm, 2)    
  
        l2_norm_error = abs(l2_norm_syn - l2_norm_original) / l2_norm_original    
        error_dict[column_comb] = l2_norm_error    
  
    error_df = pd.DataFrame.from_dict(error_dict, orient='index', columns=['l2_norm_error_2d']).T     
  
    return error_df    
  
l2_error_df = calculate_l2_norm_error_2_dim(original_data, synthetic_data)   
l2_error_df


Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-data-rate)","(load-interval, output-load)","(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)"
l2_norm_error_2d,0.184356,0.180997,0.013501,0.200444,0.18027,0.169497,0.001918,0.186488,0.013015,0.205634,...,0.013004,0.012972,0.013249,0.001533,0.218214,0.219465,0.002391,0.187621,0.00206,0.002042


## Frequency estimation (lower the better, ideal <0.1 for each dim)

In [25]:
def calculate_frequency_error(original_df, syn_df):    
    # Getting all 2-dimensional combinations of columns  
    column_combinations = list(itertools.combinations(original_df.columns, 2))  
  
    error_dict = {}    
    for column_comb in column_combinations:
        # calculate the frequencies of unique values in each pair of columns
        original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)  
        syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)  
  
        # Ensure both frequency distributions have the same index for comparison  
        all_index = original_freq.index.union(syn_freq.index)  
        original_freq = original_freq.reindex(all_index, fill_value=0)  
        syn_freq = syn_freq.reindex(all_index, fill_value=0)  
  
        # Calculate frequency estimation error, 
        # Divide by 2 is to account for over-counting, because the sum of differences in a distribution always sums up to 2  
        freq_error = np.abs(original_freq - syn_freq).sum() / 2  
        error_dict[column_comb] = freq_error  
  
    error_df =  pd.DataFrame.from_dict(error_dict, orient='index', columns=['freq_err_2d']).T  
  
    return error_df    
  
freq_error_df = calculate_frequency_error(original_data, synthetic_data)  
freq_error_df

Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-data-rate)","(load-interval, output-load)","(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)"
freq_err_2d,0.610356,0.730949,0.409801,0.771354,0.594097,0.842628,0.412269,0.497438,0.209747,0.588438,...,0.386664,0.192471,0.440913,0.004502,0.573614,0.814697,0.388378,0.598936,0.19507,0.441587


In [26]:
# Concatenate all dataframes  
cross_dim_result_df = pd.concat([two_dim_cardinality_error_rate_df, l2_error_df, freq_error_df])  
# Create a new column 'mean' and 'std' that contains the average value of each row  
cross_dim_result_df['mean'] = cross_dim_result_df.mean(axis=1) 
cross_dim_result_df['std'] = cross_dim_result_df.std(axis=1)  
cross_dim_result_df = cross_dim_result_df.round(2)
cross_dim_result_df

Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)",mean,std
cardinality_err_rate,0.09,0.52,0.03,0.55,0.07,0.59,0.07,0.27,0.47,0.06,...,0.46,0.6,0.08,0.54,0.07,0.34,0.09,0.56,0.31,0.2
l2_norm_error_2d,0.18,0.18,0.01,0.2,0.18,0.17,0.0,0.19,0.01,0.21,...,0.01,0.0,0.22,0.22,0.0,0.19,0.0,0.0,0.11,0.09
freq_err_2d,0.61,0.73,0.41,0.77,0.59,0.84,0.41,0.5,0.21,0.59,...,0.44,0.0,0.57,0.81,0.39,0.6,0.2,0.44,0.48,0.21


In [72]:
cross_all_values = cross_dim_result_df.values.tolist()  
print(cross_all_values)

[[0.08], [0.0], [0.44]]
