In [1]:
import scipy.stats  
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp  
import itertools
from gputils.preproc import nvt_read_data
import dask.dataframe as dd
from dask.distributed import Client, get_client
from tqdm import tqdm
import cupyx.scipy.stats
import dask_cudf

In [6]:
original_files = '/home/azureuser/datadrive/traffic/'
syn_files = '/home/azureuser/datadrive/syn_traffic/*.csv'

original_data = dask_cudf.read_parquet(original_files).round(0).drop(columns='time')
synthetic_data = dask_cudf.read_csv(syn_files)

In [10]:
synthetic_data.head(10)

Unnamed: 0,input_data_rate,output_data_rate,input_load,output_load,input_packet_rate,output_packet_rate,reliability
0,2976866.0,3044390.0,29.0,29.0,245254.0,237932.0,252.0
1,3000602.0,3002950.0,27.0,31.0,253057.0,244706.0,252.0
2,3095399.0,3001909.0,31.0,29.0,254526.0,254934.0,252.0
3,2927678.0,2980436.0,31.0,29.0,251725.0,251310.0,252.0
4,3053583.0,2982080.0,31.0,32.0,244156.0,253437.0,252.0
5,3013629.0,3015571.0,30.0,32.0,249795.0,255210.0,252.0
6,2983374.0,3039787.0,29.0,31.0,243151.0,245052.0,252.0
7,2947870.0,3049738.0,28.0,29.0,248948.0,249912.0,252.0
8,3035790.0,3030820.0,30.0,32.0,247747.0,254608.0,252.0
9,3020942.0,2955278.0,28.0,30.0,248779.0,240569.0,252.0


In [11]:
original_data.head(10)

Unnamed: 0,input_data_rate,output_data_rate,input_load,output_load,input_packet_rate,output_packet_rate,reliability
0,3013586.0,2721197.0,26.0,11.0,238418.0,220117.0,250.0
1,2583630.0,2614132.0,52.0,44.0,346700.0,278477.0,251.0
2,3126824.0,3243400.0,7.0,52.0,332892.0,252264.0,252.0
3,2274835.0,3130986.0,9.0,52.0,174969.0,298775.0,253.0
4,2915212.0,2936739.0,3.0,20.0,192884.0,324708.0,250.0
5,3320341.0,3755739.0,0.0,59.0,228742.0,246624.0,254.0
6,3200723.0,2201120.0,9.0,41.0,262320.0,235836.0,251.0
7,3365179.0,3336088.0,13.0,27.0,209723.0,291895.0,253.0
8,2995232.0,3358239.0,10.0,13.0,293973.0,216518.0,251.0
9,2408799.0,3108842.0,53.0,49.0,264071.0,343479.0,250.0


In [7]:
synthetic_data['reliability'].unique().compute()

0    253.0
0    252.0
0    251.0
Name: reliability, dtype: float64

# Single dim query

## Entropy (lower the better, ideal <0.05 for each dim)

In [15]:
# Calculate entropy and top k.
def get_entropy_topk(ddf, k):  
    columns = list(ddf.columns)
    entropies = []
    top_ks = []
    for col in tqdm(columns):
        val_counts = ddf[col].value_counts().compute()
        entropy = cupyx.scipy.stats.entropy(val_counts.values)
        top_k = val_counts[:k].to_arrow().to_pylist()
        entropies.append(entropy)
        top_ks.append(top_k)
    return entropies, top_ks

K = 1000
# Calculate entropy for both dataframes  
original_entropy, original_top_k = get_entropy_topk(original_data, K)  
synthetic_entropy, synthetic_top_k = get_entropy_topk(synthetic_data, K)  

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [18:58<00:00, 162.58s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [15:20<00:00, 131.56s/it]


In [29]:
og_arr = np.array([x.item() for x in original_entropy])
syn_arr = np.array([x.item() for x in synthetic_entropy])
# Calculate the difference in entropy  
entropy_difference = np.abs(og_arr - syn_arr)/og_arr  
  
# Convert to DataFrame and transpose  
entropy_difference_df = pd.DataFrame(entropy_difference).transpose()  
entropy_difference_df.index = ['entropy_difference'] 
entropy_difference_df.columns = original_data.columns
entropy_difference_df

Unnamed: 0,input_data_rate,output_data_rate,input_load,output_load,input_packet_rate,output_packet_rate,reliability
entropy_difference,0.163247,0.148986,0.571055,0.566681,0.185447,0.195667,0.997828


## Top-K (lower the better, ideal<0.1 for each dim)

In [35]:
[len(x) for x in synthetic_top_k]

[1000, 1000, 15, 15, 1000, 1000, 3]

In [37]:
recall_scores = []  
  
for a, b in zip(original_top_k, synthetic_top_k):  
    a_set = set(a)  
    b_set = set(b)  
    intersection = a_set & b_set  
    recall = len(intersection) / len(a_set)  
    recall_scores.append(recall)  
    
recall_scores

# Convert to DataFrame and transpose  
top_k_difference_df = pd.DataFrame([recall_scores], columns=original_data.columns)  
top_k_difference_df.index = ['top_k_recall']
top_k_difference_df['mean'] = top_k_difference_df.mean(axis=1)
top_k_difference_df

Unnamed: 0,input_data_rate,output_data_rate,input_load,output_load,input_packet_rate,output_packet_rate,reliability,mean
top_k_recall,0.246777,0.244444,0.0,0.0,0.110345,0.115599,0.0,0.102452


In [16]:
synthetic_top_k

input-data-rate       [0.0, 612.0, 914.0, 701.0, 718.0, 415.0, 755.0...
input-load            [0.0, 9.0, 6.0, 13.0, 36.0, 45.0, 35.0, 21.0, ...
input-packet-rate     [0.0, 4.0, 11.0, 14.0, 1.0, 21.0, 12.0, 7.0, 6...
load-interval                                                    [0, 9]
output-data-rate      [0.0, 546.0, 507.0, 106.0, 338.0, 310.0, 298.0...
output-load           [0.0, 8.0, 12.0, 35.0, 4.0, 42.0, 5.0, 26.0, 3...
output-packet-rate    [0.0, 81.0, 71.0, 86.0, 65.0, 84.0, 75.0, 66.0...
reliability                                        [255, 253, 254, 252]
dtype: object

## Quantile (lower the better, ideal <0.1 for each dim)

In [7]:
# Create an empty Series to store the results  
ks_results = pd.Series(name='Max-quantile-difference')  
  
# Iterate over columns in original_data  
for column in original_data.columns:  
    # If the same column exists in synthetic_data  
    if column in synthetic_data.columns:  
        # Perform KS test  
        statistic, pvalue = ks_2samp(original_data[column], synthetic_data[column])  
        # Append the results to the ks_results series  
        ks_results[column] = statistic  

# Convert the Series to a DataFrame  
ks_results = ks_results.to_frame().T  
# Print the resulting dataframe  
ks_results

Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability
Max-quantile-difference,0.366449,0.30181,0.378308,0.015072,0.37751,0.316922,0.40273,0.000672


In [8]:
# Concatenate all dataframes  
single_dim_result_df = pd.concat([ks_results, top_k_difference_df, entropy_difference_df])  
# Create a new column 'mean' and 'std' that contains the average value of each row  
single_dim_result_df['mean'] = single_dim_result_df.mean(axis=1) 
single_dim_result_df['std'] = single_dim_result_df.std(axis=1)
single_dim_result_df = single_dim_result_df.round(3)
single_dim_result_df
# all_values = single_dim_result_df.values.tolist()  
# print(all_values)

Unnamed: 0,input-data-rate,input-load,input-packet-rate,load-interval,output-data-rate,output-load,output-packet-rate,reliability,mean,std
Max-quantile-difference,0.436,0.077,0.353,0.0,0.435,0.082,0.434,0.001,0.227,0.191
top_k_recall,0.013,0.167,0.015,1.0,0.0,0.145,0.006,1.0,0.293,0.413
entropy_difference,0.286,0.287,0.043,0.0,0.321,0.34,0.01,0.055,0.168,0.143


# Cross-dim query (Use 2-dim)

## Cardinality (lower the better, ideal <0.1 for each dim)

In [8]:
def calculate_2d_cardinality(df):  
    # Get all combinations of 2 dimensions  
    combinations = list(itertools.combinations(df.columns, 2))  
  
    cardinalities = {}  
    for combo in combinations:  
        # Calculate 2-dimension cardinality for each combination  
        cardinalities[combo] = df.groupby(list(combo)).size().reset_index().rename(columns={0:'count'}).shape[0]  
          
    return cardinalities  
  
# Calculate 2-dimension cardinality for each df  
original_cardinality = calculate_2d_cardinality(original_data)  
syn_cardinality = calculate_2d_cardinality(synthetic_data)  
original_cardinality

# Calculate error rate  
error_rate = {key: (abs(syn_cardinality[key] - original_cardinality[key]) / original_cardinality[key]) for key in original_cardinality.keys()}  
error_rate
# # Create a DataFrame for the accuracy  
two_dim_cardinality_error_rate_df = pd.DataFrame.from_dict(error_rate, orient='index', columns=['cardinality_err_rate']).T  
two_dim_cardinality_error_rate_df

Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-data-rate)","(load-interval, output-load)","(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)"
cardinality_err_rate,0.649644,0.671023,0.653315,0.66469,0.626234,0.697926,0.6334,0.13488,0.752941,0.643094,...,0.672746,0.698925,0.559249,0.6,0.676404,0.70449,0.653166,0.256255,1.101266,0.385698


## L2 norm (lower the better, ideal <0.05 for each dim)

In [11]:
from sklearn.preprocessing import MinMaxScaler    
  
def calculate_l2_norm_error_2_dim(original_df, syn_df):    
    scaler = MinMaxScaler()    
  
    # Getting all 2-dimensional combinations of columns  
    column_combinations = list(itertools.combinations(original_df.columns, 2))  
  
    error_dict = {}    
    for column_comb in column_combinations:    
        original_data_norm = scaler.fit_transform(original_df[list(column_comb)])    
        syn_data_norm = scaler.transform(syn_df[list(column_comb)])    
  
        l2_norm_original = np.linalg.norm(original_data_norm, 2)    
        l2_norm_syn = np.linalg.norm(syn_data_norm, 2)    
  
        l2_norm_error = abs(l2_norm_syn - l2_norm_original) / l2_norm_original    
        error_dict[column_comb] = l2_norm_error    
  
    error_df = pd.DataFrame.from_dict(error_dict, orient='index', columns=['l2_norm_error_2d']).T     
  
    return error_df    
  
l2_error_df = calculate_l2_norm_error_2_dim(original_data, synthetic_data)   
l2_error_df


Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-data-rate)","(load-interval, output-load)","(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)"
l2_norm_error_2d,0.253765,0.251134,0.034358,0.242509,0.239434,0.239807,0.001273,0.252432,0.034873,0.24445,...,0.034291,0.034795,0.03401,0.001377,0.245345,0.244757,0.001274,0.24064,0.001322,0.000927


## Frequency estimation (lower the better, ideal <0.1 for each dim)

In [9]:
def calculate_frequency_error(original_df, syn_df):    
    # Getting all 2-dimensional combinations of columns  
    column_combinations = list(itertools.combinations(original_df.columns, 2))  
  
    error_dict = {}    
    for column_comb in column_combinations:
        # calculate the frequencies of unique values in each pair of columns
        original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)  
        syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)  
  
        # Ensure both frequency distributions have the same index for comparison  
        all_index = original_freq.index.union(syn_freq.index)  
        original_freq = original_freq.reindex(all_index, fill_value=0)  
        syn_freq = syn_freq.reindex(all_index, fill_value=0)  
  
        # Calculate frequency estimation error, 
        # Divide by 2 is to account for over-counting, because the sum of differences in a distribution always sums up to 2  
        freq_error = np.abs(original_freq - syn_freq).sum() / 2  
        error_dict[column_comb] = freq_error  
  
    error_df =  pd.DataFrame.from_dict(error_dict, orient='index', columns=['freq_err_2d']).T  
  
    return error_df    
  
freq_error_df = calculate_frequency_error(original_data, synthetic_data)  
freq_error_df

  original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)
  syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)
  original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)
  syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)
  original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)
  syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)
  original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)
  syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)
  original_freq = pd.value_counts(original_df[list(column_comb)].values.flatten(), normalize=True)
  syn_freq = pd.value_counts(syn_df[list(column_comb)].values.flatten(), normalize=True)
  original_freq = pd.value_counts(original_df[list(column_co

Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-data-rate)","(load-interval, output-load)","(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)"
freq_err_2d,0.715598,0.822828,0.452636,0.892098,0.711538,0.83909,0.452972,0.640479,0.266741,0.72062,...,0.458867,0.26268,0.404514,0.007872,0.716558,0.844549,0.459203,0.651394,0.263016,0.404846


In [12]:
# Concatenate all dataframes  
cross_dim_result_df = pd.concat([two_dim_cardinality_error_rate_df, l2_error_df, freq_error_df])  
# Create a new column 'mean' and 'std' that contains the average value of each row  
cross_dim_result_df['mean'] = cross_dim_result_df.mean(axis=1) 
cross_dim_result_df['std'] = cross_dim_result_df.std(axis=1)  
cross_dim_result_df = cross_dim_result_df.round(2)
cross_dim_result_df

Unnamed: 0,"(input-data-rate, input-load)","(input-data-rate, input-packet-rate)","(input-data-rate, load-interval)","(input-data-rate, output-data-rate)","(input-data-rate, output-load)","(input-data-rate, output-packet-rate)","(input-data-rate, reliability)","(input-load, input-packet-rate)","(input-load, load-interval)","(input-load, output-data-rate)",...,"(load-interval, output-packet-rate)","(load-interval, reliability)","(output-data-rate, output-load)","(output-data-rate, output-packet-rate)","(output-data-rate, reliability)","(output-load, output-packet-rate)","(output-load, reliability)","(output-packet-rate, reliability)",mean,std
cardinality_err_rate,0.65,0.67,0.65,0.66,0.63,0.7,0.63,0.13,0.75,0.64,...,0.56,0.6,0.68,0.7,0.65,0.26,1.1,0.39,0.6,0.22
l2_norm_error_2d,0.25,0.25,0.03,0.24,0.24,0.24,0.0,0.25,0.03,0.24,...,0.03,0.0,0.25,0.24,0.0,0.24,0.0,0.0,0.14,0.11
freq_err_2d,0.72,0.82,0.45,0.89,0.71,0.84,0.45,0.64,0.27,0.72,...,0.4,0.01,0.72,0.84,0.46,0.65,0.26,0.4,0.55,0.22


In [13]:
cross_all_values = cross_dim_result_df.values.tolist()  
print(cross_all_values)

[[0.65, 0.67, 0.65, 0.66, 0.63, 0.7, 0.63, 0.13, 0.75, 0.64, 0.63, 0.25, 1.08, 0.48, 0.68, 0.19, 0.72, 0.29, 0.67, 0.7, 0.56, 0.6, 0.68, 0.7, 0.65, 0.26, 1.1, 0.39, 0.6, 0.22], [0.25, 0.25, 0.03, 0.24, 0.24, 0.24, 0.0, 0.25, 0.03, 0.24, 0.25, 0.24, 0.0, 0.03, 0.24, 0.24, 0.24, 0.0, 0.03, 0.03, 0.03, 0.0, 0.25, 0.24, 0.0, 0.24, 0.0, 0.0, 0.14, 0.11], [0.72, 0.82, 0.45, 0.89, 0.71, 0.84, 0.45, 0.64, 0.27, 0.72, 0.53, 0.66, 0.27, 0.39, 0.83, 0.64, 0.73, 0.39, 0.46, 0.26, 0.4, 0.01, 0.72, 0.84, 0.46, 0.65, 0.26, 0.4, 0.55, 0.22]]
