In [None]:
'''
    This notebook generates perturbed datasets for the statistical method.
'''

In [None]:
import pickle
import os
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from utils.encode_datasets import read_and_encode_dataset

from pure_ldp.frequency_oracles.local_hashing import LHClient, LHServer
from pure_ldp.frequency_oracles.unary_encoding import UEClient, UEServer
from mechanisms.exponential.exponential_mechanism import Exponential_mechanism

from pure_frequency_oracles.GRR import GRR_Client, GRR_Aggregator_MI
from pure_frequency_oracles.SS import SS_Client, SS_Aggregator_MI
from pure_frequency_oracles.HE import HE_Client, HE_Aggregator_MI
from privacy_leakage.attackers import attack_lh, attack_ss, attack_ue, attack_she

In [None]:
# Upper bound on the number of worker processes used to perturb attributes in parallel.
NUM_PROCESS = 1

# Privacy budgets (epsilon values); the generation loop produces one set of output files per entry.
EPS_ARRAY = [1, 2]

def exp_aggregator(reports, k):
    """Build a histogram of the reported indices.

    Parameters
    ----------
    reports : iterable
        Reported values; each one is used directly as an index into the
        histogram.
    k : int
        Length of the histogram (number of possible states).

    Returns
    -------
    numpy.ndarray
        Float array of length ``k`` whose entry ``s`` is the number of
        occurrences of ``s`` in ``reports``.
    """
    histogram = np.zeros(k)

    for reported_index in reports:
        # Explicit read-modify-write of a single histogram cell.
        histogram[reported_index] = histogram[reported_index] + 1

    return histogram

def data_generator(parameter):
    """Perturb one attribute with a local-DP mechanism and collect utility/attack data.

    The attribute column is read from the module-level ``data`` frame, so
    ``data`` must be assigned before this function runs (including inside any
    worker process that executes it).

    Parameters
    ----------
    parameter : tuple
        ``(attribute_name, eps, repetitions, mechanism)``. ``mechanism`` must be
        one of ``'olh'``, ``'blh'``, ``'oue'``, ``'rappor'``, ``'grr'``,
        ``'ss'``, ``'she'`` or ``'exp'``.

    Returns
    -------
    tuple
        ``(output_list, attribute_name, actual_freq, estimated_freq,
        attack_output_list)`` where

        * ``output_list`` holds one perturbed report per sample per repetition
          (for the local-hashing mechanisms only the first element of the
          privatised pair is stored);
        * ``actual_freq`` holds the true count of every distinct value, in
          ascending value order;
        * ``estimated_freq`` holds the estimated counts computed from the
          reports of the first repetition only;
        * ``attack_output_list`` holds one entry per report: the output of the
          matching ``attack_*`` function, or the report itself for ``'grr'``
          and ``'exp'``.

        ``actual_freq`` and ``estimated_freq`` stay empty when
        ``repetitions`` is 0.

    Raises
    ------
    ValueError
        If ``mechanism`` is not one of the supported names.
    """
    attribute_name, eps, repetitions, mechanism = parameter

    hashing_mechanisms = ('olh', 'blh')
    unary_mechanisms = ('oue', 'rappor')
    supported_mechanisms = hashing_mechanisms + unary_mechanisms + ('grr', 'ss', 'she', 'exp')
    # Fail fast: an unknown name would otherwise fall through every branch and
    # silently return empty results.
    if mechanism not in supported_mechanisms:
        raise ValueError(
            f"Unsupported mechanism {mechanism!r}; expected one of {supported_mechanisms}"
        )

    data_values = data[attribute_name].values
    # np.unique returns the distinct values already sorted ascending; with
    # return_counts=True it also yields the exact count of each value.
    alphabet, value_counts = np.unique(data_values, return_counts=True)
    state_count = len(alphabet)
    states = list(alphabet)
    sample_count = len(data_values)
    d = state_count

    # Resolve the mechanism family once instead of comparing strings per sample.
    uses_hashing = mechanism in hashing_mechanisms
    uses_unary = mechanism in unary_mechanisms

    if uses_hashing:
        use_olh = mechanism == 'olh'
        client = LHClient(epsilon=eps, d=d, use_olh=use_olh)
        server = LHServer(epsilon=eps, d=d, use_olh=use_olh)
        # Hash range handed to the attacker: round(e^eps) + 1 for OLH, 2 for BLH.
        g = int(np.round(np.exp(eps))) + 1 if use_olh else 2
    elif uses_unary:
        use_oue = mechanism == 'oue'
        client = UEClient(epsilon=eps, d=d, use_oue=use_oue)
        server = UEServer(epsilon=eps, d=d, use_oue=use_oue)
    elif mechanism == 'exp':
        # Uniform prior and 0/1 error matrix (0 on the diagonal, 1 elsewhere).
        prior_dist = np.ones((state_count))/state_count
        error_matrix = np.ones((state_count, state_count)) - np.identity(state_count)
        client = Exponential_mechanism(STATE_COUNT=state_count, INPUT_ALPHABET=states, prior_dist=prior_dist, normalized_objective_err_matrix=error_matrix)

    output_list = []
    attack_output_list = []

    actual_freq = []
    estimated_freq = []

    for repetition in range(repetitions):
        for value in data_values:

            if uses_hashing:
                # The client/server objects are fed value + 1 (matching estimate() below).
                priv_data = client.privatise(value+1)
                output_list.append(priv_data[0])
                server.aggregate(priv_data)
                attack_output_list.append(attack_lh(val_seed=priv_data, k=d, g=g))

            elif uses_unary:
                priv_data = client.privatise(value+1)
                output_list.append(priv_data)
                server.aggregate(priv_data)
                attack_output_list.append(attack_ue(ue_val=priv_data, k=d))

            elif mechanism == 'grr':
                priv_data = GRR_Client(input_data=value, k=d, epsilon=eps)
                output_list.append(priv_data)
                attack_output_list.append(priv_data)

            elif mechanism == 'ss':
                priv_data = SS_Client(input_data=value, k=d, epsilon=eps)
                output_list.append(priv_data)
                attack_output_list.append(attack_ss(priv_data))

            elif mechanism == 'she':
                priv_data = HE_Client(input_data=value, k=d, epsilon=eps)
                output_list.append(priv_data)
                attack_output_list.append(attack_she(y=priv_data, k=d, epsilon=eps))

            else:  # mechanism == 'exp' (validated above)
                priv_data = client.gen_random_output(actual_value=value, eps=eps)[0]
                output_list.append(priv_data)
                attack_output_list.append(priv_data)

        # Utility is measured once, on the reports of the first repetition only.
        if repetition == 0:
            if uses_hashing or uses_unary:
                estimated_freq = [server.estimate(val+1) for val in states]
            elif mechanism == 'grr':
                estimated_freq = list(GRR_Aggregator_MI(reports=output_list, k=d, epsilon=eps)*sample_count)
            elif mechanism == 'she':
                estimated_freq = list(HE_Aggregator_MI(reports=output_list, k=d, epsilon=eps, use_thresh=False)*sample_count)
            elif mechanism == 'ss':
                estimated_freq = list(SS_Aggregator_MI(reports=output_list, k=d, epsilon=eps)*sample_count)
            else:  # 'exp': raw histogram of the reported values
                estimated_freq = list(exp_aggregator(reports=output_list, k=d))

            # True counts as plain Python ints, aligned with `states`.
            actual_freq = value_counts.tolist()

    return output_list, attribute_name, actual_freq, estimated_freq, attack_output_list

In [None]:
# Number of perturbation repetitions per dataset.
r = 5
datasets = ['celeba']
num_repititions = {'celeba': r}
mechanisms = ["grr"]

for dataset_name in datasets:

    # `data` is deliberately module-level: data_generator reads it as a global.
    # NOTE(review): worker processes must be able to see this global (e.g. via
    # a fork-based start method) — confirm on the target platform.
    data, COLUMNS = read_and_encode_dataset(dataset_name=dataset_name)
    NUM_REPITTIONS = num_repititions[dataset_name]
    NUM_ATTRIBUTES = len(COLUMNS)

    for mechanism in mechanisms:
        file_location = f"perturbed_datasets_benchmark/{dataset_name}/{mechanism}"
        # Create the output directory up front so a fresh checkout does not
        # fail with FileNotFoundError before any work is done.
        os.makedirs(file_location, exist_ok=True)
        list_available_files = os.listdir(file_location)

        for eps in EPS_ARRAY:
            # One task per attribute; executor.map yields results in submission
            # order, so they line up with COLUMNS without matching by name.
            tasks = [(column, eps, NUM_REPITTIONS, mechanism) for column in COLUMNS]
            num_processes = max(1, min(NUM_PROCESS, NUM_ATTRIBUTES))

            with ProcessPoolExecutor(max_workers=num_processes) as executor:
                results_ = list(executor.map(data_generator, tasks))

            # Each result is (output_list, attribute_name, actual_freq,
            # estimated_freq, attack_output_list).
            # NOTE(review): final_perturbed_data_list is collected but never
            # written to disk here — confirm whether it should be saved.
            final_perturbed_data_list = [result[0] for result in results_]
            actual_frequencies_list = [result[2] for result in results_]
            estimated_frequencies_list = [result[3] for result in results_]

            # Stack attributes as columns: one row per report, one column per attribute.
            attack_output_list = np.transpose(np.array([result[4] for result in results_]))

            data_frame2 = pd.DataFrame(attack_output_list, columns=COLUMNS)
            data_frame2.to_csv(f'{file_location}/attack_{eps}.csv.zip', compression='zip', index=False)
            with open(f'{file_location}/{eps}_freq.pkl', 'wb') as file:
                pickle.dump({"actual_frequencies_list": actual_frequencies_list, "estimated_frequencies_list": estimated_frequencies_list}, file)
            