In [None]:
import random
import warnings
from itertools import combinations, product, islice, chain
from math import log, ceil
from multiprocessing.pool import Pool
from enum import Enum
import numpy as np
from string import ascii_lowercase
import pandas as pd
from pandas import Series, DataFrame
from pandas import merge
from scipy.optimize import fsolve
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score
from abc import ABCMeta, abstractmethod
from bisect import bisect_right
from random import uniform
from typing import List, Union
from numpy.random import choice
import json
from pathlib import Path
from typing import Dict, List, Union
from numpy import array_equal
from pandas import DataFrame, read_csv

In [None]:
class DataType(Enum):
    """Closed set of data types an attribute (column) can be described with.

    The member values are the string labels written by ``to_json`` and read
    back by ``parse_json``.
    """
    INTEGER = 'Integer'
    FLOAT = 'Float'
    STRING = 'String'
    DATETIME = 'DateTime'
    SOCIAL_SECURITY_NUMBER = 'SocialSecurityNumber'

In [None]:
def pairwise_attributes_mutual_information(dataset):
    """Return a square DataFrame of normalized mutual information between columns.

    Both the index and the columns of the result are the dataset's column
    names in sorted order. Every column is cast to ``str`` before scoring, and
    the score uses the arithmetic averaging method.
    """
    attribute_names = sorted(dataset.columns)
    mi_df = DataFrame(columns=attribute_names, index=attribute_names, dtype=float)
    # Fill every (row, col) cell, including the diagonal and both symmetric halves.
    for first, second in product(mi_df.columns, repeat=2):
        score = normalized_mutual_info_score(dataset[first].astype(str),
                                             dataset[second].astype(str),
                                             average_method='arithmetic')
        mi_df.loc[first, second] = score
    return mi_df
def read_json_file(json_file):
    """Load a JSON file and return the decoded Python object.

    Parameters
    ----------
    json_file : str or path-like
        Location of the JSON file to read.

    The file is opened explicitly as UTF-8 so that the result does not depend
    on the platform's default text encoding.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)
    
def infer_numerical_attributes_in_dataframe(dataframe):
    """Return the set of column names that ``DataFrame.describe()`` summarizes as numerical.

    The decision relies on the shape of the summary: an 8-row summary is taken
    to be the numerical one, anything else is treated as "no numerical
    attribute" and yields an empty set.
    """
    summary = dataframe.describe()
    has_numerical_summary = summary.shape[0] == 8
    return set(summary.columns) if has_numerical_summary else set()
def generate_random_string(length):
    """Return a string of ``length`` lowercase ASCII letters drawn with ``np.random.choice``."""
    alphabet = list(ascii_lowercase)
    letters = np.random.choice(alphabet, size=length)
    return ''.join(letters)
def set_random_seed(seed=0):
    """Seed Python's ``random`` module and NumPy's global generator with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
def mutual_information(labels_x: Series, labels_y: DataFrame):
    """Mutual information between one labelling and one or more label columns.

    Parameters
    ----------
    labels_x : Series
        Labels of the first variable.
    labels_y : DataFrame
        One or more label columns. A single column is used as is; several
        columns are merged row by row into one space-separated label, so their
        values must be strings.

    Returns the result of ``sklearn.metrics.mutual_info_score``.
    """
    if labels_y.shape[1] == 1:
        combined_labels = labels_y.iloc[:, 0]
    else:
        combined_labels = labels_y.apply(lambda row: ' '.join(row.values), axis=1)

    return mutual_info_score(labels_x, combined_labels)
def normalize_given_distribution(frequencies):
    """Turn non-negative weights into a probability distribution.

    Parameters
    ----------
    frequencies : array-like
        Weights or counts; negative entries are treated as 0.

    Returns
    -------
    numpy.ndarray
        Float array of the same length. If all weights are 0 (or their sum is
        not positive) a uniform distribution is returned; if some weights are
        infinite, the mass is split evenly between the infinite entries. An
        empty input yields an empty array instead of dividing by zero.
    """
    distribution = np.array(frequencies, dtype=float)
    if distribution.size == 0:
        # Nothing to normalize; the uniform fallback below would divide by zero.
        return distribution
    distribution = distribution.clip(0)  # replace negative values with 0
    summation = distribution.sum()
    if summation > 0:
        if np.isinf(summation):
            # Keep only the infinite entries (as 1.0) and normalize those.
            return normalize_given_distribution(np.isinf(distribution))
        return distribution / summation
    return np.full_like(distribution, 1 / distribution.size)


The format in which the information about each attribute is stored.

In [None]:
class AbstractAttribute(object):
    """Base description of a single dataset column (attribute).

    Holds the raw data, its missing rate, and a 1-D distribution expressed as
    ``distribution_bins`` plus ``distribution_probabilities``. Subclasses set
    ``is_numerical`` and ``data_type`` and may override the domain and
    distribution inference.
    """
    # NOTE: Python 3 ignores ``__metaclass__``, so ``@abstractmethod`` below is
    # not enforced and this class can still be instantiated directly.
    __metaclass__ = ABCMeta

    def __init__(self, name: str, is_candidate_key, is_categorical, histogram_size: Union[int, str], data: Series):
        self.name = name
        self.is_candidate_key = is_candidate_key
        self.is_categorical = is_categorical
        self.histogram_size: Union[int, str] = histogram_size
        self.data: Series = data
        self.data_dropna: Series = self.data.dropna()
        # ``or 1`` avoids a division by zero for an empty column.
        self.missing_rate: float = (self.data.size - self.data_dropna.size) / (self.data.size or 1)

        self.is_numerical: bool = None
        self.data_type: DataType = None
        self.min = None
        self.max = None
        self.distribution_bins: np.ndarray = None
        self.distribution_probabilities: np.ndarray = None

    @abstractmethod
    def infer_domain(self, categorical_domain: List = None, numerical_range: List = None):
        """Infer the domain: ``min``, ``max`` and a uniform 1-D distribution.

        Parameters
        ----------
        categorical_domain : list, optional
            Explicit list of categories; takes precedence over everything else.
        numerical_range : list, optional
            Explicit ``[min, max]`` pair, used when no categorical domain is given.

        When neither is given, the domain is taken from the non-missing data.
        """
        if categorical_domain:
            self.min = min(categorical_domain)
            self.max = max(categorical_domain)
            self.distribution_bins = np.array(categorical_domain)
        elif numerical_range:
            self.min, self.max = numerical_range
            self.distribution_bins = np.array([self.min, self.max])
        else:
            self.min = float(self.data_dropna.min())
            self.max = float(self.data_dropna.max())
            if self.is_categorical:
                self.distribution_bins = self.data_dropna.unique()
            else:
                self.distribution_bins = np.array([self.min, self.max])

        # Force a float dtype: inheriting the bins' dtype would truncate the
        # probability to 0 for integer bins or turn it into a string.
        self.distribution_probabilities = np.full_like(self.distribution_bins,
                                                       1 / self.distribution_bins.size,
                                                       dtype=float)

    @abstractmethod
    def infer_distribution(self):
        """Estimate the 1-D distribution from the non-missing data.

        Categorical attributes use value counts (categories of the domain that
        never occur get a count of 0); other attributes use a histogram over
        ``[min, max]`` whose left bin edges become ``distribution_bins``.
        """
        if self.is_categorical:
            distribution = self.data_dropna.value_counts()
            for value in set(self.distribution_bins) - set(distribution.index):
                distribution[value] = 0
            distribution.sort_index(inplace=True)
            self.distribution_probabilities = normalize_given_distribution(distribution)
            self.distribution_bins = np.array(distribution.index)
        else:
            distribution = np.histogram(self.data_dropna, bins=self.histogram_size, range=(self.min, self.max))
            self.distribution_bins = distribution[1][:-1]  # Remove the last bin edge
            self.distribution_probabilities = normalize_given_distribution(distribution[0])

    def inject_laplace_noise(self, epsilon, num_valid_attributes):
        """Add Laplace noise to the probabilities and re-normalize them.

        Does nothing unless ``epsilon`` is positive. The privacy budget
        ``epsilon`` is split evenly across ``num_valid_attributes``.
        """
        if epsilon > 0:
            sensitivity = 2 / self.data.size
            privacy_budget = epsilon / num_valid_attributes
            noise_scale = sensitivity / privacy_budget
            laplace_noises = np.random.laplace(0, scale=noise_scale, size=len(self.distribution_probabilities))
            noisy_distribution = self.distribution_probabilities + laplace_noises
            self.distribution_probabilities = normalize_given_distribution(noisy_distribution)

    def encode_values_into_bin_idx(self):
        """Encode values into bin indices for Bayesian Network construction.

        Missing values are encoded as ``len(self.distribution_bins)``, i.e. one
        past the last real bin index.
        """
        if self.is_categorical:
            value_to_bin_idx = {value: idx for idx, value in enumerate(self.distribution_bins)}
            encoded = self.data.map(lambda x: value_to_bin_idx[x], na_action='ignore')
        else:
            encoded = self.data.map(lambda x: bisect_right(self.distribution_bins, x) - 1, na_action='ignore')

        encoded.fillna(len(self.distribution_bins), inplace=True)
        return encoded.astype(int, copy=False)

    def to_json(self):
        """Encode the attribute information as a JSON-compatible dictionary."""
        return {"name": self.name,
                "data_type": self.data_type.value,
                "is_categorical": self.is_categorical,
                "is_candidate_key": self.is_candidate_key,
                "min": self.min,
                "max": self.max,
                "missing_rate": self.missing_rate,
                "distribution_bins": self.distribution_bins.tolist(),
                "distribution_probabilities": self.distribution_probabilities.tolist()}

    @abstractmethod
    def generate_values_as_candidate_key(self, n):
        """Return ``n`` distinct values usable as a candidate key (``0 .. n-1``)."""
        return np.arange(n)

    def sample_binning_indices_in_independent_attribute_mode(self, n):
        """Sample ``n`` bin indices according to ``distribution_probabilities``."""
        return Series(choice(len(self.distribution_probabilities), size=n, p=self.distribution_probabilities))

    @abstractmethod
    def sample_values_from_binning_indices(self, binning_indices):
        """Convert bin indices into values of the domain.

        Used by both the independent and the correlated attribute mode.
        """
        return binning_indices.apply(lambda x: self.uniform_sampling_within_a_bin(x))

    def uniform_sampling_within_a_bin(self, bin_idx: int):
        """Return one value for ``bin_idx``.

        The index one past the last bin stands for a missing value and yields
        NaN. Categorical bins return their category; numerical bins return a
        value drawn uniformly between the bin's edges.
        """
        num_bins = len(self.distribution_bins)
        if bin_idx == num_bins:
            return np.nan
        elif self.is_categorical:
            return self.distribution_bins[bin_idx]
        elif bin_idx < num_bins - 1:
            return uniform(self.distribution_bins[bin_idx], self.distribution_bins[bin_idx + 1])
        else:
            # Last interval: its right edge is not stored in distribution_bins,
            # so self.max is used. Indexing [-1] also works with a single bin.
            return uniform(self.distribution_bins[-1], self.max)

In [None]:
class FloatAttribute(AbstractAttribute):
    """Numerical attribute holding floating point values."""

    def __init__(self, name: str, is_candidate_key, is_categorical, histogram_size: Union[int, str], data: Series):
        super().__init__(name, is_candidate_key, is_categorical, histogram_size, data)
        self.is_numerical = True
        self.data_type = DataType.FLOAT

    def infer_domain(self, categorical_domain=None, numerical_range=None):
        """Delegate to the base-class domain inference."""
        super().infer_domain(categorical_domain, numerical_range)

    def infer_distribution(self):
        """Delegate to the base-class distribution inference."""
        super().infer_distribution()

    def generate_values_as_candidate_key(self, n):
        """Return values stepping from ``min`` towards ``max`` in steps of ``(max - min) / n``."""
        # The bare name ``arange`` is not imported in this file; use NumPy's.
        return np.arange(self.min, self.max, (self.max - self.min) / n)

    def sample_values_from_binning_indices(self, binning_indices):
        """Delegate to the base-class uniform sampling within bins."""
        return super().sample_values_from_binning_indices(binning_indices)
class IntegerAttribute(AbstractAttribute):
    """Numerical attribute holding integer values."""

    def __init__(self, name: str, is_candidate_key, is_categorical, histogram_size: Union[int, str], data: Series):
        super().__init__(name, is_candidate_key, is_categorical, histogram_size, data)
        self.data_type = DataType.INTEGER
        self.is_numerical = True

    def infer_domain(self, categorical_domain=None, numerical_range=None):
        """Infer the domain via the base class, then cast ``min`` and ``max`` to ``int``."""
        super().infer_domain(categorical_domain, numerical_range)
        self.min = int(self.min)
        self.max = int(self.max)

    def infer_distribution(self):
        """Delegate to the base-class distribution inference."""
        super().infer_distribution()

    def generate_values_as_candidate_key(self, n):
        """Delegate to the base class, which returns ``0 .. n-1``."""
        return super().generate_values_as_candidate_key(n)

    def sample_values_from_binning_indices(self, binning_indices):
        """Sample values within bins and round them; missing values stay missing."""
        sampled = super().sample_values_from_binning_indices(binning_indices).round()
        present = ~sampled.isnull()
        sampled[present] = sampled[present].astype(int)
        return sampled
def is_datetime(value: str):
    """Tell whether a string can be parsed as a datetime.

    Weekday and month names (full or abbreviated, case-insensitive) are
    treated as categorical values and therefore reported as not a datetime.
    Strings that parse to a missing timestamp (for example an empty string)
    are also reported as not a datetime.
    """
    weekdays = {'mon', 'monday', 'tue', 'tuesday', 'wed', 'wednesday', 'thu', 'thursday', 'fri', 'friday',
                'sat', 'saturday', 'sun', 'sunday'}
    months = {'jan', 'january', 'feb', 'february', 'mar', 'march', 'apr', 'april', 'may', 'jun', 'june',
              'jul', 'july', 'aug', 'august', 'sep', 'sept', 'september', 'oct', 'october', 'nov', 'november',
              'dec', 'december'}

    value_lower = value.lower()
    if value_lower in weekdays or value_lower in months:
        return False
    # The name ``parse`` is never imported in this file, and the former bare
    # ``except`` hid the resulting NameError, so every value was rejected.
    # pandas (already imported) does the parsing instead.
    try:
        parsed = pd.Timestamp(value)
    except (ValueError, TypeError, OverflowError):
        return False
    return not pd.isna(parsed)


# TODO detect datetime formats
class DateTimeAttribute(AbstractAttribute):
    """Datetime attribute, modelled numerically as whole seconds since 1970-01-01.

    ``self.timestamps`` holds the converted non-missing values; ``min`` and
    ``max`` are expressed in the same unit.
    """

    def __init__(self, name: str, is_candidate_key, is_categorical, histogram_size: Union[int, str], data: Series):
        super().__init__(name, is_candidate_key, is_categorical, histogram_size, data)
        self.is_numerical = True
        self.data_type = DataType.DATETIME
        # ``parse`` is not imported in this file; pandas does the parsing.
        epoch_datetime = pd.Timestamp('1970-01-01')
        self.timestamps = self.data_dropna.map(lambda x: int((pd.Timestamp(x) - epoch_datetime).total_seconds()))

    def infer_domain(self, categorical_domain=None, numerical_range=None):
        """Infer ``min``/``max`` (in seconds) and a uniform distribution.

        ``numerical_range`` takes precedence when given; ``categorical_domain``
        is accepted for interface compatibility but not used here.
        """
        if numerical_range:
            self.min, self.max = numerical_range
            self.distribution_bins = np.array([self.min, self.max])
        else:
            self.min = float(self.timestamps.min())
            self.max = float(self.timestamps.max())
            if self.is_categorical:
                self.distribution_bins = self.data_dropna.unique()
            else:
                self.distribution_bins = np.array([self.min, self.max])

        # Force a float dtype so the probability is not cast to the bins' dtype.
        self.distribution_probabilities = np.full_like(self.distribution_bins,
                                                       1 / self.distribution_bins.size,
                                                       dtype=float)

    def infer_distribution(self):
        """Estimate the distribution from value counts (categorical) or a timestamp histogram."""
        if self.is_categorical:
            distribution = self.data_dropna.value_counts()
            for value in set(self.distribution_bins) - set(distribution.index):
                distribution[value] = 0
            distribution.sort_index(inplace=True)
            self.distribution_probabilities = normalize_given_distribution(distribution)
            self.distribution_bins = np.array(distribution.index)
        else:
            distribution = np.histogram(self.timestamps, bins=self.histogram_size, range=(self.min, self.max))
            # Keep the left bin edges so bins and probabilities have the same
            # length, as the base class does for numerical attributes.
            self.distribution_bins = distribution[1][:-1]
            self.distribution_probabilities = normalize_given_distribution(distribution[0])

    def encode_values_into_bin_idx(self):
        """Encode values into bin indices for Bayesian Network construction.

        Missing values are encoded as ``len(self.distribution_bins)``.
        """
        if self.is_categorical:
            value_to_bin_idx = {value: idx for idx, value in enumerate(self.distribution_bins)}
            encoded = self.data.map(lambda x: value_to_bin_idx[x], na_action='ignore')
        else:
            encoded = self.timestamps.map(lambda x: bisect_right(self.distribution_bins, x) - 1, na_action='ignore')
            # timestamps only covers non-missing rows; realign to the full
            # column so missing rows appear as NaN and get filled below.
            encoded = encoded.reindex(self.data.index)

        encoded.fillna(len(self.distribution_bins), inplace=True)
        return encoded.astype(int, copy=False)

    def generate_values_as_candidate_key(self, n):
        """Return values stepping from ``min`` towards ``max`` in steps of ``(max - min) / n``."""
        # The step must be positive; (min - max) / n produced an empty array.
        return np.arange(self.min, self.max, (self.max - self.min) / n)

    def sample_values_from_binning_indices(self, binning_indices):
        """Sample values within bins; numerical samples are truncated to whole seconds."""
        column = super().sample_values_from_binning_indices(binning_indices)
        if not self.is_categorical:
            column[~column.isnull()] = column[~column.isnull()].astype(int)
        return column
# Attribute class for string columns.
class StringAttribute(AbstractAttribute):
    """String attribute.

    Variables ``min`` and ``max`` are the lengths of the shortest and longest
    strings. Non-categorical strings are modelled through the distribution of
    their lengths.
    """

    def __init__(self, name: str, is_candidate_key, is_categorical, histogram_size: Union[int, str], data: Series):
        super().__init__(name, is_candidate_key, is_categorical, histogram_size, data)
        self.is_numerical = False
        self.data_type = DataType.STRING
        self.data_dropna = self.data_dropna.astype(str)
        self.data_dropna_len = self.data_dropna.map(len)

    def infer_domain(self, categorical_domain=None, numerical_range=None):
        """Infer string-length bounds and a uniform distribution.

        ``numerical_range`` is accepted for interface compatibility but not
        used here.
        """
        if categorical_domain:
            lengths = [len(i) for i in categorical_domain]
            self.min = min(lengths)
            self.max = max(lengths)
            self.distribution_bins = np.array(categorical_domain)
        else:
            self.min = int(self.data_dropna_len.min())
            self.max = int(self.data_dropna_len.max())
            if self.is_categorical:
                self.distribution_bins = self.data_dropna.unique()
            else:
                self.distribution_bins = np.array([self.min, self.max])

        # Force a float dtype: with string bins the probability would
        # otherwise be stored as a string.
        self.distribution_probabilities = np.full_like(self.distribution_bins,
                                                       1 / self.distribution_bins.size,
                                                       dtype=float)

    def infer_distribution(self):
        """Estimate the distribution from value counts (categorical) or a histogram of string lengths."""
        if self.is_categorical:
            distribution = self.data_dropna.value_counts()
            for value in set(self.distribution_bins) - set(distribution.index):
                distribution[value] = 0
            distribution.sort_index(inplace=True)
            self.distribution_probabilities = normalize_given_distribution(distribution)
            self.distribution_bins = np.array(distribution.index)
        else:
            distribution = np.histogram(self.data_dropna_len, bins=self.histogram_size)
            self.distribution_bins = distribution[1][:-1]
            self.distribution_probabilities = normalize_given_distribution(distribution[0])

    def generate_values_as_candidate_key(self, n):
        """Return ``n`` strings, each a random prefix followed by its index ``0 .. n-1``.

        One prefix length is drawn between ``min`` and ``max`` (inclusive) and
        used for all values.
        """
        length = np.random.randint(self.min, self.max + 1)
        vectorized = np.vectorize(lambda x: '{}{}'.format(generate_random_string(length), x))
        return vectorized(np.arange(n))

    def sample_values_from_binning_indices(self, binning_indices):
        """Sample values within bins; for non-categorical strings the sampled number becomes a random string of that length."""
        column = super().sample_values_from_binning_indices(binning_indices)
        if not self.is_categorical:
            column[~column.isnull()] = column[~column.isnull()].apply(lambda x: generate_random_string(int(x)))

        return column
def pre_process(column: Series):
    """Convert a column of Social Security Numbers into integers.

    The representation is decided from the first element: integer columns are
    returned unchanged, string columns have their dashes removed and are
    converted to ``int`` (missing values are left untouched). An empty column
    is returned as is.

    Raises
    ------
    Exception
        If the first element is neither an integer nor a string.
    """
    if column.size == 0:
        return column

    first = column.iloc[0]
    # pandas hands back NumPy integers (e.g. numpy.int64), which
    # ``type(x) is int`` never matches; booleans stay rejected.
    if isinstance(first, (int, np.integer)) and not isinstance(first, bool):
        return column
    if isinstance(first, str):
        return column.map(lambda x: int(x.replace('-', '')), na_action='ignore')
    raise Exception('Invalid SocialSecurityNumber.')

# Helpers and attribute class for Social Security Number columns.
def is_ssn(value):
    """Test whether a value is a number strictly between 0 and 1e9.

    Integers (including NumPy integers) are checked directly; strings are
    checked after removing dashes. Any other type yields False.

    Note this function does not take into consideration some special numbers
    that are never allocated.
    https://en.wikipedia.org/wiki/Social_Security_number
    """
    if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
        return bool(0 < value < 1e9)
    if isinstance(value, str):
        digits = value.replace('-', '')
        # isdecimal() only accepts characters that int() can convert,
        # unlike isdigit(), which also accepts e.g. superscript digits.
        if digits.isdecimal():
            return 0 < int(digits) < 1e9
    return False


class SocialSecurityNumberAttribute(AbstractAttribute):
    """SocialSecurityNumber of format AAA-GG-SSSS.

    The column is converted to integers by ``pre_process`` before it is handed
    to the base class.
    """

    def __init__(self, name: str, is_candidate_key, is_categorical, histogram_size: Union[int, str], data: Series):
        super().__init__(name, is_candidate_key, is_categorical, histogram_size, pre_process(data))
        self.is_numerical = True
        self.data_type = DataType.SOCIAL_SECURITY_NUMBER

    def infer_domain(self, categorical_domain=None, numerical_range=None):
        """Infer the domain via the base class, then cast ``min`` and ``max`` to ``int``."""
        super().infer_domain(categorical_domain, numerical_range)
        self.min = int(self.min)
        self.max = int(self.max)

    def infer_distribution(self):
        """Delegate to the base-class distribution inference."""
        super().infer_distribution()

    def generate_values_as_candidate_key(self, n):
        """Return ``n`` shuffled SSN strings formatted as ``AAA-GG-SSSS``.

        Raises
        ------
        Exception
            If ``n`` is not below 1e9.
        """
        if n >= 1e9:
            # Apply the format string; it used to be passed unformatted
            # together with the name as a second exception argument.
            raise Exception('The candidate key "{}" cannot generate more than 1e9 distinct values.'.format(self.name))

        values = np.linspace(0, 1e9 - 1, num=n, dtype=int)
        values = np.random.permutation(values)
        values = [str(i).zfill(9) for i in values]
        return ['{}-{}-{}'.format(i[:3], i[3:5], i[5:]) for i in values]

    def sample_values_from_binning_indices(self, binning_indices):
        """Convert bin indices into values via the base-class sampling."""
        # This used to call sample_binning_indices_in_independent_attribute_mode,
        # which treats its argument as a sample size rather than bin indices.
        return super().sample_values_from_binning_indices(binning_indices)

In [None]:
# Attributes are stored in JSON form; parse_json rebuilds an attribute object from it before generation.
def parse_json(attribute_in_json):
    """Rebuild an attribute object from its JSON description.

    Parameters
    ----------
    attribute_in_json : dict
        Dictionary with the keys written by ``AbstractAttribute.to_json``.

    Returns
    -------
    AbstractAttribute
        Instance of the subclass matching ``data_type``, created with an empty
        data Series and then populated from the description. The bins and
        probabilities are stored as read from the description.

    Raises
    ------
    Exception
        If the data type has no matching attribute class.
    """
    name = attribute_in_json['name']
    data_type = DataType(attribute_in_json['data_type'])
    is_candidate_key = attribute_in_json['is_candidate_key']
    is_categorical = attribute_in_json['is_categorical']
    histogram_size = len(attribute_in_json['distribution_bins'])

    # data type -> (attribute class, dtype of the empty placeholder Series).
    # The lookup goes through the DataType class; the old branch reached
    # SOCIAL_SECURITY_NUMBER through the ``data_type`` instance.
    attribute_factories = {
        DataType.INTEGER: (IntegerAttribute, int),
        DataType.FLOAT: (FloatAttribute, float),
        DataType.DATETIME: (DateTimeAttribute, 'datetime64[ns]'),
        DataType.STRING: (StringAttribute, str),
        DataType.SOCIAL_SECURITY_NUMBER: (SocialSecurityNumberAttribute, int),
    }
    if data_type not in attribute_factories:
        raise Exception('Data type {} is unknown.'.format(data_type.value))

    attribute_class, placeholder_dtype = attribute_factories[data_type]
    attribute = attribute_class(name, is_candidate_key, is_categorical, histogram_size,
                                Series(dtype=placeholder_dtype))

    attribute.missing_rate = attribute_in_json['missing_rate']
    attribute.min = attribute_in_json['min']
    attribute.max = attribute_in_json['max']
    attribute.distribution_bins = attribute_in_json['distribution_bins']
    attribute.distribution_probabilities = attribute_in_json['distribution_probabilities']

    return attribute

In [None]:
# Code for constructing an approximate Bayesian network with noisy conditional distributions (the data must be preprocessed first; see the README).


def calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary):
    """Sensitivity function for Bayesian network construction.

    Parameters
    ----------
    num_tuples : int
        Number of tuples in the dataset.
    child : str
        Name of the child attribute.
    parents : list
        Names of the parent attributes.
    attr_to_is_binary : dict
        Maps an attribute name to whether it is binary.

    One formula is used when the child is binary or the only parent is binary,
    another one otherwise.
    """
    involves_binary = attr_to_is_binary[child] or (
        len(parents) == 1 and attr_to_is_binary[parents[0]])

    if not involves_binary:
        first_term = (2 / num_tuples) * log((num_tuples + 1) / 2)
        second_term = (1 - 1 / num_tuples) * log(1 + 2 / (num_tuples - 1))
        return first_term + second_term

    first_term = log(num_tuples) / num_tuples
    ratio = (num_tuples - 1) / num_tuples
    inverse_ratio = num_tuples / (num_tuples - 1)
    return first_term + ratio * log(inverse_ratio)


def calculate_delta(num_attributes, sensitivity, epsilon):
    """Compute delta, a scaling factor used when applying differential privacy.

    Returns ``(num_attributes - 1) * sensitivity / epsilon``.
    """
    scaled_sensitivity = (num_attributes - 1) * sensitivity
    return scaled_sensitivity / epsilon


def usefulness_minus_target(k, num_attributes, num_tuples, target_usefulness=5, epsilon=0.1):
    """Usefulness of degree ``k`` minus the target; used to solve for the network degree.

    When ``k`` equals ``num_attributes`` the usefulness is defined as the
    target itself, so the difference is zero (this also avoids dividing by
    ``num_attributes - k == 0``).
    """
    if k == num_attributes:
        # A leftover debug print ('here') used to fire on this branch.
        usefulness = target_usefulness
    else:
        usefulness = num_tuples * epsilon / ((num_attributes - k) * (2 ** (k + 3)))
    return usefulness - target_usefulness


def calculate_k(num_attributes, num_tuples, target_usefulness=4, epsilon=0.1):
    """Calculate the maximum degree to use when constructing Bayesian networks.

    The default degree 3 is returned right away when its usefulness already
    exceeds the target. Otherwise the degree is solved for numerically; if the
    solver emits a ``RuntimeWarning`` or the result falls outside
    ``[1, num_attributes]``, the default degree is used.
    """
    default_k = 3
    initial_usefulness = usefulness_minus_target(default_k, num_attributes, num_tuples, 0, epsilon)
    if initial_usefulness > target_usefulness:
        return default_k

    arguments = (num_attributes, num_tuples, target_usefulness, epsilon)
    # Escalate warnings to exceptions only while solving; catch_warnings()
    # restores the caller's filters, so they are no longer changed for good.
    with warnings.catch_warnings():
        warnings.filterwarnings("error")
        try:
            ans = fsolve(usefulness_minus_target, np.array([int(num_attributes / 2)]), args=arguments)[0]
            ans = ceil(ans)
        except RuntimeWarning:
            print("k is not properly computed!")
            ans = default_k
    if ans < 1 or ans > num_attributes:
        ans = default_k
    return ans


def set_random_seed(seed=0):
    """Seed Python's ``random`` module and NumPy's global generator with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
def mutual_information(labels_x: Series, labels_y: DataFrame):
    """Mutual information between one labelling and one or more label columns.

    Parameters
    ----------
    labels_x : Series
        Labels of the first variable.
    labels_y : DataFrame
        One or more label columns. A single column is used as is; several
        columns are merged row by row into one space-separated label, so their
        values must be strings.

    Returns the result of ``sklearn.metrics.mutual_info_score``.
    """
    if labels_y.shape[1] == 1:
        combined_labels = labels_y.iloc[:, 0]
    else:
        combined_labels = labels_y.apply(lambda row: ' '.join(row.values), axis=1)

    return mutual_info_score(labels_x, combined_labels)
def normalize_given_distribution(frequencies):
    """Turn non-negative weights into a probability distribution.

    Parameters
    ----------
    frequencies : array-like
        Weights or counts; negative entries are treated as 0.

    Returns
    -------
    numpy.ndarray
        Float array of the same length. If all weights are 0 (or their sum is
        not positive) a uniform distribution is returned; if some weights are
        infinite, the mass is split evenly between the infinite entries. An
        empty input yields an empty array instead of dividing by zero.
    """
    distribution = np.array(frequencies, dtype=float)
    if distribution.size == 0:
        # Nothing to normalize; the uniform fallback below would divide by zero.
        return distribution
    distribution = distribution.clip(0)  # replace negative values with 0
    summation = distribution.sum()
    if summation > 0:
        if np.isinf(summation):
            # Keep only the infinite entries (as 1.0) and normalize those.
            return normalize_given_distribution(np.isinf(distribution))
        return distribution / summation
    return np.full_like(distribution, 1 / distribution.size)



def worker(paras):
    """Score every candidate parent set for one child that includes ``V[split]``.

    ``paras`` is the tuple ``(child, V, num_parents, split, dataset)``. Each
    candidate consists of ``num_parents - 1`` attributes taken from
    ``V[split + 1:]`` plus ``V[split]`` itself. Returns two parallel lists:
    the ``(child, parents)`` pairs and their mutual information.
    """
    child, V, num_parents, split, dataset = paras
    candidate_pairs = []
    scores = []

    # Not enough attributes after ``split`` to complete a parent set.
    if split + num_parents - 1 >= len(V):
        return candidate_pairs, scores

    for other_parents in combinations(V[split + 1:], num_parents - 1):
        parents = [*other_parents, V[split]]
        candidate_pairs.append((child, parents))
        # TODO consider to change the computation of MI by combined integers instead of strings.
        scores.append(mutual_information(dataset[child], dataset[parents]))

    return candidate_pairs, scores


def greedy_bayes(dataset: DataFrame, k: int, epsilon: float, seed=0):
    """Construct a Bayesian Network (BN) using a greedy algorithm.

    Parameters
    ----------
    dataset : DataFrame
        Input dataset, which should only contain categorical attributes.
    k : int
        Maximum degree of the constructed BN. If k=0, k is automatically calculated.
    epsilon : float
        Parameter of differential privacy. A falsy value disables the
        exponential mechanism and the best-scoring candidate is picked.
    seed : int or float
        Seed for the randomness in BN generation.

    Returns
    -------
    list
        ``(child, parents)`` pairs in the order the attributes were added.
    """
    set_random_seed(seed)
    dataset = dataset.astype(str, copy=False)
    num_tuples, num_attributes = dataset.shape
    if not k:
        k = calculate_k(num_attributes, num_tuples)

    # Attribute name -> whether the attribute has at most two distinct values.
    attr_to_is_binary = {}
    for attr in dataset:
        attr_to_is_binary[attr] = dataset[attr].unique().size <= 2

    print('================ Constructing Bayesian Network (BN) ================')
    root_attribute = random.choice(dataset.columns)
    V = [root_attribute]
    rest_attributes = list(dataset.columns)
    rest_attributes.remove(root_attribute)
    print(f'Adding ROOT {root_attribute}')
    network = []
    while rest_attributes:
        num_parents = min(len(V), k)
        split_positions = range(len(V) - num_parents + 1)
        tasks = [(child, V, num_parents, split, dataset)
                 for child, split in product(rest_attributes, split_positions)]
        with Pool() as pool:
            res_list = pool.map(worker, tasks)

        # Flatten the per-task results into two parallel lists.
        parents_pair_list = []
        mutual_info_list = []
        for pairs, scores in res_list:
            parents_pair_list.extend(pairs)
            mutual_info_list.extend(scores)

        if epsilon:
            sampling_distribution = exponential_mechanism(epsilon, mutual_info_list, parents_pair_list,
                                                          attr_to_is_binary, num_tuples, num_attributes)
            idx = np.random.choice(list(range(len(mutual_info_list))), p=sampling_distribution)
        else:
            idx = mutual_info_list.index(max(mutual_info_list))

        chosen_pair = parents_pair_list[idx]
        network.append(chosen_pair)
        adding_attribute = chosen_pair[0]
        V.append(adding_attribute)
        rest_attributes.remove(adding_attribute)
        print(f'Adding attribute {adding_attribute}')

    print('========================== BN constructed ==========================')

    return network
def display_bayesian_network(bn):
    """Print each child of the Bayesian network together with its parents.

    Child names are padded to the length of the longest one so the output
    lines up.
    """
    length = max((len(child) for child, _ in bn), default=0)

    print('Constructed Bayesian network:')
    for child, parents in bn:
        line = "    {0:{width}} has parents {1}.".format(child, parents, width=length)
        print(line)


def exponential_mechanism(epsilon, mutual_info_list, parents_pair_list, attr_to_is_binary, num_tuples, num_attributes):
    """Sampling distribution over candidate pairs for the exponential mechanism.

    Each candidate's mutual information is divided by twice its delta,
    exponentiated, and the resulting weights are normalized.
    """
    deltas = [
        calculate_delta(num_attributes,
                        calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary),
                        epsilon)
        for child, parents in parents_pair_list
    ]

    scaled_scores = np.array(mutual_info_list) / (2 * np.array(deltas))
    weights = np.exp(scaled_scores)
    return normalize_given_distribution(weights)


def laplace_noise_parameter(k, num_attributes, num_tuples, epsilon):
    """Scale of the Laplace noise injected into conditional distributions.

    The noise is meant to satisfy differential privacy. Note that it is
    applied over counts, not over probability distributions. ``num_tuples``
    is accepted but does not influence the result.
    """
    remaining_degree = num_attributes - k
    return remaining_degree / epsilon


def get_noisy_distribution_of_attributes(attributes, encoded_dataset, epsilon=0.1):
    """Count every value combination of ``attributes`` and optionally add Laplace noise.

    ``encoded_dataset`` is the dataset encoded into integers. The result has
    one row per combination in the full Cartesian space (each attribute
    ranging over ``0 .. max``), with a ``count`` column; combinations absent
    from the data get 0. When ``epsilon`` is truthy, Laplace noise is added to
    the counts and negative counts are reset to 0.
    """
    projected = encoded_dataset.copy().loc[:, attributes]
    projected['count'] = 1
    stats = projected.groupby(attributes).sum()

    value_ranges = [range(int(encoded_dataset[attr].max()) + 1) for attr in attributes]
    all_combinations = product(*value_ranges)

    def iter_chunks(iterator, chunk_size):
        # Yield successive chunks of at most ``chunk_size`` items; each chunk
        # must be consumed before the next one is requested.
        for head in iterator:
            yield chain((head,), islice(iterator, chunk_size - 1))

    # Build the full space chunk by chunk to bound the size of each temporary list.
    full_space = None
    for chunk in iter_chunks(all_combinations, 1000000):
        chunk_frame = DataFrame(columns=attributes, data=list(chunk))
        if full_space is None:
            full_space = chunk_frame
        else:
            full_space = pd.concat([full_space, chunk_frame], ignore_index=True)

    stats = stats.reset_index()
    stats = merge(full_space, stats, how='left')
    stats = stats.fillna(0)

    if epsilon:
        num_tuples, num_attributes = encoded_dataset.shape
        noise_para = laplace_noise_parameter(len(attributes) - 1, num_attributes, num_tuples, epsilon)
        stats['count'] += np.random.laplace(0, scale=noise_para, size=stats.index.size)
        stats.loc[stats['count'] < 0, 'count'] = 0

    return stats


def construct_noisy_conditional_distributions(bayesian_network, encoded_dataset, epsilon=0.1):
    """Build the (noisy) conditional distribution of every attribute in the network.

    Returns a dictionary: the root maps to a list of probabilities; every
    other attribute maps to a dictionary from the stringified list of parent
    values to the list of probabilities of the child's values.
    """
    k = len(bayesian_network[-1][1])
    conditional_distributions = {}

    # first k+1 attributes
    root = bayesian_network[0][1][0]
    kplus1_attributes = [root] + [child for child, _ in bayesian_network[:k]]

    noisy_dist_of_kplus1_attributes = get_noisy_distribution_of_attributes(kplus1_attributes, encoded_dataset,
                                                                           epsilon)

    # generate noisy distribution of root attribute.
    root_counts = noisy_dist_of_kplus1_attributes.loc[:, [root, 'count']].groupby(root).sum()['count']
    conditional_distributions[root] = normalize_given_distribution(root_counts).tolist()

    for idx, (child, parents) in enumerate(bayesian_network):
        conditional_distributions[child] = {}
        family = parents + [child]
        selected_columns = family + ['count']

        if idx <= k - 2:
            # Marginalize the shared (k+1)-attribute distribution onto this family.
            stats = noisy_dist_of_kplus1_attributes.copy().loc[:, selected_columns]
            stats = stats.groupby(family, as_index=False).sum()
        elif idx == k - 1:
            stats = noisy_dist_of_kplus1_attributes.loc[:, selected_columns]
        else:
            stats = get_noisy_distribution_of_attributes(family, encoded_dataset, epsilon)
            stats = stats.loc[:, selected_columns]

        has_single_parent = len(parents) == 1
        parents_grouper = parents[0] if has_single_parent else parents
        for parents_instance, stats_sub in stats.groupby(parents_grouper):
            ordered = stats_sub.sort_values(by=child)
            dist = normalize_given_distribution(ordered['count']).tolist()

            if has_single_parent:
                parents_key = str([parents_instance])
            else:
                parents_key = str(list(parents_instance))
            conditional_distributions[child][parents_key] = dist

    return conditional_distributions


In [None]:
class DataDescriber:
    """Model an input dataset, then save a description of the dataset into a JSON file.

    Parameters
    ----------
    histogram_bins : int or str
        Number of bins in histograms.
        If it is a string such as 'auto' or 'fd', calculate the optimal bin width by `numpy.histogram_bin_edges`.
    category_threshold : int
        Categorical variables have no more than "this number" of distinct values.
    null_values : str or list
        Additional strings to recognize as missing values.
        By default missing values already include {'', 'NULL', 'N/A', 'NA', 'NaN', 'nan'}.

    Attributes
    ----------
    attr_to_datatype : dict
        Dictionary of {attribute: datatype}, e.g., {"age": "Integer", "gender": "String"}.
    attr_to_is_categorical : dict
        Dictionary of {attribute: boolean}, e.g., {"gender":True, "age":False}.
    attr_to_is_candidate_key : dict
        Dictionary of {attribute: boolean}, e.g., {"id":True, "name":False}.
    data_description : dict
        Nested dictionary (equivalent to JSON) recording the mined dataset information.
    df_input : DataFrame
        The input dataset to be analyzed.
    attr_to_column : dict
        Dictionary of {attribute: AbstractAttribute}.
    bayesian_network : list
        List of [child, [parent,]] to represent a Bayesian Network.
    df_encoded : DataFrame
        Input dataset encoded into integers, taken as input by PrivBayes algorithm in correlated attribute mode.
    """

    def __init__(self, histogram_bins: Union[int, str] = 20, category_threshold=20, null_values=None):
        self.histogram_bins: Union[int, str] = histogram_bins
        self.category_threshold: int = category_threshold
        self.null_values = null_values

        self.attr_to_datatype: Dict[str, DataType] = None
        self.attr_to_is_categorical: Dict[str, bool] = None
        self.attr_to_is_candidate_key: Dict[str, bool] = None

        self.data_description: Dict = {}
        self.df_input: DataFrame = None
        self.attr_to_column: Dict[str, AbstractAttribute] = None
        self.bayesian_network: List = None
        self.df_encoded: DataFrame = None

    def describe_dataset_in_random_mode(self,
                                        dataset_file: str,
                                        attribute_to_datatype: Dict[str, DataType] = None,
                                        attribute_to_is_categorical: Dict[str, bool] = None,
                                        attribute_to_is_candidate_key: Dict[str, bool] = None,
                                        categorical_attribute_domain_file: str = None,
                                        numerical_attribute_ranges: Dict[str, List] = None,
                                        seed=0):
        """Read the dataset and record the domain of every attribute.

        Parameters
        ----------
        dataset_file : str
            Path of the CSV file to describe.
        attribute_to_datatype : dict, optional
            User-supplied {attribute: datatype}; each value is converted with `DataType(...)`.
        attribute_to_is_categorical : dict, optional
            User-supplied {attribute: boolean}; missing attributes are inferred.
        attribute_to_is_candidate_key : dict, optional
            User-supplied {attribute: boolean}; missing attributes are inferred.
        categorical_attribute_domain_file : str, optional
            JSON file mapping attribute names to their categorical domain.
        numerical_attribute_ranges : dict, optional
            {attribute: range} passed to the column's `infer_domain` as `numerical_range`.
        seed : int
            Seed handed to `set_random_seed`.
        """
        attribute_to_datatype = attribute_to_datatype or {}
        attribute_to_is_categorical = attribute_to_is_categorical or {}
        attribute_to_is_candidate_key = attribute_to_is_candidate_key or {}
        numerical_attribute_ranges = numerical_attribute_ranges or {}

        # The helpers are module-level functions of this file (no `utils` module is imported).
        if categorical_attribute_domain_file:
            categorical_attribute_to_domain = read_json_file(categorical_attribute_domain_file)
        else:
            categorical_attribute_to_domain = {}

        set_random_seed(seed)
        self.attr_to_datatype = {attr: DataType(datatype) for attr, datatype in attribute_to_datatype.items()}
        self.attr_to_is_categorical = attribute_to_is_categorical
        self.attr_to_is_candidate_key = attribute_to_is_candidate_key
        self.read_dataset_from_csv(dataset_file)
        self.infer_attribute_data_types()
        self.analyze_dataset_meta()
        self.represent_input_dataset_by_columns()

        # A user-provided categorical domain takes precedence over a numerical range.
        for column in self.attr_to_column.values():
            attr_name = column.name
            if attr_name in categorical_attribute_to_domain:
                column.infer_domain(categorical_domain=categorical_attribute_to_domain[attr_name])
            elif attr_name in numerical_attribute_ranges:
                column.infer_domain(numerical_range=numerical_attribute_ranges[attr_name])
            else:
                column.infer_domain()

        # record attribute information in json format
        self.data_description['attribute_description'] = {
            attr: column.to_json() for attr, column in self.attr_to_column.items()
        }

    def describe_dataset_in_independent_attribute_mode(self,
                                                       dataset_file,
                                                       epsilon=0.1,
                                                       attribute_to_datatype: Dict[str, DataType] = None,
                                                       attribute_to_is_categorical: Dict[str, bool] = None,
                                                       attribute_to_is_candidate_key: Dict[str, bool] = None,
                                                       categorical_attribute_domain_file: str = None,
                                                       numerical_attribute_ranges: Dict[str, List] = None,
                                                       seed=0):
        """Describe the dataset as in random mode, then add per-attribute noisy distributions.

        `epsilon` is forwarded to `inject_laplace_noise_into_distribution_per_attribute`;
        the remaining parameters are forwarded to `describe_dataset_in_random_mode`.
        """
        self.describe_dataset_in_random_mode(dataset_file,
                                             attribute_to_datatype,
                                             attribute_to_is_categorical,
                                             attribute_to_is_candidate_key,
                                             categorical_attribute_domain_file,
                                             numerical_attribute_ranges,
                                             seed=seed)

        for column in self.attr_to_column.values():
            column.infer_distribution()

        self.inject_laplace_noise_into_distribution_per_attribute(epsilon)
        # record attribute information in json format (again, now including the distributions)
        self.data_description['attribute_description'] = {
            attr: column.to_json() for attr, column in self.attr_to_column.items()
        }

    def describe_dataset_in_correlated_attribute_mode(self,
                                                      dataset_file,
                                                      k=0,
                                                      epsilon=0.1,
                                                      attribute_to_datatype: Dict[str, DataType] = None,
                                                      attribute_to_is_categorical: Dict[str, bool] = None,
                                                      attribute_to_is_candidate_key: Dict[str, bool] = None,
                                                      categorical_attribute_domain_file: str = None,
                                                      numerical_attribute_ranges: Dict[str, List] = None,
                                                      seed=0):
        """Describe the dataset in independent mode, then add a Bayesian network description.

        `k` is passed to `greedy_bayes`. `epsilon` is passed in full to the independent
        attribute mode, and `epsilon / 2` is passed to each of `greedy_bayes` and
        `construct_noisy_conditional_distributions`.

        Raises
        ------
        Exception
            If fewer than 2 attributes remain after encoding.
        """
        self.describe_dataset_in_independent_attribute_mode(dataset_file,
                                                            epsilon,
                                                            attribute_to_datatype,
                                                            attribute_to_is_categorical,
                                                            attribute_to_is_candidate_key,
                                                            categorical_attribute_domain_file,
                                                            numerical_attribute_ranges,
                                                            seed)
        self.df_encoded = self.encode_dataset_into_binning_indices()
        if self.df_encoded.shape[1] < 2:
            raise Exception("Correlated Attribute Mode requires at least 2 attributes(i.e., columns) in dataset.")

        self.bayesian_network = greedy_bayes(self.df_encoded, k, epsilon / 2, seed=seed)
        self.data_description['bayesian_network'] = self.bayesian_network
        self.data_description['conditional_probabilities'] = construct_noisy_conditional_distributions(
            self.bayesian_network, self.df_encoded, epsilon / 2)

    def read_dataset_from_csv(self, file_name=None):
        """Load `file_name` into `self.df_input` and drop columns whose values are all missing.

        The file is first read with the default encoding; on `UnicodeDecodeError` or
        `NameError` it is read again with `encoding='latin1'`.
        """
        try:
            self.df_input = read_csv(file_name, skipinitialspace=True, na_values=self.null_values)
        except (UnicodeDecodeError, NameError):
            self.df_input = read_csv(file_name, skipinitialspace=True, na_values=self.null_values,
                                     encoding='latin1')

        # Remove columns with empty active domain, i.e., all values are missing.
        # `dropna` returns a new frame, so the result has to be assigned back.
        attributes_before = set(self.df_input.columns)
        self.df_input = self.df_input.dropna(axis=1, how='all')
        attributes_after = set(self.df_input.columns)
        if len(attributes_before) > len(attributes_after):
            print(f'Empty columns are removed, including {attributes_before - attributes_after}.')

    def infer_attribute_data_types(self):
        """Fill `self.attr_to_datatype` for every column the user did not type explicitly."""
        attributes_with_unknown_datatype = set(self.df_input.columns) - set(self.attr_to_datatype)
        inferred_numerical_attributes = infer_numerical_attributes_in_dataframe(self.df_input)

        for attr in attributes_with_unknown_datatype:
            column_dropna = self.df_input[attr].dropna()

            # current attribute is either Integer or Float.
            if attr in inferred_numerical_attributes:
                # TODO Comparing all values may be too slow for large datasets.
                if array_equal(column_dropna, column_dropna.astype(int, copy=False)):
                    self.attr_to_datatype[attr] = DataType.INTEGER
                else:
                    self.attr_to_datatype[attr] = DataType.FLOAT

            # current attribute is either String, DateTime, or SocialSecurityNumber.
            else:
                # Sample 20 values (with replacement) to test its data_type.
                samples = column_dropna.sample(20, replace=True)
                if all(samples.map(is_datetime)):
                    self.attr_to_datatype[attr] = DataType.DATETIME
                elif all(samples.map(is_ssn)):
                    self.attr_to_datatype[attr] = DataType.SOCIAL_SECURITY_NUMBER
                else:
                    self.attr_to_datatype[attr] = DataType.STRING

    def analyze_dataset_meta(self):
        """Infer candidate keys and categorical attributes, then record `data_description['meta']`."""
        all_attributes = set(self.df_input.columns)

        # find all candidate keys.
        for attr in all_attributes - set(self.attr_to_is_candidate_key):
            if self.attr_to_datatype[attr] in {DataType.FLOAT, DataType.DATETIME}:
                self.attr_to_is_candidate_key[attr] = False
            else:
                self.attr_to_is_candidate_key[attr] = self.df_input[attr].is_unique

        candidate_keys = {attr for attr, is_key in self.attr_to_is_candidate_key.items() if is_key}

        # find all categorical attributes.
        for attr in all_attributes - set(self.attr_to_is_categorical):
            self.attr_to_is_categorical[attr] = self.is_categorical(attr)

        non_categorical_string_attributes = set()
        for attr, is_categorical in self.attr_to_is_categorical.items():
            if not is_categorical and self.attr_to_datatype[attr] is DataType.STRING:
                non_categorical_string_attributes.add(attr)

        # Candidate keys and non-categorical strings are left out of the Bayesian network.
        attributes_in_BN = [attr for attr in self.df_input if
                            attr not in candidate_keys and attr not in non_categorical_string_attributes]
        non_categorical_string_attributes = list(non_categorical_string_attributes)

        self.data_description['meta'] = {"num_tuples": self.df_input.shape[0],
                                         "num_attributes": self.df_input.shape[1],
                                         "num_attributes_in_BN": len(attributes_in_BN),
                                         "all_attributes": self.df_input.columns.tolist(),
                                         "candidate_keys": list(candidate_keys),
                                         "non_categorical_string_attributes": non_categorical_string_attributes,
                                         "attributes_in_BN": attributes_in_BN}

    def is_categorical(self, attribute_name):
        """Detect whether an attribute is categorical.

        A user-specified flag wins; otherwise the attribute is categorical when its
        number of distinct non-missing values does not exceed `category_threshold`.
        """
        if attribute_name in self.attr_to_is_categorical:
            return self.attr_to_is_categorical[attribute_name]
        else:
            return self.df_input[attribute_name].dropna().unique().size <= self.category_threshold

    def represent_input_dataset_by_columns(self):
        """Wrap every input column in the attribute class matching its data type.

        Raises
        ------
        Exception
            If the data type of a column is none of the known `DataType` members.
        """
        self.attr_to_column = {}
        for attr in self.df_input:
            data_type = self.attr_to_datatype[attr]
            is_candidate_key = self.attr_to_is_candidate_key[attr]
            is_categorical = self.attr_to_is_categorical[attr]
            paras = (attr, is_candidate_key, is_categorical, self.histogram_bins, self.df_input[attr])
            if data_type is DataType.INTEGER:
                self.attr_to_column[attr] = IntegerAttribute(*paras)
            elif data_type is DataType.FLOAT:
                self.attr_to_column[attr] = FloatAttribute(*paras)
            elif data_type is DataType.DATETIME:
                self.attr_to_column[attr] = DateTimeAttribute(*paras)
            elif data_type is DataType.STRING:
                self.attr_to_column[attr] = StringAttribute(*paras)
            elif data_type is DataType.SOCIAL_SECURITY_NUMBER:
                self.attr_to_column[attr] = SocialSecurityNumberAttribute(*paras)
            else:
                raise Exception(f'The DataType of {attr} is unknown.')

    def inject_laplace_noise_into_distribution_per_attribute(self, epsilon=0.1):
        """Call `inject_laplace_noise(epsilon, num_attributes_in_BN)` on every column."""
        num_attributes_in_BN = self.data_description['meta']['num_attributes_in_BN']
        for column in self.attr_to_column.values():
            assert isinstance(column, AbstractAttribute)
            column.inject_laplace_noise(epsilon, num_attributes_in_BN)

    def encode_dataset_into_binning_indices(self):
        """Before constructing Bayesian network, encode input dataset into binning indices."""
        encoded_dataset = DataFrame()
        for attr in self.data_description['meta']['attributes_in_BN']:
            encoded_dataset[attr] = self.attr_to_column[attr].encode_values_into_bin_idx()
        return encoded_dataset

    def save_dataset_description_to_file(self, file_name):
        """Write `self.data_description` to `file_name` as indented JSON."""
        Path(file_name).touch()
        with open(file_name, 'w') as outfile:
            json.dump(self.data_description, outfile, indent=4)

    def display_dataset_description(self):
        """Print `self.data_description` as indented JSON."""
        print(json.dumps(self.data_description, indent=4))

This class takes the input data and performs the preprocessing using the functions defined above; it represents each column in the AbstractAttribute format and saves the dataset description as a JSON file.
This JSON description file should then be used by the DataGenerator.

In [None]:
class DataGenerator(object):
    """Generate a synthetic dataset from a dataset description file (JSON).

    Attributes
    ----------
    n : int
        Number of tuples requested in correlated attribute mode.
    synthetic_dataset : DataFrame
        The most recently generated dataset.
    description : dict
        The parsed dataset description.
    encoded_dataset : DataFrame
        Bin indices sampled from the Bayesian network in correlated attribute mode.
    """

    def __init__(self):
        self.n = 0
        self.synthetic_dataset = None
        self.description = {}
        self.encoded_dataset = None

    def generate_dataset_in_random_mode(self, n, description_file, seed=0, minimum=0, maximum=100):
        """Fill `self.synthetic_dataset` with `n` random rows per described attribute.

        Candidate keys are produced by the parsed column object, categorical attributes
        are drawn from `distribution_bins`, strings get one random length per attribute
        between the described `min` and `max`, integers are drawn from
        [minimum, maximum] and all other types uniformly from [minimum, maximum).
        """
        set_random_seed(seed)
        description = read_json_file(description_file)

        self.synthetic_dataset = DataFrame()
        for attr, attr_info in description['attribute_description'].items():
            datatype = attr_info['data_type']
            is_categorical = attr_info['is_categorical']
            is_candidate_key = attr_info['is_candidate_key']
            # NumPy's generator is addressed explicitly: the vectorised signatures used
            # below do not exist on the stdlib `random` module that this file imports.
            if is_candidate_key:
                self.synthetic_dataset[attr] = parse_json(attr_info).generate_values_as_candidate_key(n)
            elif is_categorical:
                self.synthetic_dataset[attr] = np.random.choice(attr_info['distribution_bins'], n)
            elif datatype == 'String':
                # One length is drawn for the whole attribute (upper bound is exclusive, hence +1).
                length = np.random.randint(attr_info['min'], attr_info['max'] + 1)
                # Build the n values explicitly so the column is filled even when it is the
                # first one added to the (still empty) frame.
                self.synthetic_dataset[attr] = [generate_random_string(length) for _ in range(n)]
            else:
                if datatype == 'Integer':
                    self.synthetic_dataset[attr] = np.random.randint(minimum, maximum + 1, n)
                else:
                    self.synthetic_dataset[attr] = np.random.uniform(minimum, maximum, n)

    def generate_dataset_in_independent_mode(self, n, description_file, seed=0):
        """Fill `self.synthetic_dataset` with `n` rows, sampling every attribute on its own."""
        set_random_seed(seed)
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)

    def generate_dataset_in_correlated_attribute_mode(self, n, description_file, seed=0):
        """Fill `self.synthetic_dataset` with `n` rows using the described Bayesian network.

        Attributes present in the encoded dataset are decoded from their sampled bin
        indices; candidate keys and all remaining attributes are generated separately.
        """
        set_random_seed(seed)
        self.n = n
        self.description = read_json_file(description_file)

        all_attributes = self.description['meta']['all_attributes']
        candidate_keys = set(self.description['meta']['candidate_keys'])
        self.encoded_dataset = DataGenerator.generate_encoded_dataset(self.n, self.description)
        self.synthetic_dataset = DataFrame(columns=all_attributes)
        for attr in all_attributes:
            attr_info = self.description['attribute_description'][attr]
            column = parse_json(attr_info)

            if attr in self.encoded_dataset:
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(self.encoded_dataset[attr])
            elif attr in candidate_keys:
                self.synthetic_dataset[attr] = column.generate_values_as_candidate_key(n)
            else:
                # for attributes not in BN or candidate keys, use independent attribute mode.
                binning_indices = column.sample_binning_indices_in_independent_attribute_mode(n)
                self.synthetic_dataset[attr] = column.sample_values_from_binning_indices(binning_indices)

    @staticmethod
    def get_sampling_order(bn):
        """Return the root attribute followed by every child in network order."""
        return [bn[0][1][0]] + [child for child, _ in bn]

    @staticmethod
    def generate_encoded_dataset(n, description):
        """Sample `n` rows of bin indices from the described Bayesian network.

        The root is drawn from its own distribution; every child is then drawn from the
        conditional distribution matching the already-sampled values of its parents.
        Rows whose parent combination has no recorded conditional distribution fall
        back to the child's `distribution_probabilities`.

        Returns
        -------
        DataFrame
            One integer column per attribute, in sampling order.
        """
        bn = description['bayesian_network']
        bn_root_attr = bn[0][1][0]
        root_attr_dist = description['conditional_probabilities'][bn_root_attr]
        encoded_df = DataFrame(columns=DataGenerator.get_sampling_order(bn))
        encoded_df[bn_root_attr] = np.random.choice(len(root_attr_dist), size=n, p=root_attr_dist)

        for child, parents in bn:
            child_conditional_distributions = description['conditional_probabilities'][child]
            for parents_key, dist in child_conditional_distributions.items():
                # NOTE(security): keys come from the description file and are evaluated as
                # Python source; only load description files from a trusted origin.
                parents_instance = list(eval(parents_key))

                # Select the rows whose parents carry exactly this combination of values.
                # The mask is built directly instead of assembling and evaluating a source
                # string, so attribute names may contain any character.
                matching_rows = Series(True, index=encoded_df.index)
                for parent, value in zip(parents, parents_instance):
                    matching_rows &= encoded_df[parent] == value

                size = int(matching_rows.sum())
                if size:
                    encoded_df.loc[matching_rows, child] = np.random.choice(len(dist), size=size, p=dist)

            # Rows left unassigned fall back to the unconditioned distribution of the child.
            unconditioned_distribution = description['attribute_description'][child]['distribution_probabilities']
            missing_rows = encoded_df[child].isnull()
            num_missing = int(missing_rows.sum())
            if num_missing:
                encoded_df.loc[missing_rows, child] = np.random.choice(len(unconditioned_distribution),
                                                                       size=num_missing,
                                                                       p=unconditioned_distribution)
        return encoded_df.astype(int)

    def save_synthetic_data(self, to_file):
        """Write `self.synthetic_dataset` to `to_file` as CSV without the index."""
        Path(to_file).touch()
        self.synthetic_dataset.to_csv(to_file, index=False)


if __name__ == '__main__':
    from time import time

    # Demo run: generate 51 synthetic rows in correlated attribute mode from a
    # previously saved description file, and report how long it took.
    dataset_description_file = '../out/AdultIncome/description_test.txt'
    generator = DataGenerator()

    t = time()
    generator.generate_dataset_in_correlated_attribute_mode(51, dataset_description_file)
    elapsed_seconds = time() - t
    print(f'running time: {elapsed_seconds} s')

    # Label-based slice: rows with index labels 0 through 50 inclusive.
    preview = generator.synthetic_dataset.loc[:50]
    print(preview)