---
title: Sanitization Process for Data Privacy
authors:
- Juan Zinser 
tags:
- distributed
- data
- privacy
created_at: 2018-01-30
updated_at: 2018-01-30
tldr: This is short description of the content and findings of the post.
---

As good as data can be now a days, it must satisfy several requirements for it to be made public. A natural trade-off arises between privacy and utility of a dataset. On one side, depending on regulations and the data-holder carefulness, data satisfies certain privacy concerns to prevent sensitive information from being revealed. On the other side, for inference and conclusions to be taken from a dataset, data should be available for people whose interest is to analyze it. These analysis rely on data quality, and the more, the better for it's users. Work has been done to make sure data follows the corresponding privacy constraints, by generalization, suppression or sanitization techniques, with the aim of making data less informative (more private). The purpose of this work is to explore new ways of sanitization for databases and to measure their performance.

Motivation
Why is it important to make data public?

Data privacy exists because making data public is important, and it has to be taken care of before making it public. Open-Data is a term that refers to the action of Public or Private Institutions making their Data Public, and usually it helps to improve public policy and public services. An open data culture enhaces collaboration, participation and social innovation European Data Portal and Janssen, Marijin et al.

## Analyse the Supervised Set

In [1]:
from dask import dataframe, delayed
from dask.distributed import Client
import os
import math
import pandas as pd
import numpy as np
income_dataset_path = "census_level_0.csv"
#client = Client("0.0.0.0:8786")
client = Client()
# read data
data = dataframe.read_csv(income_dataset_path)

data_cols = data.columns
cat_columns = [u'workclass', u'education', u'marital-status', u'occupation', u'race', u'sex', u'native-country']

In [2]:
privacy = 0.3
include_real = True 
uniform = True
uniform2 = True
maybe = False

def check_include_real(include_real, privacy, class_length):
    if (privacy - include_real) >= class_length:
        include_real = True
    return include_real


def expo_weights(nclasses):
    weights = list()
    curr_weight = 1.
    for i in range(nclasses):
        curr_weight /= 2.
        weights.append(curr_weight)
    return weights_to_probabilities(weights)


def weights_to_probabilities(weights_vector, sum_to=1.):
    if sum(weights_vector) > 0:
        return np.array([sum_to * float(i) / sum(weights_vector) for
                         i in weights_vector])
    else:
        return weights_vector


def entry_sanitization(entry, real_prob, class_length,
                       maybe, uniform, uniform2, include_real,
                       privacy, order_weights, key_to_order,
                       order_exception, ordered_weights, counter):
    """
    Sanitizes a single record

    :param entry:
    :param real_prob:
    :param class_length:
    :param maybe:
    :param uniform:
    :param uniform2:
    :param include_real:
    :param privacy:
    :param order_weights:
    :param key_to_order:
    :param order_exception:
    :param ordered_weights:
    :return:
    """
    # initializes the entry_vector, same size as the
    # total number of classes
    privacy_fraction = 1. / privacy
    entry_vector = np.zeros(class_length)
    #print(entry)
    if not maybe:
        # gets the weights of each class excluding the real
        # value class
        weights = [1. / (class_length - 1)] * \
                  (class_length - 1) if uniform else \
            order_weights[key_to_order[entry]]

        # makes the weights sum one
        weights = weights_to_probabilities(weights)

        # get sample of the indexes that will have a
        # non zero weight (real not considered)
        non_real_weights = np.random.choice(
            order_exception[key_to_order[entry]],
            privacy - include_real, False, p=weights)

        # save the corresponding weights into their
        # corresponding index for all the sampled
        # indexes in the previous step
        entry_vector[non_real_weights] = privacy_fraction if \
            uniform2 else [ordered_weights[i] for
                           i in non_real_weights]

        # if real prob is None set to the proportional weight
        real_prob = ordered_weights[key_to_order[entry]] if \
            real_prob is None else real_prob

        # gets the weight that will be assigned to
        # the real value
        real_value = (privacy_fraction if uniform2 else
                      real_prob) if include_real else 0
        entry_vector = weights_to_probabilities(
            entry_vector, 1 - real_value)

        entry_vector[key_to_order[entry]] = real_value
        entry_vector = weights_to_probabilities(entry_vector)
    else:
        # gets the weights of each class excluding the
        # real value class
        weights = [1. / class_length] * class_length if \
            uniform else ordered_weights

        # get sample of the indexes that will have a non
        # zero weight
        selected_weights = np.random.choice(
            list(range(class_length)), privacy,
            False, p=weights)

        # save the corresponding weights into their
        # corresponding index
        # for all the sampled indexes in the previous step
        entry_vector[selected_weights] = privacy_fraction if \
            uniform2 else [ordered_weights[i]
                           for i in selected_weights]
        entry_vector = weights_to_probabilities(entry_vector)

    return entry_vector


def get_owoe(class_length, key_to_order, ordered_weights):
    
    order_exception = dict()
    order_weights = dict()

    # gets two dictionaries, order exception and ordered weights
    for key in range(class_length):
        all_non_entry = list(range(class_length))
        all_non_entry.pop(key)
        all_non_entry_ordered_weights = [ordered_weights[i] for
                                         i in all_non_entry]

        # order exception has a list off all the indexes
        # other than the one of the real value, after
        # being ordered
        order_exception[key] = all_non_entry
        # order weights contains the equivalent to order
        # exception but with the corresponding weights instead
        order_weights[key] = all_non_entry_ordered_weights
        
    return order_exception, order_weights

meta_data = client.gather(client.compute({col: {"privacy": math.ceil(privacy*len(data[col].unique())),
                                                 "counter": data[col].value_counts()} for col in cat_columns}))
meta_data = {col:{"privacy":min(val["privacy"],len(val["counter"])), 
                  "class_length":len(val["counter"]),
                  "counter":val["counter"].to_dict()}
             for col, val in meta_data.items()}

meta_info = {"df":{"n":len(data)},
             "columns":meta_data,
            "algorithm":{"uniform":uniform,
                        "uniform2":uniform2,
                        "real_prob":None,
                        "maybe":maybe}}

# the meta info should include de get_owoe() information
for col, col_info in meta_info["columns"].items():
    meta_info["columns"][col]["include_real"] = check_include_real(include_real, col_info["privacy"], col_info["class_length"])
    key_to_order =  dict(zip(sorted(col_info["counter"].keys()), range(col_info["class_length"])))
    ordered_weights = [float(col_info["counter"][key]) / meta_info["df"]["n"] for key in sorted(col_info["counter"].keys())]
    meta_info["columns"][col]["ordered_weights"] = ordered_weights
    meta_info["columns"][col]["key_to_order"] = key_to_order
    order_exception, order_weights = get_owoe(col_info["class_length"], key_to_order, ordered_weights)
    meta_info["columns"][col]["order_exception"] = order_exception
    meta_info["columns"][col]["order_weights"] = order_weights

In [46]:
asd = {}
for col in cat_columns:
    print(col)
    #print(meta_info["columns"][col])
    asd[col] = client.gather(client.compute(data[col].map(lambda x: entry_sanitization(entry=x, **meta_info["algorithm"], **meta_info["columns"][col]), meta=('x', str))))
    #asd[col] = data[col].map(lambda x: entry_sanitization(entry=x, **meta_info["algorithm"], **meta_info["columns"][col]), meta=('x', str))


workclass
education
marital-status
occupation
race
sex
native-country


In [11]:
meta_info["columns"]["education"]["key_to_order"]

{'10th': 0,
 '11th': 1,
 '12th': 2,
 '1st-4th': 3,
 '5th-6th': 4,
 '7th-8th': 5,
 '9th': 6,
 'Assoc-acdm': 7,
 'Assoc-voc': 8,
 'Bachelors': 9,
 'Doctorate': 10,
 'HS-grad': 11,
 'Masters': 12,
 'Preschool': 13,
 'Prof-school': 14,
 'Some-college': 15}

In [89]:
asd_df =  pd.concat([pd.DataFrame([i for i in v.values],columns=[k+"/"+i for i in meta_info["columns"][k]["key_to_order"].keys()]) for k, v in asd.items()], axis=1)
asd_df["y"] = (data["salary-class"]==" >50K").astype(int)

asd_df = dataframe.from_pandas(asd_df,4)

In [90]:
asd_df

Unnamed: 0_level_0,workclass/Never-worked,workclass/Private,workclass/Local-gov,workclass/?,workclass/Self-emp-inc,workclass/Without-pay,workclass/Federal-gov,workclass/State-gov,workclass/Self-emp-not-inc,sex/Male,sex/Female,native-country/England,native-country/Germany,native-country/Dominican-Republic,native-country/Laos,native-country/Guatemala,native-country/Nicaragua,native-country/Poland,native-country/Ecuador,native-country/Cambodia,native-country/Italy,native-country/India,native-country/Mexico,native-country/Columbia,native-country/Hungary,native-country/Thailand,native-country/Japan,native-country/China,native-country/Ireland,native-country/Yugoslavia,native-country/Scotland,native-country/Holand-Netherlands,native-country/Peru,native-country/Portugal,native-country/United-States,native-country/Puerto-Rico,native-country/?,native-country/El-Salvador,native-country/Philippines,native-country/Hong,native-country/Iran,native-country/Honduras,native-country/Trinadad&Tobago,native-country/Haiti,native-country/Jamaica,native-country/Vietnam,native-country/France,native-country/Canada,native-country/Greece,native-country/Cuba,native-country/Outlying-US(Guam-USVI-etc),native-country/Taiwan,native-country/South,education/HS-grad,education/Doctorate,education/5th-6th,education/1st-4th,education/Assoc-acdm,education/Assoc-voc,education/Bachelors,education/11th,education/7th-8th,education/10th,education/9th,education/Preschool,education/Masters,education/12th,education/Some-college,education/Prof-school,occupation/Farming-fishing,occupation/Armed-Forces,occupation/Adm-clerical,occupation/Machine-op-inspct,occupation/Transport-moving,occupation/Handlers-cleaners,occupation/?,occupation/Priv-house-serv,occupation/Exec-managerial,occupation/Prof-specialty,occupation/Craft-repair,occupation/Other-service,occupation/Tech-support,occupation/Protective-serv,occupation/Sales,marital-status/Married-AF-spouse,marital-status/Widowed,marital-status/Married-civ-spouse,marital-status/Divorced,marital-status/Married-spouse-absent,marital-status/Never-married,marital-status/Separated,race/Black,race/Asian-Pac-Islander,race/Amer-Indian-Eskimo,race/Other,race/White,y
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1
0,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64
8141,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16282,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24423,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32560,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [93]:
# define x and y data
data_x = asd_df[[col for col in asd_df.columns if col!="y"]]
data_y = asd_df["y"]

In [None]:
import dask_ml


In [96]:
from dask_ml.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(data_x, data_y)

TypeError: 

In [109]:
from dask_ml.xgboost import XGBRegressor

est = XGBRegressor()
est.fit(data_x, data_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [110]:
predictions = est.predict(data_x)

In [116]:
corrects = ((predictions>.5).astype(int)==data_y).astype(int).compute()
error = sum(corrects)/len(corrects)

In [123]:
from sklearn.metrics import roc_curve, roc_auc_score
rocc = roc_curve(data_y, predictions)
aucc = roc_auc_score(data_y, predictions)

In [124]:
aucc

0.7902269169319514

In [117]:
print(error)

0.7771260096434385


# areas of opportunitty
+ get the different modes
+ train the models with hyperparameter tunning and correct dataset splitting (cross validation)

In [100]:
data_x.compute()

Unnamed: 0,workclass/Never-worked,workclass/Private,workclass/Local-gov,workclass/?,workclass/Self-emp-inc,workclass/Without-pay,workclass/Federal-gov,workclass/State-gov,workclass/Self-emp-not-inc,sex/Male,...,marital-status/Married-civ-spouse,marital-status/Divorced,marital-status/Married-spouse-absent,marital-status/Never-married,marital-status/Separated,race/Black,race/Asian-Pac-Islander,race/Amer-Indian-Eskimo,race/Other,race/White
0,0.000000,0.000000,0.000000,0.000000,0.333333,0.000000,0.333333,0.333333,0.000000,0.0,...,0.333333,0.000000,0.333333,0.000000,0.333333,0.5,0.0,0.0,0.0,0.5
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,0.333333,0.333333,0.0,...,0.333333,0.333333,0.000000,0.000000,0.000000,0.5,0.0,0.0,0.0,0.5
2,0.000000,0.000000,0.000000,0.333333,0.333333,0.000000,0.333333,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.333333,0.000000,0.5,0.0,0.0,0.0,0.5
3,0.000000,0.000000,0.333333,0.000000,0.333333,0.000000,0.333333,0.000000,0.000000,0.0,...,0.333333,0.000000,0.000000,0.333333,0.333333,0.5,0.0,0.5,0.0,0.0
4,0.000000,0.333333,0.000000,0.000000,0.333333,0.000000,0.333333,0.000000,0.000000,1.0,...,0.333333,0.333333,0.000000,0.000000,0.000000,0.0,0.5,0.5,0.0,0.0
5,0.333333,0.000000,0.000000,0.000000,0.333333,0.000000,0.333333,0.000000,0.000000,1.0,...,0.333333,0.000000,0.000000,0.333333,0.000000,0.0,0.5,0.0,0.0,0.5
6,0.333333,0.333333,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,0.000000,1.0,...,0.000000,0.333333,0.000000,0.000000,0.333333,0.0,0.5,0.5,0.0,0.0
7,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,0.333333,0.000000,0.0,...,0.333333,0.000000,0.000000,0.000000,0.333333,0.0,0.0,0.5,0.0,0.5
8,0.000000,0.333333,0.000000,0.000000,0.333333,0.333333,0.000000,0.000000,0.000000,1.0,...,0.000000,0.333333,0.333333,0.333333,0.000000,0.5,0.0,0.0,0.0,0.5
9,0.000000,0.000000,0.000000,0.333333,0.333333,0.000000,0.333333,0.000000,0.000000,0.0,...,0.333333,0.000000,0.333333,0.333333,0.000000,0.5,0.0,0.0,0.0,0.5


In [87]:
asas[1]

' >50K'

In [4]:
for col in cat_columns:
    print(asd[col][0].shape[0]==len(data[col].unique()))


True
True
True
True
True
True
True


In [65]:
?pd.concat

[0;31mSignature:[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0mobjs[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mjoin[0m[0;34m=[0m[0;34m'outer'[0m[0;34m,[0m [0mjoin_axes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mignore_index[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mkeys[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mlevels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mnames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mverify_integrity[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0msort[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mcopy[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Concatenate pandas objects along a particular axis with optional set logic
along the other axes.

Can also add a layer of hierarchical indexing on the concatenation axis,
which may be useful if the labels are the same (or overlapping) on
the passed axis number.

Parameters
----------
objs : a sequence or mapping of S

In [64]:
asd["education"].map(str)

0        [0.  0.2 0.  0.  0.  0.  0.2 0.2 0.  0.2 0.  0...
1        [0.  0.  0.2 0.  0.  0.  0.  0.  0.  0.2 0.2 0...
2        [0.  0.2 0.  0.  0.  0.  0.  0.  0.  0.2 0.2 0...
3        [0.  0.2 0.  0.2 0.  0.2 0.  0.2 0.  0.  0.  0...
4        [0.  0.2 0.  0.  0.2 0.  0.  0.  0.  0.2 0.2 0...
5        [0.  0.  0.2 0.  0.  0.2 0.  0.2 0.  0.  0.  0...
6        [0.  0.2 0.  0.  0.  0.  0.2 0.  0.2 0.2 0.  0...
7        [0.2 0.  0.2 0.  0.  0.  0.2 0.  0.  0.2 0.  0...
8        [0.  0.  0.2 0.2 0.  0.  0.  0.  0.  0.2 0.  0...
9        [0.2 0.  0.  0.  0.  0.  0.  0.2 0.  0.2 0.  0...
10       [0.  0.  0.  0.2 0.  0.2 0.2 0.  0.  0.  0.2 0...
11       [0.  0.  0.  0.  0.2 0.  0.  0.  0.2 0.2 0.  0...
12       [0.  0.  0.  0.  0.  0.2 0.  0.2 0.  0.2 0.  0...
13       [0.  0.2 0.  0.  0.  0.2 0.2 0.2 0.2 0.  0.  0...
14       [0.  0.  0.  0.  0.  0.  0.  0.  0.2 0.2 0.2 0...
15       [0.  0.  0.  0.  0.2 0.2 0.  0.  0.  0.  0.2 0...
16       [0.2 0.2 0.2 0.  0.  0.  0.  0.  0.2 0.  0.  0.

In [6]:
data[col].compute()

0        United-States
1        United-States
2        United-States
3        United-States
4                 Cuba
5        United-States
6              Jamaica
7        United-States
8        United-States
9        United-States
10       United-States
11               India
12       United-States
13       United-States
14                   ?
15              Mexico
16       United-States
17       United-States
18       United-States
19       United-States
20       United-States
21       United-States
22       United-States
23       United-States
24       United-States
25       United-States
26       United-States
27               South
28       United-States
29       United-States
             ...      
32531    United-States
32532    United-States
32533            Japan
32534    United-States
32535    United-States
32536    United-States
32537    United-States
32538    United-States
32539    United-States
32540    United-States
32541    United-States
32542    United-States
32543    Un

In [6]:
for pr in range(1,11): # si lo llevamos hasta 16 cubrimos de forma correcta otro par de columnas
        cases += [[pr, False, True, True, true_prob, False],
                  [pr, True, True, True, true_prob, False],
                  [pr, False, True, True, true_prob, True],
                  [pr, True, False, False, true_prob, False],
                  [pr, False, False, False, true_prob, False],
                  [pr, False, False, False, true_prob, True]]

processed_cases = list()
case_model_scores = dict()
reco_list = list()
for case in cases:
    case_name = str(case[0])+("m" if case[5] else "t" if case[1] else "f") +\
                ("t" if case[2] else "f")+("t" if case[3] else "f")+(str(case[4]) if (case[1] and not case[2]) else "")
    if case_name not in processed_cases:
        data = pn.read_csv(income_dataset_path)

        data_cols = data.columns
        cat_columns = [u'workclass', u'education', u'marital-status', u'occupation',
                   u'race', u'sex', u'native-country']

        oh = preprocessing.OneHotEncoder()
        le = preprocessing.LabelEncoder()
        all_columns = ["age"]
        case2 = case
        for col in cat_columns:
            rel_privacy = math.ceil(float(case[0])/10*len(data[col].unique()))
            case[0] = rel_privacy
            cis = pn.DataFrame.from_dict(Counter(data[col]), "index").reset_index()
            cis.columns = ["class", "CIS"]
            field_dict = operator_model(data[col], *case2)
            real_col = data[col]
            data.drop(col, axis=1, inplace=True)
            nis_rmse = dict()
            for field in field_dict.keys():
                field_name = "_".join([col, field])
                data.loc[:, field_name] = field_dict[field]
                rmse = ((real_col == field) - field_dict[field]).map(lambda x: x*x).sum()
                nis_rmse[field] = rmse
                if field_name not in all_columns:
                    all_columns += [field_name]
            nis = pn.DataFrame.from_dict(data.loc[:, [col+"_"+x for x in field_dict.keys()]].sum().to_dict(), "index").reset_index()
            nis = pn.DataFrame.from_dict(nis_rmse, "index").reset_index()
            nis.columns = ["class", "NIS"]

            tmp_df = cis.merge(nis, how="left")
            tmp_df["column"] = col
            tmp_df["case"] = case_name
            reco_list.append(tmp_df)
            
        std_cols = ["age"]

        std_scaler = preprocessing.StandardScaler()
        for col in std_cols:
            data.loc[:, col] = std_scaler.fit_transform(data[col].reshape(-1,1))

        data_sanitized = data[all_columns + ["salary-class"]]
        data.to_csv("../data/hist_python/sanitized_census_"+case_name+".csv")
        # apply a suppervised algorithm
        case_model_scores[case_name] = dict()
        print(case_name)
        #for model_name, model in model_dict.items():
        #    case_model_scores[case_name][model_name] = get_auc_score_of_model(data_sanitized, model)
        processed_cases.append(case_name)

NameError: name 'cases' is not defined

In [None]:
reco_df = pn.concat(reco_list)
reco_df.to_csv("supervised_rmse_df.csv")

# since the RMSE matters independently of the supervised taggs it is better to analyse the 
# RMSE in the non supervised case since there is more control of the number of classes.
# construct a dataframe from the scores dictionary
df_models_scores = pn.DataFrame.from_dict(case_model_scores, orient="index").reset_index().rename(columns={"index":"case"})
df_models_scores = df_models_scores.melt( value_vars=df_models_scores.columns, value_name="models")
df_models_scores["error"] = df_models_scores["models"].map(lambda x: x[0])
df_models_scores["auc"] = df_models_scores["models"].map(lambda x: x[1])
df_models_scores["roc"] = df_models_scores["models"].map(lambda x: x[2])
df_models = df_models_scores[["case", "variable", "error", "auc", "roc"]]
df_models.columns = [["case", "model", "error", "auc", "roc"]]
df_models.to_csv("model_scores_roc.csv")

In [None]:
from sanitization_tools import *
supervised_results = pn.read_csv("model_scores_roc.csv")
rocs_by_case(supervised_results, {},{"real":["t","f","m"]}, savefig=True, title="by IF REAL", save_name="income_roc_privacy_grouped_tmf",language="spanish")
rocs_by_case(supervised_results, {"uniform":1, "uniform2":1},{"real":["t","f","m"]}, savefig=True, title="by Privacy Uniform Weights", save_name="income_roc_privacy_grouped_tmf_1", language="spanish")
rocs_by_case(supervised_results, {"uniform":0, "uniform2":0},{"real":["t","f","m"]}, savefig=True, title="by Privacy Proportional Weights", save_name="income_roc_privacy_grouped_tmf_0", language="spanish")
rocs_by_case(supervised_results, {"real":"t", "uniform":1, "uniform2":1},{"privacy":[i for i in range(1,11)]}, savefig=True, title="by Privacy Real Unifrom", save_name="income_roc_privacyt1", language="spanish")
rocs_by_case(supervised_results, {"real":"t", "uniform":0, "uniform2":0},{"privacy":[i for i in range(1,11)]}, savefig=True, title="by Privacy Real Proportional", save_name="income_roc_privacyt0", language="spanish")
rocs_by_case(supervised_results, {"real":"f", "uniform":1, "uniform2":1},{"privacy":[i for i in range(1,11)]}, savefig=True, title="by Privacy NonReal Uniform", save_name="income_roc_privacyf1", language="spanish")
rocs_by_case(supervised_results, {"real":"f", "uniform":0, "uniform2":0},{"privacy":[i for i in range(1,11)]}, savefig=True, title="by Privacy NonReal Proportional", save_name="income_roc_privacyf0", language="spanish")
rocs_by_case(supervised_results, {"real":"m", "uniform":1, "uniform2":1},{"privacy":[i for i in range(1,11)]}, savefig=True, title="by Privacy MaybeReal Uniform", save_name="income_roc_privacym1", language="spanish")
rocs_by_case(supervised_results, {"real":"m", "uniform":0, "uniform2":0},{"privacy":[i for i in range(1,11)]}, savefig=True, title="by Privacy MaybeReal Proportional", save_name="income_roc_privacym0", language="spanish")

print(supervised_results.columns)

In [None]:
plot_intervals(supervised_results, "privacy","auc", {}, 
               {"real":["t", "f", "m"]}, savefig=True, 
               title="AUC Privacy by Real", save_name="auc_real_privacy")

plot_intervals(supervised_results, "privacy","auc", {}, 
               {"uniform":0,1]}, savefig=True, 
               title="AUC Privacy by Uniform", save_name="auc_uniform_privacy")

In [None]:
from sanitization_tools import *
supervised_results = pn.read_csv("model_scores_roc.csv")

rocs_by_case(supervised_results, {"model": "tree"},
                {"privacy":[i for i in range(1,11)]}, savefig=False, title="by Privacy TREE")
rocs_by_case(supervised_results, {"model": "svm"},
                {"privacy":[i for i in range(1,11)]}, savefig=False, title="by Privacy SVM")
rocs_by_case(supervised_results, {"model": "linear_regression"},
                {"privacy":[i for i in range(1,11)]}, savefig=False, title="by Privacy Linear Regression")
rocs_by_case(supervised_results, {"model": "naive_bayes"},
                {"privacy":[i for i in range(1,11)]}, savefig=False, title="by Privacy Naive Bayes")

In [None]:
rmse_auc_plot_no_intervals(supervised_results, "privacy", "auc", ["t", "f", "m"], [None], [None], [None], [None],
                           {("uniform","uniform2"):[(1,1),(0,0)]}, savefig=True, 
                           title="Supervised AUC for Real and Uniform", save_name="supervised_auc_gb_tmf_01")

rmse_auc_plot_no_intervals(supervised_results, "privacy", "auc", [None], [None], [None], [None], ["svm", "linear_regression", "tree", "naive_bayes"],
                           savefig=True, title="Supervised AUC for Real and Uniform", save_name="supervised_auc_models")

### AUC Table

In [None]:
auc_sum = supervised_results.groupby(["privacy", "real"])["auc"].mean().reset_index()
auc_pivsum = auc_sum.pivot(index="privacy", columns="real", values="auc" ).round(2)
print(auc_pivsum.to_latex())

auc_sum = supervised_results[(supervised_results.uniform==1) & (supervised_results.uniform2==1)].groupby(["privacy", "real"])["auc"].mean().reset_index()
auc_pivsum = auc_sum.pivot(index="privacy", columns="real", values="auc" ).round(2)
print(auc_pivsum.to_latex())

auc_sum = supervised_results[(supervised_results.uniform==0) & (supervised_results.uniform2==0)].groupby(["privacy", "real"])["auc"].mean().reset_index()
auc_pivsum = auc_sum.pivot(index="privacy", columns="real", values="auc" ).round(2)
print(auc_pivsum.to_latex())

## Model Comparison

In [None]:
plot_intervals(supervised_results, "privacy","auc", {}, 
               {"model":["svm", "linear_regression", "tree", "naive_bayes"]}, savefig=True, 
               title="AUC Privacy by Model", save_name="auc_model_privacy")

plot_intervals(supervised_results, "privacy","auc", {"uniform": [0], "uniform2":[0], "uniform_original":[0]}, 
               {"model":["svm", "linear_regression", "tree", "naive_bayes"]}, savefig=False, 
               title="Non Uniform and Exponential Original", save_name="auc_nclasses00")

In [None]:
plot_intervals(supervised_results, "real", "auc", {}, 
               {"model":["svm", "linear_regression", "tree", "naive_bayes"]}, savefig=False, 
               title="Non Uniform and Exponential Original", save_name="auc_isreal")

### Bar plots

In [None]:
from sanitization_tools import *
supervised_results = pn.read_csv("model_scores_roc.csv")

df = supervised_results
gb_param = "real"
yaxis = "auc"
base_filter = {}
lines_cases = {"model":["svm", "linear_regression", "tree", "naive_bayes"]}
savefig=True
title="Models by if 'real' is included"
save_name="include_real_model_wr"
language="english"
width_delta=.1

fig, ax = plt.subplots()
pt = base_filter.get("privacy")
if pt is not None:
    base_filter.pop("privacy")
    df = df.query("privacy < {pt}".format(pt=pt))
if "uniform" in gb_param:
    df = df[df.uniform == df.uniform2]

df = get_base_filtered_df(df, base_filter)
dfc = get_single_filter_df(df, "privacy", 1)
dfc = get_single_filter_df(dfc, "uniform", 1)
dfc = get_single_filter_df(dfc, "real", "t")
gb0 = dfc.groupby(["model"])[yaxis].mean()
print(gb0)
df = get_base_filtered_df(df, base_filter)
ps = list()
labels = list()
width = 0
scatter_x = list()
scatter_y = list()
if len(lines_cases)>0:
    for k, v in lines_cases.items():
        v = [v] if not isinstance(v, list) else v
        for v0 in v:
            dfc = get_single_filter_df(df, k, v0)

            gb = dfc.groupby([gb_param])[yaxis].mean().reset_index()
            gb2 = dfc.groupby([gb_param])[yaxis].std().reset_index()

            x = gb[gb_param].unique()
            print(gb)
            print(x)
            ind = np.arange(len(x))
            curr_p = ax.bar(ind + width, gb[yaxis], width_delta, color=np.random.rand(3,),
                            bottom=0, yerr=gb2[yaxis])
            scatter_y.extend([gb0.loc[v0]]*3)
            scatter_x.extend(ind+width)
            ps.append(curr_p)
            param_dict = {k: v0}
            tt = get_label_name(param_dict, True, language)
            labels.append(tt)
            width += width_delta
else:
    gb = df.groupby([gb_param])[yaxis].mean().reset_index()
    gb2 = df.groupby([gb_param])[yaxis].std().reset_index()

    x = gb[gb_param].unique()
    ind = np.arange(len(x))
    curr_p = ax.bar(ind+width, gb[yaxis], width_delta, color=np.random.rand(3,),
                    bottom=0, yerr=gb2[yaxis])
    ps.append(curr_p)
    tt = get_label_name(base_filter, True, language)
    labels.append(tt)
    width += width_delta

plt.scatter(scatter_x, scatter_y, color="k", s=100)
ax.set_title(title)
ax.set_xticks(ind + width_delta / 2)
ax.set_ylabel(yaxis)
x = label_rename(x, language)
ax.set_xticklabels(x, rotation=45, ha="right")
ax.legend([list(p)[0] for p in ps], labels)

dict_use = english_dict if language == "english" else spanish_dict
gb_param = dict_use.get(gb_param.lower()) if dict_use.get(gb_param.lower()) else gb_param
yaxis = dict_use.get(yaxis.lower()) if dict_use.get(yaxis.lower()) else yaxis
ax.set_xlabel(gb_param.upper())
ax.set_ylabel(yaxis.upper())
plt.tight_layout()
if savefig:
    plt.savefig(figures_path + save_name + ".png")
plt.show()

plot_bars(supervised_results, "real", "auc", {}, 
               {"model":["svm", "linear_regression", "tree", "naive_bayes"]}, savefig=True, 
               title="Models by if 'real' is included", save_name="include_real_model", width_delta=.1)

plot_bars(supervised_results, "uniform", "auc", {}, 
               {"model":["svm", "linear_regression", "tree", "naive_bayes"]}, savefig=True, 
               title="Models by if sanitization distribution", save_name="uniform_model", width_delta=.1)

In [None]:
from sanitization_tools import *
supervised_results = pn.read_csv("model_scores_roc.csv")

df = supervised_results
gb_param = "real"
yaxis = "auc"
base_filter = {}
lines_cases = {"real":["t","f","m"]}
savefig=True
title="Is  Real"
save_name="privacy_is_real"
width_delta=.1


fig, ax = plt.subplots()
pt = base_filter.get("privacy")
if pt is not None:
    base_filter.pop("privacy")
    df = df.query("privacy < {pt}".format(pt=pt))
df = df[df.uniform == df.uniform2]
df = get_base_filtered_df(df, base_filter)
ps = list()
labels = list()
width = 0
xticks = list()
xticks_locs = list()
colors = {"t":"b","f":"r","m":"g"}
if len(lines_cases)>0:
    for k, v in lines_cases.items():
        v = [v] if not isinstance(v, list) else v
        for v0 in v:
            dfc = get_single_filter_df(df, k, v0)

            gb = dfc.groupby([gb_param])[yaxis].mean().reset_index()
            gb2 = dfc.groupby([gb_param])[yaxis].std().reset_index()

            x = gb[gb_param].unique()
            xticks.extend(x)
            ind = np.arange(len(x))
            xticks_locs.append(ind+width)
            curr_p = ax.bar(ind + width, gb[yaxis], width_delta, color=colors[x[0]],
                            bottom=0, yerr=gb2[yaxis])
            ps.append(curr_p)
            param_dict = {k: v0}
            tt = get_label_name(param_dict, True, "spanish")
            labels.append(tt)
            width += width_delta

ax.set_title(title)
#ax.set_xticks(ind + width_delta / 2)
ax.set_xticks(xticks_locs)
ax.set_ylabel(yaxis)
xticks = label_rename(xticks, "spanish")
ax.set_xticklabels(xticks, rotation = 45, ha="right")
ax.legend([p[0] for p in ps], labels)

ax.set_xlabel(gb_param.upper())
ax.set_ylabel(yaxis.upper())
plt.tight_layout()
if savefig:
    plt.savefig(figures_path + save_name + ".png")
plt.show()
    

    
plot_bars_single_chunk(df = supervised_results, gb_param = "privacy",yaxis = "auc", base_filter = {"real":"t"}, 
                       lines_cases = {"privacy":[str(i) for i in range(1,11)]}, savefig=True, title="AUC by Privacy, Include Real", 
                       save_name="privacy_auc_bar_t", width_delta=.1, language="english")
plot_bars_single_chunk(df = supervised_results, gb_param = "privacy",yaxis = "auc", base_filter = {"real":"f"}, 
                       lines_cases = {"privacy":[str(i) for i in range(1,11)]}, savefig=True, title="AUC by Privacy, No Real", 
                       save_name="privacy_auc_bar_f", width_delta=.1, language="english")
plot_bars_single_chunk(df = supervised_results, gb_param = "privacy",yaxis = "auc", base_filter = {"real":"t", "uniform":0}, 
                       lines_cases = {"privacy":[str(i) for i in range(1,11)]}, savefig=True, title="AUC by Privacy, Include Real \n Proportional", 
                       save_name="privacy_auc_bar_t0", width_delta=.1, language="english")
plot_bars_single_chunk(df = supervised_results, gb_param = "privacy",yaxis = "auc", base_filter = {"real":"t", "uniform":1}, 
                       lines_cases = {"privacy":[str(i) for i in range(1,11)]}, savefig=True, title="AUC by Privacy, Include Real \n Uniform", 
                       save_name="privacy_auc_bar_t1", width_delta=.1, language="english")
plot_bars_single_chunk(df = supervised_results, gb_param = "privacy",yaxis = "auc", base_filter = {"real":"f", "uniform":0}, 
                       lines_cases = {"privacy":[str(i) for i in range(1,11)]}, savefig=True, title="AUC by Privacy, Include Real \n Proportional", 
                       save_name="privacy_auc_bar_f0", width_delta=.1, language="english")
plot_bars_single_chunk(df = supervised_results, gb_param = "privacy",yaxis = "auc", base_filter = {"real":"f", "uniform":1}, 
                       lines_cases = {"privacy":[str(i) for i in range(1,11)]}, savefig=True, title="AUC by Privacy, Include Real \n Uniform", 
                       save_name="privacy_auc_bar_f1", width_delta=.1, language="english")


## Analyse the Non-Supervised Set

In [None]:
from sanitization_tools import *
non_supervised_results = pn.read_csv("rmse_df_simulated_rel.csv")

rmse_auc_plot_no_intervals(non_supervised_results, "privacy", "rmse", 
                           ["t", "m", "f"], [None], [None], [0,1], [None],
                            {("uniform","uniform2"):[(1,1),(0,0)]}, savefig=True, 
                           title="All Cases Privacy (R,U,UO)", save_name="rmse_privacy")
rmse_auc_plot_no_intervals(non_supervised_results, "nclasses", "rmse", 
                           ["t", "m", "f"], [None], [None], [0,1], [None],
                            {("uniform","uniform2"):[(1,1),(0,0)]}, savefig=True, 
                           title="All Cases Number Classes (R,U,UO)", save_name="rmse_nclasses")
rmse_auc_plot_no_intervals(non_supervised_results, "privacy", "chi", 
                           ["t", "m", "f"], [None], [None], [0,1], [None],
                            {("uniform","uniform2"):[(1,1),(0,0)]}, savefig=True, 
                           title="All Cases Privacy (R,U,UO)", save_name="privacy")
rmse_auc_plot_no_intervals(non_supervised_results, "nclasses", "chi", 
                           ["t", "m", "f"], [None], [None], [0,1], [None],
                            {("uniform","uniform2"):[(1,1),(0,0)]}, savefig=True, 
                           title="All Cases Number Classes (R,U,UO)", save_name="nclasses")



In [None]:
from sanitization_tools import *
non_supervised_results = pn.read_csv("rmse_df_simulated_rel.csv")

rmse_auc_plot_with_intervals(non_supervised_results, "privacy", "chi", 
                           ["t", "m", "f"], [None], [None], [0,1], [None],
                            {("uniform","uniform2"):[(1,1),(0,0)]}, savefig=True, 
                           title="All Cases Privacy (R,U,UO)", save_name="privacy")
rmse_auc_plot_with_intervals(non_supervised_results, "nclasses", "chi", 
                           ["t", "m", "f"], [None], [None], [0,1], [None],
                            {("uniform","uniform2"):[(1,1),(0,0)]}, savefig=True, 
                           title="All Cases Number Classes (R,U,UO)", save_name="nclasses")


In [None]:
from sanitization_tools import *
non_supervised_results = pn.read_csv("rmse_df_simulated_rel.csv")

plot_intervals(non_supervised_results, "nclasses","rmse", {"uniform": [0], "uniform2":[0], "uniform_original":[0]}, 
               {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Exponential Original", save_name="rmse_nclasses00")
plot_intervals(non_supervised_results, "nclasses","rmse",{"uniform": [0], "uniform2":[0], "uniform_original":[1]},
               {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Uniform Original", save_name="rmse_nclasses01")
plot_intervals(non_supervised_results, "nclasses","rmse", {"uniform": [1], "uniform2":[1], "uniform_original":[0]},
                {"real":["t", "m", "f"]}, savefig=True, title="Uniform and Exponential Original", save_name="rmse_nclasses10")
plot_intervals(non_supervised_results, "nclasses","rmse", {"uniform": [1], "uniform2":[1], "uniform_original":[1]},  {"real":["t", "m", "f"]},
               savefig=True, title="Uniform and Uniform Original", save_name="rmse_nclasses11")


plot_intervals(non_supervised_results, "privacy","rmse", {"uniform": [0], "uniform2":[0], "uniform_original":[0]},
                {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Exponential Original", save_name="rmse_privacy00")
plot_intervals(non_supervised_results, "privacy","rmse",{"uniform": [0], "uniform2":[0], "uniform_original":[1]},
                {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Uniform Original", save_name="rmse_privacy01")
plot_intervals(non_supervised_results, "privacy","rmse", {"uniform": [1], "uniform2":[1], "uniform_original":[0]},
                {"real":["t", "m", "f"]}, savefig=True, title="Uniform and Exponential Original", save_name="rmse_privacy10")
plot_intervals(non_supervised_results, "privacy","rmse", {"uniform": [1], "uniform2":[1], "uniform_original":[1]}, 
                {"real":["t", "m", "f"]},savefig=True, title="Uniform and Uniform Original", save_name="rmse_privacy11")

In [None]:
from sanitization_tools import *
non_supervised_results = pn.read_csv("rmse_df_simulated_rel.csv")

plot_intervals(non_supervised_results, "nclasses","chi", {"uniform": [0], "uniform2":[0], "uniform_original":[0]}, 
               {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Exponential Original", save_name="nclasses00")
plot_intervals(non_supervised_results, "nclasses","chi",{"uniform": [0], "uniform2":[0], "uniform_original":[1]},
               {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Uniform Original", save_name="nclasses01")
plot_intervals(non_supervised_results, "nclasses","chi", {"uniform": [1], "uniform2":[1], "uniform_original":[0]},
                {"real":["t", "m", "f"]}, savefig=True, title="Uniform and Exponential Original", save_name="nclasses10")
plot_intervals(non_supervised_results, "nclasses","chi", {"uniform": [1], "uniform2":[1], "uniform_original":[1]},  {"real":["t", "m", "f"]},
               savefig=True, title="Uniform and Uniform Original", save_name="nclasses11")


plot_intervals(non_supervised_results, "privacy","chi", {"uniform": [0], "uniform2":[0], "uniform_original":[0]},
                {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Exponential Original", save_name="privacy00")
plot_intervals(non_supervised_results, "privacy","chi",{"uniform": [0], "uniform2":[0], "uniform_original":[1]},
                {"real":["t", "m", "f"]}, savefig=True, title="Non Uniform and Uniform Original", save_name="privacy01")
plot_intervals(non_supervised_results, "privacy","chi", {"uniform": [1], "uniform2":[1], "uniform_original":[0]},
                {"real":["t", "m", "f"]}, savefig=True, title="Uniform and Exponential Original", save_name="privacy10")
plot_intervals(non_supervised_results, "privacy","chi", {"uniform": [1], "uniform2":[1], "uniform_original":[1]}, 
                {"real":["t", "m", "f"]},savefig=True, title="Uniform and Uniform Original", save_name="privacy11")

In [None]:
plot_intervals_std(non_supervised_results, "privacy","chi", {},
                {"real":["t", "m", "f"]}, savefig=True, title="Privacy Considering if is Real", save_name="privacy_isreal")
plot_intervals_std(non_supervised_results, "privacy","chi", {},
                {"uniform":[0,1]}, savefig=True, title="Privacy Considering if Uniform", save_name="privacy_uniform")
plot_intervals_std(non_supervised_results, "privacy","chi", {"uniform_original":[1]},
                {"uniform":[0,1]}, savefig=True, title="Privacy Considering if is Sanitization Distribution \n and Uniform Original", save_name="privacy_uniform_original1")
plot_intervals_std(non_supervised_results, "privacy","chi", {"uniform_original":[0]},
                {"uniform":[0,1]}, savefig=True, title="Privacy Considering if is Sanitization Distribution \n and Exponential Original", save_name="privacy_uniform_original0")

plot_intervals_std(non_supervised_results, "nclasses","chi", {},
                {"real":["t", "m", "f"]}, savefig=True, title="Number of Classes Considering if is Real", save_name="nclasses_isreal")
plot_intervals_std(non_supervised_results, "nclasses","chi", {},
                {"uniform":[0,1]}, savefig=True, title="Number of Classes Considering if Uniform", save_name="nclasses_uniform")
plot_intervals_std(non_supervised_results, "nclasses","chi", {"uniform_original":[1]},
                {"uniform":[0,1]}, savefig=True, title="Number of Classes Considering if is Sanitization Distribution \n and Uniform Original", save_name="nclassses_uniform_original1")
plot_intervals_std(non_supervised_results, "nclasses","chi", {"uniform_original":[0]},
                {"uniform":[0,1]}, savefig=True, title="Number of Classes Considering if is Original Distribution \n and Non Exponential Original", save_name="nclasses_uniform_original0")
