## Sanitization over Income Dataset

The Census Income dataset can be downloaded from the UCI Machine Learning Repository: [dataset](https://archive.ics.uci.edu/ml/datasets/census+income)

In [8]:
from sanitization_tools import *
import math
import re
# CSV file that the sanitization loop reads for every case.
income_dataset_path = "census_level_0.csv"

# Supervised models scored on each sanitized copy of the data; the keys are
# the model labels that end up in the score table.
model_dict = {
    "linear_regression": linear_model.LinearRegression(),
    "svm": svm.SVC(gamma=0.001, C=100.),
    "naive_bayes": naive_bayes.GaussianNB(),
    "tree": tree.DecisionTreeRegressor(),
}

true_prob = None

# Flag combinations for positions 1, 2, 3 and 5 of a case. Every case has the
# layout [privacy, flag, flag, flag, true_prob, flag] and is later forwarded
# positionally to operator_model.
# NOTE(review): the meaning of each flag is defined by operator_model, which
# is not visible here -- confirm against sanitization_tools.
flag_templates = (
    (False, True, True, False),
    (True, True, True, False),
    (False, True, True, True),
    (True, False, False, False),
    (False, False, False, False),
    (False, False, False, True),
)

cases = []
# If we take the range up to 16 we correctly cover another couple of columns.
for pr in range(1, 11):
    # Build a fresh list per case so that cases never share state.
    cases.extend(
        [pr, first, second, third, true_prob, last]
        for first, second, third, last in flag_templates
    )

# For every case: sanitize each categorical column of the census data with
# operator_model, record per-class statistics of the sanitized columns in
# reco_list, and score every model in model_dict on the sanitized data.
processed_cases = list()
case_model_scores = dict()
reco_list = list()

# Categorical columns replaced by the fields operator_model returns.
cat_columns = [u'workclass', u'education', u'marital-status', u'occupation',
               u'race', u'sex', u'native-country']
# Numeric columns that are standardized before modelling.
std_cols = ["age"]

# The CSV does not change between cases, so read it once and give each case
# its own copy (the loop below drops and adds columns in place).
raw_data = pn.read_csv(income_dataset_path)

for case in cases:
    # Case label: privacy level followed by one letter per flag
    # ("m" when case[5] is set, otherwise "t"/"f" for case[1]).
    case_name = str(case[0])+("m" if case[5] else "t" if case[1] else "f") +\
                ("t" if case[2] else "f")+("t" if case[3] else "f")+(str(case[4]) if (case[1] and not case[2]) else "")
    if case_name in processed_cases:
        continue

    data = raw_data.copy()
    all_columns = ["age"]
    for col in cat_columns:
        # Scale the 1-10 privacy level to the number of classes of this
        # column. Always start from the case's original level: writing the
        # scaled value back into `case` would compound the scaling from one
        # column to the next.
        rel_privacy = math.ceil(float(case[0])/10*len(data[col].unique()))
        case_args = [rel_privacy] + list(case[1:])

        # CIS: number of rows per class in the original column.
        cis = pn.DataFrame.from_dict(Counter(data[col]), "index").reset_index()
        cis.columns = ["class", "CIS"]

        field_dict = operator_model(data[col], *case_args)
        real_col = data[col]
        data.drop(col, axis=1, inplace=True)

        nis_rmse = dict()
        for field in field_dict.keys():
            field_name = "_".join([col, field])
            data.loc[:, field_name] = field_dict[field]
            # Sum of squared differences between the true class indicator
            # and the sanitized field (no square root is taken).
            rmse = ((real_col == field) - field_dict[field]).map(lambda x: x*x).sum()
            nis_rmse[field] = rmse
            if field_name not in all_columns:
                all_columns += [field_name]

        # NIS: the per-class squared-error sum computed above.
        nis = pn.DataFrame.from_dict(nis_rmse, "index").reset_index()
        nis.columns = ["class", "NIS"]

        tmp_df = cis.merge(nis, how="left")
        tmp_df["column"] = col
        tmp_df["case"] = case_name
        reco_list.append(tmp_df)

    std_scaler = preprocessing.StandardScaler()
    for col in std_cols:
        # Series has no reshape(); go through the underlying array and
        # flatten the (n, 1) result back into a column.
        scaled = std_scaler.fit_transform(data[col].values.reshape(-1, 1))
        data[col] = scaled.ravel()

    data_sanitized = data[all_columns + ["salary-class"]]
    # Apply the supervised algorithms to the sanitized data.
    case_model_scores[case_name] = dict()
    print(case_name)
    for model_name, model in model_dict.items():
        case_model_scores[case_name][model_name] = get_auc_score_of_model(data_sanitized, model)
    processed_cases.append(case_name)

# Persist the per-class statistics collected for every case and column.
reco_df = pn.concat(reco_list)
reco_df.to_csv("supervised_df.csv")

# One row per (case, model); "value" holds whatever get_auc_score_of_model
# returned for that pair.
df_models_scores = pn.DataFrame.from_dict(case_model_scores, orient="index").reset_index().rename(columns={"index":"case"})
df_models_scores = df_models_scores.melt(id_vars=["case"]).rename(columns={"variable":"model"})

# Decode the case label: leading digits are the privacy level, the following
# non-digit characters are the flag letters written when the label was built.
# Raw strings keep "\d" from being treated as an invalid string escape.
df_models_scores["privacy"] = df_models_scores["case"].map(lambda x: re.findall(r"\d+", x)[0])
df_models_scores["real"] = df_models_scores["case"].map(lambda x: re.findall(r"[^\d]", x)[0])
df_models_scores["uniform"] = df_models_scores["case"].map(lambda x: int(re.findall(r"[^\d]", x)[1] == "t"))
df_models_scores["uniform2"] = df_models_scores["case"].map(lambda x: int(re.findall(r"[^\d]", x)[2] == "t"))

# NOTE(review): assumes each score is indexable as
# (error, auc, (roc_x, roc_y)) -- confirm against get_auc_score_of_model.
df_models_scores["error"] = df_models_scores["value"].map(lambda x: x[0])
df_models_scores["auc"] = df_models_scores["value"].map(lambda x: x[1])

df_models_scores["roc_x"] = df_models_scores["value"].map(lambda x: all_entries_vector(x[2][0]))
df_models_scores["roc_y"] = df_models_scores["value"].map(lambda x: all_entries_vector(x[2][1]))

# Selecting the columns already yields the final header; re-assigning
# `.columns` with a nested list would turn it into a MultiIndex.
output_columns = ["case", "model", "privacy", "real", "uniform", "uniform2", "error", "auc", "roc_x", "roc_y"]
df_models = df_models_scores[output_columns]
df_models.to_csv("model_scores_roc.csv")



1ftt


## Sanitization over Simulated Dataset

In [None]:
import multiprocessing
from sanitization_tools import *
import math
import re
# Simulation settings: number of values drawn per simulated column and number
# of repetitions of every parameter combination.
column_size = 1000
nsim_case = 10

# Flag combinations for positions 3, 4, 5 and 7 of a case. Every case has the
# layout [privacy, nclasses, class_dist, flag, flag, flag, true_prob, flag].
# NOTE(review): the meaning of each flag is defined by operator_model, which
# is not visible here -- confirm against sanitization_tools.
flag_templates = (
    (False, True, True, False),
    (True, True, True, False),
    (False, True, True, True),
    (True, False, False, False),
    (False, False, False, False),
    (False, False, False, True),
)

cases = []
for nclasses in range(2, 30)[::1]:
    for true_prob in [None]:
        for pr in range(1, 11):
            for class_dist in ['uniform', 'exponential']:
                for nsim in range(nsim_case):
                    # Build a fresh list per case so that repetitions never
                    # share state.
                    cases.extend(
                        [pr, nclasses, class_dist, first, second, third, true_prob, last]
                        for first, second, third, last in flag_templates
                    )

# Bookkeeping containers initialised alongside the case grid.
n = 0
processed_cases = []
reco_df = pn.DataFrame(columns=["case", "class", "CIS", "NIS"])
rmse_by_case = {}
def process_case(case_t):
    """Simulate one categorical column, sanitize it and measure the distortion.

    Parameters
    ----------
    case_t : tuple
        ``(case, extra)``. Only ``case`` is used; it has the layout
        ``[privacy, nclasses, class_dist, flag, flag, flag, true_prob, flag]``.
        The second element is accepted for compatibility with existing
        callers and ignored. ``case`` is not modified.

    Returns
    -------
    tuple
        ``(case_name, rmse_one, chi_one)``: the case label, the square root
        of the summed squared differences between original and sanitized
        per-class totals, and the sum of those squared differences divided
        by the original count (classes with a zero count are skipped).
    """
    case = case_t[0]
    # Label: privacy level, one letter per flag, then nclasses and the
    # distribution name separated by underscores.
    case_name = str(case[0])+("m" if case[7] else ("t" if case[3] else "f")) + ("t" if case[4] else "f") + \
                ("t" if case[5] else "f")
    case_name += '_' + str(case[1]) + '_' + str(case[2])
    nclasses = case[1]
    class_dist = case[2]
    print(class_dist)

    # Class probabilities: equal weights, or whatever expo_weights returns.
    p = [1./nclasses]*nclasses if class_dist == 'uniform' else expo_weights(nclasses)
    sim_data = np.random.choice(range(nclasses), column_size, p=p)

    # CIS: number of simulated values per class.
    cis = pn.DataFrame.from_dict(Counter(sim_data), "index").reset_index()
    cis.columns = ["class", "CIS"]

    # operator_model takes the privacy level scaled to the number of classes,
    # followed by the flags and true_prob (nclasses and class_dist are left
    # out). Build a new argument list instead of popping from `case`, so the
    # caller's list stays intact.
    rel_privacy = math.ceil(float(case[0])/10*nclasses)
    operator_args = [rel_privacy] + list(case[3:])
    field_dict = operator_model(sim_data, *operator_args)

    # NIS: column totals of the sanitized fields.
    nis = pn.DataFrame.from_dict(field_dict).sum(axis=0).reset_index()
    nis.columns = ["class", "NIS"]

    tmp_df = cis.merge(nis, how="left")
    # 'RMSE' holds the per-class squared difference; the root is taken once
    # over the sum below.
    tmp_df['RMSE'] = (tmp_df['CIS'] - tmp_df['NIS']).map(lambda x: x*x)
    # Guard the division: a zero count becomes NaN and is dropped by nansum.
    tmp_df['CHI'] = (tmp_df['RMSE']/tmp_df['CIS'].map(lambda x: x if x>0.0 else np.nan))
    rmse_one = math.sqrt(sum(tmp_df['RMSE'].values))
    chi_one = np.nansum(tmp_df['CHI'].values)
    return (case_name, rmse_one, chi_one)


# Run every simulated case in a worker pool and collect
# (case_name, rmse, chi) tuples in input order.
pool = multiprocessing.Pool()
try:
    # process_case only reads the first element of each task tuple, so the
    # second slot is a placeholder rather than the whole `cases` list, which
    # would otherwise be serialized for the workers for no benefit.
    cases_results = pool.map(process_case, [(case, None) for case in cases])
finally:
    # Always release the worker processes, even if a task raised.
    pool.close()
    pool.join()

# Naming the columns at construction also works when there are no results.
rmse_df = pn.DataFrame(cases_results, columns=["case", "rmse", "chi"])

# Decode the case label "<privacy><flags>_<nclasses>_<distribution>".
# Raw strings keep "\d" from being treated as an invalid string escape.
rmse_df["privacy"] = rmse_df["case"].map(lambda x: re.findall(r"\d+", x)[0])
rmse_df["real"] = rmse_df["case"].map(lambda x: re.findall(r"[^\d]", x)[0])
rmse_df["uniform"] = rmse_df["case"].map(lambda x: int(re.findall(r"[^\d]", x)[1] == "t"))
rmse_df["uniform2"] = rmse_df["case"].map(lambda x: int(re.findall(r"[^\d]", x)[2] == "t"))
rmse_df["nclasses"] = rmse_df["case"].map(lambda x: re.findall(r"\d+", x)[1])
rmse_df["uniform_original"] = rmse_df["case"].map(lambda x: int(x.split("_")[-1] == "uniform"))
rmse_df.to_csv("df_simulated_rel.csv")