In [21]:
import sys
sys.path += ["../src"]
import utils
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib.pyplot import subplots as sbp 
from importlib import reload
import jl_vae
# import jl_nflows_geo_coordinates_2 as nfg
# from jl_nflows_geo_coordinates import load_nf as load_dict

from _51_abm_functions import cod_prov_abbrv_df

# Global Spatial Autocorrelation
from spatial_autocorrelation import get_moransI, moransI_scatterplot, hypothesis_testing
# Local Spatial Autocorrelation
from spatial_autocorrelation import get_localMoransI, LISA_scatterplot
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score


In [22]:
# transform one-hot encoding to categories
def add_cat_features(df):
    df["energy_class"] = df[[u for u in df.columns if "_energy" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]
    df["COD_CAT"] = [u[8:] for u in df[[u for u in df.columns if "COD_CAT_" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]]
    df["anno_costruzione"] = [u[17:] for u in df[[u for u in df.columns if "ANNO_COSTRUZIONE" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]]
    return df

## Loading data from Jacopo's synthetic data

In [23]:
# import dictionary with data {'hydro_risk', 'census', 'omi_og', 'cap'}
# takes ~25seconds
geo_dict = jl_vae.load_geo_data()

# check which provinces are done
glob(jl_vae.path_pop_synth + f"95sample/pop_samples/*")

['/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250806priceFE.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703LT.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703FI.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703CE.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703BT.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250806priceKR.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703SI.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703CO.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250811priceMC.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples

In [24]:
import pickle

prov = "AN"

with open(f'/data/housing/data/intermediate/jl_pop_synth/isp_baselines/all_baselines_{prov}.pickle', 'rb') as f:
    all_baselines = pickle.load(f)

In [25]:
all_baselines.keys()

dict_keys(['df_real', 'df_real95', 'df_nfvae', 'df_nfvae95', 'df_ablation', 'df_ablation95', 'df_ipf', 'df_ipf95', 'df_copula_nf', 'df_copula_nf95', 'df_copula_ablation', 'df_copula_ablation95'])

In [26]:
all_baselines['df_real']

Unnamed: 0,flag_garage,flag_pertinenza,flag_air_conditioning,flag_multi_floor,y,x,log_mq,ANNO_COSTRUZIONE_1500_1965,ANNO_COSTRUZIONE_1965_1985,ANNO_COSTRUZIONE_1985_2005,...,floor_Missing,floor_plus_4,log_price,flag_air_conditioning_Missing,flag_multi_floor_Missing,SEZ2011,PRO_COM,CAP,OMI_id,prov_abbrv
0,1.0,0.0,0.0,0.0,43.615,13.528,4.605170,1.0,0.0,0.0,...,1.0,0.0,11.813030,1.0,1.0,420020000098,42002,60123,A271_B1,AN
1,0.0,0.0,0.0,0.0,43.437,13.609,4.304065,0.0,1.0,0.0,...,1.0,0.0,11.082143,1.0,1.0,420220000008,42022,60025,E690_D2,AN
2,0.0,0.0,0.0,0.0,43.439,13.610,5.293305,1.0,0.0,0.0,...,1.0,0.0,11.512925,1.0,1.0,420220000004,42022,60025,E690_D2,AN
3,1.0,0.0,0.0,0.0,43.658,13.149,4.634729,0.0,0.0,1.0,...,1.0,0.0,11.813030,1.0,1.0,420390000020,42039,60012,M318_E1,AN
4,0.0,0.0,0.0,0.0,43.615,13.519,4.532599,1.0,0.0,0.0,...,1.0,0.0,11.608236,1.0,1.0,420020000115,42002,60124,A271_B1,AN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4247,1.0,0.0,0.0,0.0,43.530,13.441,4.770685,0.0,0.0,1.0,...,1.0,0.0,11.982929,1.0,1.0,420330000001,42033,60020,G003_B1,AN
4248,1.0,0.0,0.0,0.0,43.532,13.401,5.068904,0.0,0.0,1.0,...,1.0,0.0,12.100712,1.0,1.0,420380000009,42038,60020,G803_B1,AN
4249,1.0,0.0,0.0,0.0,43.534,13.399,5.010635,0.0,0.0,0.0,...,1.0,0.0,12.427743,1.0,1.0,420380000009,42038,60020,G803_B1,AN
4250,0.0,0.0,0.0,0.0,43.530,13.441,4.955827,1.0,0.0,0.0,...,1.0,0.0,11.407565,0.0,1.0,420330000001,42033,60020,G003_B1,AN


In [12]:
baseline_path = "/data/housing/data/intermediate/jl_pop_synth/pop_samples/baselines"
cod_prov = cod_prov_abbrv_df.query("prov_abbrv == @prov")["COD_PROV"].iloc[0]

df_real = all_baselines['df_real'] 
df_real95 = all_baselines['df_real95']
df_excluded = df_real[~df_real.index.isin(df_real95.index)]

# nf + VAE
df_nfvae = all_baselines['df_nfvae95'] 
# ablation (only VAE), 95%
df_vae = all_baselines['df_ablation95']
# ipf, 95%
df_ipf95 = all_baselines['df_ipf95'] 
# copula + nf (95%)
df_copulanf = all_baselines['df_copula_nf95']
# copula (95%)
df_copula = all_baselines['df_copula_ablation95']


In [17]:
df_nfvae

Unnamed: 0,flag_garage,flag_pertinenza,flag_air_conditioning,flag_multi_floor,log_mq,ANNO_COSTRUZIONE_1500_1965,ANNO_COSTRUZIONE_1965_1985,ANNO_COSTRUZIONE_1985_2005,ANNO_COSTRUZIONE_2005_2025,ANNO_COSTRUZIONE_Missing,...,floor_plus_4,log_price,flag_air_conditioning_Missing,flag_multi_floor_Missing,y_latent,x_latent,x_norm,y_norm,x,y
0,False,False,False,False,4.177603,False,True,False,False,False,...,False,10.800657,True,True,0.535257,0.785437,0.645828,0.369737,13.507539,43.602300
1,True,False,False,False,4.963674,False,True,False,False,False,...,False,11.438527,True,True,0.416182,0.064057,-0.268751,-0.108266,13.121587,43.493317
2,True,False,False,False,5.575211,False,True,False,False,False,...,False,12.765796,True,True,0.717863,0.634779,0.638047,0.263971,13.504255,43.578186
3,True,False,False,False,4.784835,False,True,False,False,False,...,False,11.871620,True,True,0.210494,0.337177,0.188075,0.376966,13.314366,43.603947
4,True,False,False,False,4.839019,False,True,False,False,False,...,False,11.964004,True,True,0.520252,0.340509,0.294758,0.296095,13.359387,43.585510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,False,False,False,False,4.093862,True,False,False,False,False,...,False,10.704506,False,False,0.626999,0.232766,0.047623,0.020802,13.255096,43.522743
4035,False,True,False,False,5.728705,True,False,False,False,False,...,False,11.103680,False,True,0.753043,0.035899,-0.761900,-0.783288,12.913478,43.339410
4036,False,False,False,False,4.291572,True,False,False,False,False,...,False,11.574118,False,True,0.915435,0.994863,0.710781,0.404982,13.534949,43.610336
4037,True,False,False,False,4.612534,False,True,False,False,False,...,False,11.464518,True,True,0.096549,0.280079,-0.245482,0.581942,13.131406,43.650680


## fixing data types in syn pop data

In [18]:
def ConvertBool2number(df):
    bool_cols = df.select_dtypes(include=bool).columns
    df[bool_cols] = df[bool_cols].astype(float)
    df = df.reset_index(drop=True)
    return df

In [19]:
df_real95 = ConvertBool2number(df_real95)
df_excluded = ConvertBool2number(df_excluded)

# nf + VAE
df_nfvae = ConvertBool2number(df_nfvae)
# ablation (only VAE), 95%
df_vae = ConvertBool2number(df_vae)
# ipf, 95%
df_ipf95 = ConvertBool2number(df_ipf95)
# copula + nf (95%)
df_copulanf = ConvertBool2number(df_copulanf)
# copula (95%)
df_copula = ConvertBool2number(df_copula)

## Preparing dataframes such that they have all the same columns

In [20]:
col_to_drop = ['SEZ2011','PRO_COM','CAP','OMI_id','prov_abbrv']
# log_price is removed because nfvae does not have log_price column
df_real95 = df_real95.drop(columns=col_to_drop)
df_excluded = df_excluded.drop(columns=col_to_drop)

In [27]:
df_nfvae = df_nfvae.loc[:,df_nfvae.columns.isin(df_real95.columns)]
df_vae = df_vae.loc[:,df_vae.columns.isin(df_real95.columns)]
df_ipf95 = df_ipf95.loc[:,df_ipf95.columns.isin(df_real95.columns)]
df_copulanf = df_copulanf.loc[:,df_copulanf.columns.isin(df_real95.columns)]
df_copula = df_copula.loc[:,df_copula.columns.isin(df_real95.columns)]


## Creation of the training and testing sets

In [37]:
def Build_datasetsets(df_real,df_real_excluded,syn_data,train_size,test_size,frac=0.80,random_state = 42):
    df_real['label'] = 1
    df_real_excluded['label'] = 1
    syn_data['label'] = 0

    df_syn_test = syn_data.sample(n=test_size,random_state=random_state,replace = False)
    df_syn_training = syn_data.drop(df_syn_test.index).sample(n=train_size-test_size,random_state=random_state,replace = False)
    df_syn_test.reset_index(drop=True,inplace=True)
    df_syn_training.reset_index(drop=True,inplace=True)

    df_real_test = df_real_excluded.reset_index(drop=True)
    df_real_training = df_real.sample(n=train_size-test_size,random_state=random_state,replace = False).reset_index(drop=True)

    # getting 80% of syn homes and 20% of real homes
    # check for rpeated homes (avoid it)!!!
    df_train_syn = df_syn_training.sample(n=int(np.round(len(df_syn_training)*frac)),random_state=random_state).reset_index(drop=True)
    df_test_syn = df_syn_test.sample(n=int(np.round(len(df_syn_test)*frac)),random_state=random_state).reset_index(drop=True)

    df_train_real = df_real_training.sample(n=int(np.round(len(df_real_training)*(1-frac))),random_state=random_state).reset_index(drop=True)
    df_test_real = df_real_test.sample(n=int(np.round(len(df_real_test)*(1-frac))),random_state=random_state).reset_index(drop=True)

    df_training = pd.concat([df_train_syn,df_train_real])
    df_test = pd.concat([df_test_syn,df_test_real])

    return df_training, df_test


In [38]:
test_size = len(df_excluded)
train_size = np.min([len(df_real95),len(df_nfvae),len(df_copula),len(df_copulanf),len(df_ipf95),len(df_vae)])
f = 0.80

df_train_nfvae95, df_test_nfvae95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_nfvae,train_size = train_size, test_size = test_size,frac=f)
df_train_vae95, df_test_vae95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_vae,train_size = train_size, test_size = test_size,frac=f)
df_train_copula95, df_test_copula95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_copula,train_size = train_size, test_size = test_size,frac=f)
df_train_copula_nf95, df_test_copula_nf95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_copulanf,train_size = train_size, test_size = test_size,frac=f) 
df_train_ipf95, df_test_ipf95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_ipf95,train_size = train_size, test_size = test_size,frac=f)


## Classification

In [39]:
# function to prepare training and testing set
def Prepare_Xtrain_Xtest(df_training,df_test):
    scaler = StandardScaler()

    Xtrain = df_training.drop(columns='label')
    Ytrain = df_training.label

    Xtest = df_test.drop(columns='label')
    Ytest = df_test.label

    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    return Xtrain, Ytrain, Xtest, Ytest

In [42]:
Xtrain_nfvae95,Ytrain_nfvae95,Xtest_nfvae95,Ytest_nfvae95 = Prepare_Xtrain_Xtest(df_train_nfvae95, df_test_nfvae95)
Xtrain_vae95,Ytrain_vae95,Xtest_vae95,Ytest_vae95 = Prepare_Xtrain_Xtest(df_train_vae95, df_test_vae95)
Xtrain_copula95,Ytrain_copula95,Xtest_copula95,Ytest_copula95 = Prepare_Xtrain_Xtest(df_train_copula95, df_test_copula95)
Xtrain_copula_nf95,Ytrain_copula_nf95,Xtest_copula_nf95,Ytest_copula_nf95 = Prepare_Xtrain_Xtest(df_train_copula_nf95, df_test_copula_nf95)
Xtrain_ipf95,Ytrain_ipf95,Xtest_ipf95,Ytest_ipf95 = Prepare_Xtrain_Xtest(df_train_ipf95, df_test_ipf95)

In [43]:
# puttingn data inside dictionary
data_dict = {'ipf95':[Xtrain_ipf95,Ytrain_ipf95,Xtest_ipf95,Ytest_ipf95],
             'vae95':[Xtrain_vae95,Ytrain_vae95,Xtest_vae95,Ytest_vae95],
             'copula95':[Xtrain_copula95,Ytrain_copula95,Xtest_copula95,Ytest_copula95],
             'copula_nf95':[Xtrain_copula_nf95,Ytrain_copula_nf95,Xtest_copula_nf95,Ytest_copula_nf95],
             'nf_vae95':[Xtrain_nfvae95,Ytrain_nfvae95,Xtest_nfvae95,Ytest_nfvae95]}

In [45]:
def Privacy_Table(data_dict,metric='f1_score',random_state=42):
    
    # initialize result dataframe
    res = pd.DataFrame(columns=['model','ipf95','vae95','copula95','copula_nf95','nf_vae95'])

    # logistic regression
    res.loc[0,'model'] = 'Logistic Regression'
    for k in data_dict.keys():
        log_reg = LogisticRegression(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[0,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[0,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[0,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[0,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[0,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[0,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)

    # gaussian Naive Baise
    res.loc[1,'model'] = 'Gaussian Naive Bayes'
    for k in data_dict.keys():
        log_reg = GaussianNB()
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[1,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[1,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[1,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[1,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[1,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[1,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
            
    # KNeighbours
    res.loc[2,'model'] = 'KNeighbours'
    for k in data_dict.keys():
        log_reg = KNeighborsClassifier()
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[2,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[2,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[2,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[2,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[2,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[2,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Decision Tree
    res.loc[3,'model'] = 'Decision Tree'
    for k in data_dict.keys():
        log_reg = DecisionTreeClassifier(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[3,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[3,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[3,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[3,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[3,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[3,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Random Forest
    res.loc[4,'model'] = 'Random Forest'
    for k in data_dict.keys():
        log_reg = RandomForestClassifier(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[4,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[4,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[4,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[4,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[4,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[4,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    

    # Random Forest
    res.loc[5,'model'] = 'SVC'
    for k in data_dict.keys():
        log_reg = SVC()
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[5,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[5,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[5,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[5,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[5,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[5,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    

    # MLP classifier
    res.loc[6,'model'] = 'MLP'
    for k in data_dict.keys():
        log_reg = MLPClassifier(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[6,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[6,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[6,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[6,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[6,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[6,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Majority classifier
    res.loc[7,'model'] = 'Majority classifier'
    for k in data_dict.keys():
        y_pred = np.zeros(shape=(len(data_dict[k][3]),1))
        if(metric == 'f1_score'):
            res.loc[7,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[7,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[7,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[7,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[7,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[7,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Random classifier
    np.random.seed(random_state)
    res.loc[8,'model'] = 'Random classifier'
    for k in data_dict.keys():
        y_pred = np.random.rand(len(data_dict[k][3]))
        y_pred[y_pred>=0.5]=1
        y_pred[y_pred<0.5]=0
        if(metric == 'f1_score'):
            res.loc[8,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[8,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[8,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[8,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[8,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[8,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    


    return res



In [153]:
res_aur_pr = Privacy_Table(data_dict=data_dict,metric='AUC-PR')
res_aur_pr

Unnamed: 0,model,ipf95,ablation95,copula_nf95,nf_vae95
0,Logistic Regression,0.35,0.2,0.22,0.39
1,Gaussian Naive Bayes,0.23,0.2,0.2,0.39
2,KNeighbours,0.48,0.55,0.42,0.57
3,Decision Tree,0.87,0.37,0.48,0.43
4,Random Forest,0.78,0.53,0.46,0.62
5,SVC,0.39,0.39,0.31,0.63
6,MLP,0.55,0.7,0.55,0.64
7,Majority classifier,0.2,0.2,0.2,0.2
8,Random classifier,0.22,0.21,0.18,0.19


In [46]:
res_f1 = Privacy_Table(data_dict=data_dict)
res_f1

Unnamed: 0,model,ipf95,vae95,copula95,copula_nf95,nf_vae95
0,Logistic Regression,0.0,0.39,0.0,0.0,0.49
1,Gaussian Naive Bayes,0.31,0.24,0.33,0.31,0.49
2,KNeighbours,0.16,0.71,0.44,0.42,0.77
3,Decision Tree,0.7,0.77,0.57,0.43,0.71
4,Random Forest,0.38,0.78,0.63,0.3,0.9
5,SVC,0.0,0.61,0.2,0.23,0.74
6,MLP,0.16,0.74,0.56,0.41,0.91
7,Majority classifier,0.0,0.0,0.0,0.0,0.0
8,Random classifier,0.34,0.31,0.21,0.26,0.34


In [47]:
res_precision = Privacy_Table(data_dict=data_dict,metric='precision')
res_precision

Unnamed: 0,model,ipf95,vae95,copula95,copula_nf95,nf_vae95
0,Logistic Regression,0.0,0.79,0.0,0.0,1.0
1,Gaussian Naive Bayes,0.19,0.86,0.2,0.19,1.0
2,KNeighbours,0.28,0.87,0.53,0.54,0.91
3,Decision Tree,0.69,0.8,0.54,0.44,0.68
4,Random Forest,0.73,0.97,0.81,0.8,0.95
5,SVC,0.0,0.87,0.62,0.67,1.0
6,MLP,0.57,0.85,0.71,0.62,0.91
7,Majority classifier,0.0,0.0,0.0,0.0,0.0
8,Random classifier,0.24,0.22,0.15,0.18,0.24


In [48]:
res_recall = Privacy_Table(data_dict=data_dict,metric='recall')
res_recall

Unnamed: 0,model,ipf95,vae95,copula95,copula_nf95,nf_vae95
0,Logistic Regression,0.0,0.26,0.0,0.0,0.33
1,Gaussian Naive Bayes,0.91,0.14,0.95,0.91,0.33
2,KNeighbours,0.12,0.6,0.37,0.35,0.67
3,Decision Tree,0.72,0.74,0.6,0.42,0.74
4,Random Forest,0.26,0.65,0.51,0.19,0.86
5,SVC,0.0,0.47,0.12,0.14,0.58
6,MLP,0.09,0.65,0.47,0.3,0.91
7,Majority classifier,0.0,0.0,0.0,0.0,0.0
8,Random classifier,0.58,0.58,0.37,0.44,0.58


In [157]:
res_accuracy = Privacy_Table(data_dict=data_dict,metric='accuracy')
res_accuracy

Unnamed: 0,model,ipf95,ablation95,copula_nf95,nf_vae95
0,Logistic Regression,0.84,0.8,0.8,0.85
1,Gaussian Naive Bayes,0.47,0.21,0.2,0.85
2,KNeighbours,0.87,0.88,0.8,0.89
3,Decision Tree,0.97,0.8,0.86,0.82
4,Random Forest,0.94,0.88,0.86,0.91
5,SVC,0.85,0.85,0.83,0.91
6,MLP,0.89,0.92,0.88,0.91
7,Majority classifier,0.8,0.8,0.8,0.8
8,Random classifier,0.54,0.49,0.45,0.48
