In [136]:
import sys
sys.path += ["../src"]
import utils
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib.pyplot import subplots as sbp 
from importlib import reload
import jl_vae
# import jl_nflows_geo_coordinates_2 as nfg
# from jl_nflows_geo_coordinates import load_nf as load_dict

from _51_abm_functions import cod_prov_abbrv_df

# Global Spatial Autocorrelation
from spatial_autocorrelation import get_moransI, moransI_scatterplot, hypothesis_testing
# Local Spatial Autocorrelation
from spatial_autocorrelation import get_localMoransI, LISA_scatterplot
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score


In [137]:
# transform one-hot encoding to categories
def add_cat_features(df):
    df["energy_class"] = df[[u for u in df.columns if "_energy" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]
    df["COD_CAT"] = [u[8:] for u in df[[u for u in df.columns if "COD_CAT_" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]]
    df["anno_costruzione"] = [u[17:] for u in df[[u for u in df.columns if "ANNO_COSTRUZIONE" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]]
    return df

## Loading data from Jacopo's synthetic data

In [138]:
# import dictionary with data {'hydro_risk', 'census', 'omi_og', 'cap'}
# takes ~25seconds
geo_dict = jl_vae.load_geo_data()

# check which provinces are done
glob(jl_vae.path_pop_synth + f"95sample/pop_samples/*")

['/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250806priceFE.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703LT.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703FI.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703CE.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703BT.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250806priceKR.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703SI.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703CO.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250811priceMC.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples

In [139]:
# paths of the synthetic and real populations

# choose the province
prov = "AN" # AN
# get the cod prov (number)
cod_prov = cod_prov_abbrv_df.query("prov_abbrv == @prov")["COD_PROV"].item()

# all trained models have these settings
date_nf = "241203"
date_vae = "240107"
date_vae = "250709price_"
date_95 = "250703"
date_data = "250110"

real_pops = jl_vae.path_pop_synth + f"pop_samples/pop_real_with_hedonic_price"
# synth_pops = jl_vae.path_pop_synth + f"pop_samples/pop_synth_with_hedonic_price_250110"
synth_pops = jl_vae.path_pop_synth + f"pop_samples/"

vae_data = "full"

synth_pops95 = jl_vae.path_pop_synth + f"95sample/pop_samples"

In [140]:
baseline_path = "/data/housing/data/intermediate/jl_pop_synth/pop_samples/baselines"
cod_prov = cod_prov_abbrv_df.query("prov_abbrv == @prov")["COD_PROV"].iloc[0]

df_real = pd.read_csv(jl_vae.path_pop_synth + f"real_populations/df_real_{prov}.csv", index_col = 0)
df_real95 = df_real.sample(frac = 0.95, random_state = 1111)
df_excluded = df_real[~df_real.index.isin(df_real95.index)]

# nf + VAE
df_nfvae = pd.read_csv(baseline_path + f"/df_nfvae_{prov}.csv", index_col = 0)
# ablation, only VAE
df_vae = pd.read_csv(baseline_path + f"/df_vae_{prov}.csv", index_col = 0)
# nf + VAE with 95% sample
df_nfvae95 = pd.read_csv(baseline_path + f"/df_nfvae95_{prov}.csv", index_col = 0)
# copula 
df_copula_ablation = pd.read_csv(baseline_path + f"/df_copula_ablation_{prov}.csv", index_col = 0)
# nf + copula 
df_copula_nf = pd.read_csv(baseline_path + f"/df_copula_nf_{prov}.csv", index_col = 0)
# copula with 95% 
df_copula_ablation95 = pd.read_csv(baseline_path + f"/df_copula_ablation95_{prov}.csv", index_col = 0)
# nf + copula 95%
df_copula_nf95 = pd.read_csv(baseline_path + f"/df_copula_nf95_{prov}.csv", index_col = 0)
# ipf
df_ipf = pd.read_csv(baseline_path + f"/df_ipf_{prov}.csv", index_col = 0)
# ipf, 95%
df_ipf95 = pd.read_csv(baseline_path + f"/df_ipf95_{prov}.csv", index_col = 0)

## List of dataframe to use

df_real95 (training, real data)

df_excluded (testing, real data)

Syn data

df_nfvae95 (Normalizing flows + VAE) 

df_copula_ablation95 (ablation)
 
df_copula_nf95 (ablation + NF)

df_ipf95 (IPF)


## fixing data types in syn pop data

In [141]:
bool_cols = df_nfvae95.select_dtypes(include=bool).columns
df_nfvae95[bool_cols] = df_nfvae95[bool_cols].astype(float)
df_nfvae95 = df_nfvae95.reset_index(drop=True)

In [142]:
bool_cols = df_copula_ablation95.select_dtypes(include=bool).columns
df_copula_ablation95[bool_cols] = df_copula_ablation95[bool_cols].astype(float)
df_copula_ablation95 = df_copula_ablation95.reset_index(drop=True)

In [143]:
bool_cols = df_copula_nf95.select_dtypes(include=bool).columns
df_copula_nf95[bool_cols] = df_copula_nf95[bool_cols].astype(float)
df_copula_nf95 = df_copula_nf95.reset_index(drop=True)

In [144]:
bool_cols = df_ipf95.select_dtypes(include=bool).columns
df_ipf95[bool_cols] = df_ipf95[bool_cols].astype(float)
df_ipf95 = df_ipf95.reset_index(drop=True)

## Preparing dataframes such that they have all the same columns

In [145]:
col_to_drop = ['SEZ2011','PRO_COM','CAP','OMI_id','prov_abbrv','log_price']
# log_price is removed because nfvae does not have log_price column
df_real95 = df_real95.drop(columns=col_to_drop)
df_excluded = df_excluded.drop(columns=col_to_drop)

In [146]:
df_nfvae95 = df_nfvae95.loc[:,df_nfvae95.columns.isin(df_real95.columns)]
df_copula_ablation95 = df_copula_ablation95.loc[:,df_copula_ablation95.columns.isin(df_real95.columns)]
df_copula_nf95 = df_copula_nf95.loc[:,df_copula_nf95.columns.isin(df_real95.columns)]
df_ipf95 = df_ipf95.loc[:,df_ipf95.columns.isin(df_real95.columns)]


## Creation of the training and testing sets

In [147]:
def Build_datasetsets(df_real,df_real_excluded,syn_data,train_size,test_size,frac=0.80,random_state = 42):
    df_real['label'] = 1
    df_real_excluded['label'] = 1
    syn_data['label'] = 0

    df_syn_test = syn_data.sample(n=test_size,random_state=random_state,replace = False)
    df_syn_training = syn_data.drop(df_syn_test.index).sample(n=train_size-test_size,random_state=random_state,replace = False)
    df_syn_test.reset_index(drop=True,inplace=True)
    df_syn_training.reset_index(drop=True,inplace=True)

    df_real_test = df_real_excluded.reset_index(drop=True)
    df_real_training = df_real.sample(n=train_size-test_size,random_state=random_state,replace = False).reset_index(drop=True)

    # getting 80% of syn homes and 20% of real homes
    # check for rpeated homes (avoid it)!!!
    df_train_syn = df_syn_training.sample(n=int(np.round(len(df_syn_training)*frac)),random_state=random_state).reset_index(drop=True)
    df_test_syn = df_syn_test.sample(n=int(np.round(len(df_syn_test)*frac)),random_state=random_state).reset_index(drop=True)

    df_train_real = df_real_training.sample(n=int(np.round(len(df_real_training)*(1-frac))),random_state=random_state).reset_index(drop=True)
    df_test_real = df_real_test.sample(n=int(np.round(len(df_real_test)*(1-frac))),random_state=random_state).reset_index(drop=True)

    df_training = pd.concat([df_train_syn,df_train_real])
    df_test = pd.concat([df_test_syn,df_test_real])

    return df_training, df_test


In [148]:
test_size = len(df_excluded)
train_size = np.min([len(df_real95),len(df_nfvae95),len(df_copula_ablation95),len(df_copula_nf95),len(df_ipf95)])
f = 0.80

df_train_nfvae95, df_test_nfvae95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_nfvae95,train_size = train_size, test_size = test_size,frac=f)
df_train_copulaablation95, df_test_copulaablation95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_copula_ablation95,train_size = train_size, test_size = test_size,frac=f)
df_train_copula_nf95, df_test_copula_nf95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_copula_nf95,train_size = train_size, test_size = test_size,frac=f) 
df_train_ipf95, df_test_ipf95 = Build_datasetsets(df_real=df_real95,df_real_excluded = df_excluded,syn_data = df_ipf95,train_size = train_size, test_size = test_size,frac=f)

## Classification

In [149]:
# function to prepare training and testing set
def Prepare_Xtrain_Xtest(df_training,df_test):
    scaler = StandardScaler()

    Xtrain = df_training.drop(columns='label')
    Ytrain = df_training.label

    Xtest = df_test.drop(columns='label')
    Ytest = df_test.label

    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    return Xtrain, Ytrain, Xtest, Ytest

In [150]:
Xtrain_nfvae95,Ytrain_nfvae95,Xtest_nfvae95,Ytest_nfvae95 = Prepare_Xtrain_Xtest(df_train_nfvae95, df_test_nfvae95)
Xtrain_ablation95,Ytrain_ablation95,Xtest_ablation95,Ytest_ablation95 = Prepare_Xtrain_Xtest(df_train_copulaablation95, df_test_copulaablation95)
Xtrain_copula_nf95,Ytrain_copula_nf95,Xtest_copula_nf95,Ytest_copula_nf95 = Prepare_Xtrain_Xtest(df_train_copula_nf95, df_test_copula_nf95)
Xtrain_ipf95,Ytrain_ipf95,Xtest_ipf95,Ytest_ipf95 = Prepare_Xtrain_Xtest(df_train_ipf95, df_test_ipf95)

In [151]:
# puttingn data inside dictionary
data_dict = {'ipf95':[Xtrain_ipf95,Ytrain_ipf95,Xtest_ipf95,Ytest_ipf95],
             'ablation95':[Xtrain_ablation95,Ytrain_ablation95,Xtest_ablation95,Ytest_ablation95],
             'copula_nf95':[Xtrain_copula_nf95,Ytrain_copula_nf95,Xtest_copula_nf95,Ytest_copula_nf95],
             'nf_vae95':[Xtrain_nfvae95,Ytrain_nfvae95,Xtest_nfvae95,Ytest_nfvae95]}

In [152]:
def Privacy_Table(data_dict,metric='f1_score',random_state=42):
    
    # initialize result dataframe
    res = pd.DataFrame(columns=['model','ipf95','ablation95','copula_nf95','nf_vae95'])

    # logistic regression
    res.loc[0,'model'] = 'Logistic Regression'
    for k in data_dict.keys():
        log_reg = LogisticRegression(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[0,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[0,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[0,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[0,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[0,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[0,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)

    # gaussian Naive Baise
    res.loc[1,'model'] = 'Gaussian Naive Bayes'
    for k in data_dict.keys():
        log_reg = GaussianNB()
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[1,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[1,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[1,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[1,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[1,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[1,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
            
    # KNeighbours
    res.loc[2,'model'] = 'KNeighbours'
    for k in data_dict.keys():
        log_reg = KNeighborsClassifier()
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[2,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[2,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[2,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[2,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[2,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[2,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Decision Tree
    res.loc[3,'model'] = 'Decision Tree'
    for k in data_dict.keys():
        log_reg = DecisionTreeClassifier(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[3,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[3,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[3,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[3,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[3,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[3,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Random Forest
    res.loc[4,'model'] = 'Random Forest'
    for k in data_dict.keys():
        log_reg = RandomForestClassifier(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[4,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[4,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[4,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[4,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[4,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[4,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    

    # Random Forest
    res.loc[5,'model'] = 'SVC'
    for k in data_dict.keys():
        log_reg = SVC()
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[5,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[5,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[5,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[5,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[5,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[5,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    

    # MLP classifier
    res.loc[6,'model'] = 'MLP'
    for k in data_dict.keys():
        log_reg = MLPClassifier(random_state=random_state)
        log_reg.fit(data_dict[k][0],data_dict[k][1])
        y_pred = log_reg.predict(data_dict[k][2])
        if(metric == 'f1_score'):
            res.loc[6,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[6,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[6,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[6,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[6,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[6,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Majority classifier
    res.loc[7,'model'] = 'Majority classifier'
    for k in data_dict.keys():
        y_pred = np.zeros(shape=(len(data_dict[k][3]),1))
        if(metric == 'f1_score'):
            res.loc[7,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[7,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[7,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[7,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[7,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[7,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    
    # Random classifier
    np.random.seed(random_state)
    res.loc[8,'model'] = 'Random classifier'
    for k in data_dict.keys():
        y_pred = np.random.rand(len(data_dict[k][3]))
        y_pred[y_pred>=0.5]=1
        y_pred[y_pred<0.5]=0
        if(metric == 'f1_score'):
            res.loc[8,k] = np.round(f1_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'accuracy'):
            res.loc[8,k] = np.round(accuracy_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'precision'):
            res.loc[8,k] = np.round(precision_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'recall'):
            res.loc[8,k] = np.round(recall_score(y_true=data_dict[k][3], y_pred= y_pred),2)
        elif(metric == 'ROC-AUC'):
            res.loc[8,k] = np.round(roc_auc_score(y_true=data_dict[k][3], y_score=y_pred),2)
        elif(metric == 'AUC-PR'):
            res.loc[8,k] = np.round(average_precision_score(y_true=data_dict[k][3], y_score=y_pred),2)
    


    return res



In [153]:
res_aur_pr = Privacy_Table(data_dict=data_dict,metric='AUC-PR')
res_aur_pr

Unnamed: 0,model,ipf95,ablation95,copula_nf95,nf_vae95
0,Logistic Regression,0.35,0.2,0.22,0.39
1,Gaussian Naive Bayes,0.23,0.2,0.2,0.39
2,KNeighbours,0.48,0.55,0.42,0.57
3,Decision Tree,0.87,0.37,0.48,0.43
4,Random Forest,0.78,0.53,0.46,0.62
5,SVC,0.39,0.39,0.31,0.63
6,MLP,0.55,0.7,0.55,0.64
7,Majority classifier,0.2,0.2,0.2,0.2
8,Random classifier,0.22,0.21,0.18,0.19


In [154]:
res_f1 = Privacy_Table(data_dict=data_dict)
res_f1

Unnamed: 0,model,ipf95,ablation95,copula_nf95,nf_vae95
0,Logistic Regression,0.41,0.0,0.05,0.38
1,Gaussian Naive Bayes,0.37,0.32,0.33,0.38
2,KNeighbours,0.59,0.7,0.59,0.66
3,Decision Tree,0.91,0.53,0.63,0.6
4,Random Forest,0.84,0.62,0.55,0.73
5,SVC,0.38,0.44,0.3,0.71
6,MLP,0.62,0.8,0.7,0.77
7,Majority classifier,0.0,0.0,0.0,0.0
8,Random classifier,0.34,0.31,0.21,0.26


In [155]:
res_precision = Privacy_Table(data_dict=data_dict,metric='precision')
res_precision

Unnamed: 0,model,ipf95,ablation95,copula_nf95,nf_vae95
0,Logistic Regression,0.75,0.0,1.0,1.0
1,Gaussian Naive Bayes,0.24,0.2,0.2,1.0
2,KNeighbours,0.8,0.69,0.5,0.92
3,Decision Tree,1.0,0.51,0.68,0.55
4,Random Forest,1.0,0.91,0.82,0.87
5,SVC,1.0,0.81,0.8,0.96
6,MLP,0.95,0.86,0.69,0.82
7,Majority classifier,0.0,0.0,0.0,0.0
8,Random classifier,0.24,0.22,0.15,0.18


In [156]:
res_recall = Privacy_Table(data_dict=data_dict,metric='recall')
res_recall

Unnamed: 0,model,ipf95,ablation95,copula_nf95,nf_vae95
0,Logistic Regression,0.28,0.0,0.02,0.23
1,Gaussian Naive Bayes,0.77,0.93,0.95,0.23
2,KNeighbours,0.47,0.72,0.72,0.51
3,Decision Tree,0.84,0.56,0.58,0.67
4,Random Forest,0.72,0.47,0.42,0.63
5,SVC,0.23,0.3,0.19,0.56
6,MLP,0.47,0.74,0.72,0.72
7,Majority classifier,0.0,0.0,0.0,0.0
8,Random classifier,0.58,0.58,0.37,0.44


In [157]:
res_accuracy = Privacy_Table(data_dict=data_dict,metric='accuracy')
res_accuracy

Unnamed: 0,model,ipf95,ablation95,copula_nf95,nf_vae95
0,Logistic Regression,0.84,0.8,0.8,0.85
1,Gaussian Naive Bayes,0.47,0.21,0.2,0.85
2,KNeighbours,0.87,0.88,0.8,0.89
3,Decision Tree,0.97,0.8,0.86,0.82
4,Random Forest,0.94,0.88,0.86,0.91
5,SVC,0.85,0.85,0.83,0.91
6,MLP,0.89,0.92,0.88,0.91
7,Majority classifier,0.8,0.8,0.8,0.8
8,Random classifier,0.54,0.49,0.45,0.48


ipf --> 