In [1]:
import sys
sys.path += ["../src"]
import utils
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib.pyplot import subplots as sbp 
from importlib import reload
import jl_vae
# import jl_nflows_geo_coordinates_2 as nfg
# from jl_nflows_geo_coordinates import load_nf as load_dict

from _51_abm_functions import cod_prov_abbrv_df

# Global Spatial Autocorrelation
from spatial_autocorrelation import get_moransI, moransI_scatterplot, hypothesis_testing
# Local Spatial Autocorrelation
from spatial_autocorrelation import get_localMoransI, LISA_scatterplot
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# transform one-hot encoding to categories
def add_cat_features(df):
    df["energy_class"] = df[[u for u in df.columns if "_energy" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]
    df["COD_CAT"] = [u[8:] for u in df[[u for u in df.columns if "COD_CAT_" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]]
    df["anno_costruzione"] = [u[17:] for u in df[[u for u in df.columns if "ANNO_COSTRUZIONE" in u]].stack().rename("col").reset_index().query("col == 1")["level_1"]]
    return df

In [3]:
# import dictionary with data {'hydro_risk', 'census', 'omi_og', 'cap'}
# takes ~25seconds
geo_dict = jl_vae.load_geo_data()

# check which provinces are done
glob(jl_vae.path_pop_synth + f"95sample/pop_samples/*")

['/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703LT.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703FI.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703CE.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703BT.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703SI.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703CO.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703RN.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703FC.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_full_250703PE.csv',
 '/data/housing/data/intermediate/jl_pop_synth/95sample/pop_samples/synthetic_pop_

In [4]:
# paths of the synthetic and real populations

# choose the province
prov = "AN" # AN
# get the cod prov (number)
cod_prov = cod_prov_abbrv_df.query("prov_abbrv == @prov")["COD_PROV"].item()

# all trained models have these settings
date_nf = "241203"
date_vae = "240107"
date_vae = "250709price_"
date_95 = "250703"
date_data = "250110"

real_pops = jl_vae.path_pop_synth + f"pop_samples/pop_real_with_hedonic_price"
# synth_pops = jl_vae.path_pop_synth + f"pop_samples/pop_synth_with_hedonic_price_250110"
synth_pops = jl_vae.path_pop_synth + f"pop_samples/"

vae_data = "full"

synth_pops95 = jl_vae.path_pop_synth + f"95sample/pop_samples"

Note: I tried the classification on the Bologna and Milano provinces. I got error in the cell below due to the fact that df_real has more rows than df_sample95... problably that is a stuff we have to fix

In [5]:
# df_real = pd.read_csv(real_pops + f"/pop_real_full_250110{prov}.csv", index_col = 0)
df_real = jl_vae.get_df_prov(prov, dropna = False, add_cols = ["log_price"])
# df_sample = pd.read_csv(synth_pops + f"/pop_synth_full_250110{prov}.csv", index_col = 0).query("prov_abbrv == @prov")
# use this for the good syn pop
#df_sample = pd.read_csv(synth_pops + f"/synthetic_pop_full_{date_vae}{prov}.csv", index_col = 0)


In [34]:

df_copula_nf = pd.read_csv(jl_vae.path_pop_synth + f"copula_samples/df_copula_nf_{prov}.csv", index_col = 0)
df_sample = (utils.spatial_matching_ABM(df_copula_nf,
                            hydro_risk = geo_dict["hydro_risk"],
                            census = geo_dict["census"],
                            omi_og = geo_dict["omi_og"],
                            cap = geo_dict["cap"])
                            .rename(columns = {"GEO_LONGITUDINE_BENE_ROUNDED":"x", "GEO_LATITUDINE_BENE_ROUNDED":"y"})
                            .query("prov_abbrv == @prov").sample(n = len(df_real), random_state = 79))
df_sample95 = df_sample.sample(frac = 0.95, random_state = 11)
df_excluded = df_sample.loc[~df_sample.index.isin(df_sample95.index)]

#df_sample = add_cat_features(df_sample)
#df_sample95 = add_cat_features(df_sample95)


In [36]:
df_sample95.columns 

Index(['flag_garage', 'flag_pertinenza', 'flag_air_conditioning',
       'flag_multi_floor', 'log_mq', 'ANNO_COSTRUZIONE_1500_1965',
       'ANNO_COSTRUZIONE_1965_1985', 'ANNO_COSTRUZIONE_1985_2005',
       'ANNO_COSTRUZIONE_2005_2025', 'ANNO_COSTRUZIONE_Missing',
       'High_energy_class', 'Low_energy_class', 'Medium_energy_class',
       'Missing_energy_class', 'COD_CAT_A02', 'COD_CAT_A03',
       'COD_CAT_A_01_07_08', 'COD_CAT_A_04_05', 'floor_0.0', 'floor_1.0',
       'floor_2.0', 'floor_3.0', 'floor_Missing', 'floor_plus_4',
       'flag_air_conditioning_Missing', 'flag_multi_floor_Missing',
       'log_price', 'x', 'y', 'COD_CONTRATTO', 'SEZ2011', 'PRO_COM',
       'flag_geo_valid', 'CAP', 'OMI_id', 'OMI_categ', 'scenario',
       'scenario_LowRisk', 'scenario_MediumRisk', 'scenario_NoRisk',
       'scenario_HighRisk', 'scenario_Risk', 'flag_italy', 'COD_PROV',
       'COD_REG', 'regione_nome', 'prov_nome', 'prov_abbrv',
       'year_erogaz_prov'],
      dtype='object')

In [37]:
df_real.columns

Index(['flag_garage', 'flag_pertinenza', 'flag_air_conditioning',
       'flag_multi_floor', 'flag_geo_valid', 'y', 'x', 'log_mq',
       'ANNO_COSTRUZIONE_1500_1965', 'ANNO_COSTRUZIONE_1965_1985',
       'ANNO_COSTRUZIONE_1985_2005', 'ANNO_COSTRUZIONE_2005_2025',
       'ANNO_COSTRUZIONE_Missing', 'High_energy_class', 'Low_energy_class',
       'Medium_energy_class', 'Missing_energy_class', 'COD_CAT_A02',
       'COD_CAT_A03', 'COD_CAT_A_01_07_08', 'COD_CAT_A_04_05', 'floor_0.0',
       'floor_1.0', 'floor_2.0', 'floor_3.0', 'floor_Missing', 'floor_plus_4',
       'x_norm', 'y_norm', 'flag_air_conditioning_Missing',
       'flag_multi_floor_Missing'],
      dtype='object')

In [None]:

# spatial matching to assign geographical features to synthetic data
# this select the observations within province borders
df_sample95 = pd.read_csv(synth_pops95 + f"/synthetic_pop_full_{date_95}{prov}.csv", index_col = 0)
df_sample95 = (utils.spatial_matching_ABM(df_sample95,
                                           hydro_risk = geo_dict["hydro_risk"], 
                                           census = geo_dict["census"], 
                                           omi_og = geo_dict["omi_og"], 
                                           cap = geo_dict["cap"])
                .rename(columns = {"GEO_LONGITUDINE_BENE_ROUNDED": "x", 
                                   "GEO_LATITUDINE_BENE_ROUNDED": "y"})
                .query("prov_abbrv == @prov")
                                           )
    

df_real = add_cat_features(df_real)
df_sample = add_cat_features(df_sample).sample(n = len(df_real), random_state = 79)
df_sample95 = add_cat_features(df_sample95).sample(n = len(df_real), random_state = 79)

In [6]:
df_sample95

Unnamed: 0,flag_garage,flag_pertinenza,flag_air_conditioning,flag_multi_floor,log_mq,ANNO_COSTRUZIONE_1500_1965,ANNO_COSTRUZIONE_1965_1985,ANNO_COSTRUZIONE_1985_2005,ANNO_COSTRUZIONE_2005_2025,ANNO_COSTRUZIONE_Missing,...,flag_italy,COD_PROV,COD_REG,regione_nome,prov_nome,prov_abbrv,year_erogaz_prov,energy_class,COD_CAT,anno_costruzione
5806,True,False,False,False,4.549366,False,False,True,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A02,1985_2005
490,False,False,False,False,4.765153,True,False,False,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A03,1500_1965
891,False,False,False,False,4.614405,False,True,False,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A03,1965_1985
3155,False,False,False,False,4.765072,False,True,False,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,Missing_energy_class,A03,1965_1985
706,True,False,False,False,4.844382,False,False,True,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A02,1985_2005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5822,False,False,False,False,4.599750,True,False,False,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A02,1500_1965
2505,True,False,False,False,5.039247,False,True,False,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A02,1965_1985
3376,False,True,False,False,4.482504,True,False,False,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A03,1500_1965
2874,False,False,False,False,4.628155,False,True,False,False,False,...,1,042,11,Marche,Ancona,AN,2024_042,,A02,1965_1985


In [38]:
df_real = (df_real[[u for u in jl_vae.cols if u not in ["prov_abbrv"]] + ["x_norm", "y_norm"]]
           .assign(flag_air_conditioning_Missing = lambda x: x["flag_air_conditioning"] == "Missing",
                   flag_multi_floor_Missing = lambda x: x["flag_multi_floor"] == "Missing")
                   .replace("Missing",0).astype(float))

# this is the fraction of real data that I used to train the 0.95 model 
df_real95 = df_real.sample(frac = 0.95, random_state = 1111)

# real observations that have not been used to train the model
df_real_excluded = df_real.loc[~df_real.index.isin(df_real95.index)].drop(columns = ["x_norm", "y_norm", "flag_geo_valid"])

# keep all dataset with the same format
df_sample_col = df_sample[df_real_excluded.columns] + 0.
df_sample95_col = df_sample95[df_real_excluded.columns] + 0.

In [8]:
df_real95

Unnamed: 0,flag_garage,flag_pertinenza,flag_air_conditioning,flag_multi_floor,flag_geo_valid,y,x,log_mq,ANNO_COSTRUZIONE_1500_1965,ANNO_COSTRUZIONE_1965_1985,...,floor_0.0,floor_1.0,floor_2.0,floor_3.0,floor_Missing,floor_plus_4,x_norm,y_norm,flag_air_conditioning_Missing,flag_multi_floor_Missing
82281,1.0,1.0,0.0,0.0,1.0,43.726,13.163,5.220356,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.170616,0.912281,1.0,1.0
83648,1.0,0.0,0.0,0.0,1.0,43.533,13.402,4.955827,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.395735,0.065789,1.0,1.0
86005,0.0,0.0,0.0,0.0,1.0,43.339,12.911,4.477337,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.767773,-0.785088,0.0,1.0
82269,1.0,1.0,0.0,0.0,1.0,43.626,13.296,5.247024,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.144550,0.473684,1.0,1.0
83760,1.0,0.0,0.0,0.0,1.0,43.471,13.512,4.812184,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.656398,-0.206140,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83758,0.0,1.0,0.0,0.0,1.0,43.613,13.429,4.919981,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.459716,0.416667,1.0,1.0
83423,0.0,0.0,0.0,0.0,1.0,43.543,13.392,5.252273,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.372038,0.109649,1.0,1.0
82013,1.0,0.0,0.0,0.0,1.0,43.524,13.394,4.406719,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.376777,0.026316,1.0,1.0
84339,0.0,0.0,0.0,0.0,1.0,43.608,13.368,4.543295,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.315166,0.394737,1.0,1.0


In [9]:
df_real_excluded

Unnamed: 0,flag_garage,flag_pertinenza,flag_air_conditioning,flag_multi_floor,y,x,log_mq,ANNO_COSTRUZIONE_1500_1965,ANNO_COSTRUZIONE_1965_1985,ANNO_COSTRUZIONE_1985_2005,...,COD_CAT_A_01_07_08,COD_CAT_A_04_05,floor_0.0,floor_1.0,floor_2.0,floor_3.0,floor_Missing,floor_plus_4,flag_air_conditioning_Missing,flag_multi_floor_Missing
82015,0.0,0.0,0.0,0.0,43.599,13.517,4.276666,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
82019,1.0,0.0,0.0,0.0,43.344,12.919,4.736198,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
82032,0.0,0.0,0.0,0.0,43.614,13.531,4.644391,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
82044,0.0,0.0,0.0,0.0,43.600,13.516,4.804021,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
82085,0.0,0.0,0.0,0.0,43.539,13.069,4.787492,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86193,1.0,0.0,0.0,0.0,43.609,13.452,4.007333,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
86217,1.0,0.0,0.0,0.0,43.525,13.226,4.709530,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
86240,1.0,0.0,0.0,0.0,43.590,13.488,5.036953,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86258,1.0,0.0,0.0,0.0,43.625,13.382,4.875197,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
df_sample95_col

Unnamed: 0,flag_garage,flag_pertinenza,flag_air_conditioning,flag_multi_floor,y,x,log_mq,ANNO_COSTRUZIONE_1500_1965,ANNO_COSTRUZIONE_1965_1985,ANNO_COSTRUZIONE_1985_2005,...,COD_CAT_A_01_07_08,COD_CAT_A_04_05,floor_0.0,floor_1.0,floor_2.0,floor_3.0,floor_Missing,floor_plus_4,flag_air_conditioning_Missing,flag_multi_floor_Missing
5806,1.0,0.0,0.0,0.0,43.493725,13.544527,4.549366,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
490,0.0,0.0,0.0,0.0,43.560276,13.514369,4.765153,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
891,0.0,0.0,0.0,0.0,43.611180,13.526014,4.614405,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3155,0.0,0.0,0.0,0.0,43.603264,13.480945,4.765072,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
706,1.0,0.0,0.0,0.0,43.599426,13.334040,4.844382,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5822,0.0,0.0,0.0,0.0,43.624150,13.401386,4.599750,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2505,1.0,0.0,0.0,0.0,43.339783,12.903398,5.039247,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3376,0.0,1.0,0.0,0.0,43.617180,13.507017,4.482504,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2874,0.0,0.0,0.0,0.0,43.622658,13.508151,4.628155,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


# Preparation of training and testing sets

Syntehtic populations

In [39]:
# get synthehtic homes for the testing
df_syn_test = df_sample95_col.sample(n=len(df_real_excluded),random_state=42,replace = False)
df_syn_training = df_sample95_col.drop(df_syn_test.index)

df_syn_test.reset_index(drop=True,inplace=True)
df_syn_training.reset_index(drop=True,inplace=True)

# adding the label: 1 if the home is real and 0 if synthetic
df_syn_test['label'] = 0
df_syn_training['label'] = 0

Real homes

In [40]:
# drop xnorm and ynorm
df_real95 = df_real95.drop(columns=['x_norm','y_norm','flag_geo_valid'])

# get synthehtic homes for the testing
df_real_test = df_real_excluded.reset_index(drop=True)
df_real_training = df_real95.reset_index(drop=True)

# adding the label: 1 if the home is real and 0 if synthetic
df_real_test['label'] = 1
df_real_training['label'] = 1

In [41]:
# getting 80% of syn homes and 20% of real homes
# check for rpeated homes (avoid it)!!!
df_train_syn = df_syn_training.sample(n=int(np.round(len(df_syn_training)*0.80)),random_state=42).reset_index(drop=True)
df_test_syn = df_syn_test.sample(n=int(np.round(len(df_syn_test)*0.80)),random_state=42).reset_index(drop=True)

df_train_real = df_real_training.sample(n=int(np.round(len(df_real_training)*0.20)),random_state=42).reset_index(drop=True)
df_test_real = df_real_test.sample(n=int(np.round(len(df_real_test)*0.20)),random_state=42).reset_index(drop=True)

Concatenate syntehtic and real homes dataframes

In [42]:
df_training = pd.concat([df_train_syn,df_train_real])
df_test = pd.concat([df_test_syn,df_test_real])

# Classification

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score


In [44]:
# scaling delle features
scaler = StandardScaler()

Xtrain = df_training.drop(columns='label')
Ytrain = df_training.label

Xtest = df_test.drop(columns='label')
Ytest = df_test.label

Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [45]:
# random state
random_state = 42

# logistic regression
log_reg = LogisticRegression(random_state=random_state)
log_reg.fit(Xtrain,Ytrain)
y_pred = log_reg.predict(Xtest)


print("Logistic Regression, Accuracy:", np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
      ", Precision:", np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
      ", F1 score", np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
      ", recall: ", np.round(recall_score(y_true=Ytest, y_pred= y_pred),2))
#print(classification_report(Ytest, y_pred))

df_res = pd.DataFrame(columns=['Model','accuracy','precision','f1','recall','AUC-ROC','AUC_PR'])
df_res.loc[0,:] = ['Logistic regression', np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(recall_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(roc_auc_score(y_true=Ytest, y_score=y_pred),2),
                    np.round(average_precision_score(y_true=Ytest, y_score=y_pred),2)]


Logistic Regression, Accuracy: 1.0 , Precision: 1.0 , F1 score 1.0 , recall:  1.0


In [46]:
# gaussian Naive Baise
log_reg = GaussianNB()
log_reg.fit(Xtrain,Ytrain)
y_pred = log_reg.predict(Xtest)


print("Naive Bayes, Accuracy:", np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
      ", Precision:", np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
      ", F1 score", np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
      ", recall: ", np.round(recall_score(y_true=Ytest, y_pred= y_pred),2))
#print(classification_report(Ytest, y_pred))

df_res.loc[1,:] = ['Gaussian naive bayes', np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(recall_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(roc_auc_score(y_true=Ytest, y_score=y_pred),2),
                    np.round(average_precision_score(y_true=Ytest, y_score=y_pred),2)]

Naive Bayes, Accuracy: 1.0 , Precision: 1.0 , F1 score 0.99 , recall:  0.98


In [47]:
# KNeighborsClassifier

log_reg = KNeighborsClassifier()
log_reg.fit(Xtrain,Ytrain)
y_pred = log_reg.predict(Xtest)


print("KNeighbors, Accuracy:", np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
      ", Precision:", np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
      ", F1 score", np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
      ", recall: ", np.round(recall_score(y_true=Ytest, y_pred= y_pred),2))
#print(classification_report(Ytest, y_pred))

df_res.loc[2,:] = ['K-Neighbors', np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(recall_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(roc_auc_score(y_true=Ytest, y_score=y_pred),2),
                    np.round(average_precision_score(y_true=Ytest, y_score=y_pred),2)]

KNeighbors, Accuracy: 0.97 , Precision: 0.95 , F1 score 0.93 , recall:  0.91


In [48]:
# DecisionTreeClassifier
log_reg = DecisionTreeClassifier(random_state=random_state)
log_reg.fit(Xtrain,Ytrain)
y_pred = log_reg.predict(Xtest)


print("DecisionTreeClassifier, Accuracy:", np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
      ", Precision:", np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
      ", F1 score", np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
      ", recall: ", np.round(recall_score(y_true=Ytest, y_pred= y_pred),2))
#print(classification_report(Ytest, y_pred))

df_res.loc[3,:] = ['Decision Tree', np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(recall_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(roc_auc_score(y_true=Ytest, y_score=y_pred),2),
                    np.round(average_precision_score(y_true=Ytest, y_score=y_pred),2)]

DecisionTreeClassifier, Accuracy: 1.0 , Precision: 1.0 , F1 score 1.0 , recall:  1.0


In [49]:
# RandomForestClassifier

log_reg = RandomForestClassifier(random_state=random_state)
log_reg.fit(Xtrain,Ytrain)
y_pred = log_reg.predict(Xtest)


print("RandomForestClassifier, Accuracy:", np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
      ", Precision:", np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
      ", F1 score", np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
      ", recall: ", np.round(recall_score(y_true=Ytest, y_pred= y_pred),2))
#print(classification_report(Ytest, y_pred))

df_res.loc[4,:] = ['Random Forest', np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(recall_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(roc_auc_score(y_true=Ytest, y_score=y_pred),2),
                    np.round(average_precision_score(y_true=Ytest, y_score=y_pred),2)]

RandomForestClassifier, Accuracy: 1.0 , Precision: 1.0 , F1 score 1.0 , recall:  1.0


In [50]:
# get the importance of the features for the random forest classifier
importance =  log_reg.feature_importances_


feature_names = df_training.drop(columns='label').columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

print(importance_df)

                          Feature  Importance
27       flag_multi_floor_Missing    0.549234
26  flag_air_conditioning_Missing    0.264703
5                               x    0.059636
4                               y    0.038180
6                          log_mq    0.021004
13               Low_energy_class    0.006318
8      ANNO_COSTRUZIONE_1965_1985    0.005466
17                    COD_CAT_A03    0.005363
15           Missing_energy_class    0.005211
10     ANNO_COSTRUZIONE_2005_2025    0.005164
14            Medium_energy_class    0.005150
16                    COD_CAT_A02    0.004779
9      ANNO_COSTRUZIONE_1985_2005    0.004740
7      ANNO_COSTRUZIONE_1500_1965    0.004596
24                  floor_Missing    0.003270
12              High_energy_class    0.003256
0                     flag_garage    0.003186
19                COD_CAT_A_04_05    0.002244
23                      floor_3.0    0.002024
1                 flag_pertinenza    0.001792
20                      floor_0.0 

In [51]:
# SVC

log_reg = SVC()
log_reg.fit(Xtrain,Ytrain)
y_pred = log_reg.predict(Xtest)


print("SVC, Accuracy:", np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
      ", Precision:", np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
      ", F1 score", np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
      ", recall: ", np.round(recall_score(y_true=Ytest, y_pred= y_pred),2))
#print(classification_report(Ytest, y_pred))

df_res.loc[5,:] = ['SVC', np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(recall_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(roc_auc_score(y_true=Ytest, y_score=y_pred),2),
                    np.round(average_precision_score(y_true=Ytest, y_score=y_pred),2)]

SVC, Accuracy: 1.0 , Precision: 1.0 , F1 score 0.99 , recall:  0.98


In [52]:
# MLPClassifier

log_reg = MLPClassifier(random_state=random_state)
log_reg.fit(Xtrain,Ytrain)
y_pred = log_reg.predict(Xtest)


print("MLPClassifier, Accuracy:", np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
      ", Precision:", np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
      ", F1 score", np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
      ", recall: ", np.round(recall_score(y_true=Ytest, y_pred= y_pred),2))
#print(classification_report(Ytest, y_pred))

df_res.loc[6,:] = ['MLP', np.round(accuracy_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(precision_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(f1_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(recall_score(y_true=Ytest, y_pred= y_pred),2),
                    np.round(roc_auc_score(y_true=Ytest, y_score=y_pred),2),
                    np.round(average_precision_score(y_true=Ytest, y_score=y_pred),2)]

MLPClassifier, Accuracy: 1.0 , Precision: 1.0 , F1 score 1.0 , recall:  1.0


In [53]:
df_res

Unnamed: 0,Model,accuracy,precision,f1,recall,AUC-ROC,AUC_PR
0,Logistic regression,1.0,1.0,1.0,1.0,1.0,1.0
1,Gaussian naive bayes,1.0,1.0,0.99,0.98,0.99,0.98
2,K-Neighbors,0.97,0.95,0.93,0.91,0.95,0.88
3,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0
4,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0
5,SVC,1.0,1.0,0.99,0.98,0.99,0.98
6,MLP,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# results comparing the "right" stuff

df_res

Unnamed: 0,Model,accuracy,precision,f1,recall,AUC-ROC,AUC_PR
0,Logistic regression,0.85,1.0,0.44,0.28,0.64,0.42
1,Gaussian naive bayes,0.85,1.0,0.38,0.23,0.62,0.39
2,K-Neighbors,0.9,1.0,0.66,0.49,0.74,0.59
3,Decision Tree,0.83,0.57,0.62,0.7,0.78,0.46
4,Random Forest,0.94,0.94,0.85,0.77,0.88,0.77
5,SVC,0.91,1.0,0.7,0.53,0.77,0.63
6,MLP,0.92,0.86,0.8,0.74,0.86,0.7


# Check on the square meters

In [26]:
df_check_real = df_training.loc[df_training.label == 1,:]
df_check_syn = df_training.loc[df_training.label == 0,:]

In [27]:
df_check_real[['log_mq','x','y']].describe()

Unnamed: 0,log_mq,x,y
count,808.0,808.0,808.0
mean,4.73753,13.363488,43.55772
std,0.382325,0.200157,0.095178
min,3.465736,12.849,43.31
25%,4.51086,13.22,43.49275
50%,4.70953,13.427,43.585
75%,4.943422,13.515,43.616
max,6.416732,13.657,43.746


In [28]:
df_check_syn[['log_mq','x','y']].describe()

Unnamed: 0,log_mq,x,y
count,3231.0,3231.0,3231.0
mean,4.771897,13.363188,43.565128
std,0.168227,0.179679,0.088092
min,4.418053,12.796176,43.29761
25%,4.649727,13.230255,43.50377
50%,4.747623,13.400167,43.59367
75%,4.862836,13.509582,43.617815
max,5.6746,13.651083,43.7454
