In [1]:
import pandas as pd
import numpy as np
import torch
import sys
sys.path += ["../src"]
import jl_vae
import jl_nflows_geo_coordinates_2 as nfg
from jl_nflows_geo_coordinates import load_nf as load_dict
import utils
import config
import pickle
from tqdm import tqdm
from time import time
import geopandas as gpd
import gzip
import json
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
from scipy.special import expit as sigmoid

In [2]:
# --- Normalizing-flow (NF) settings ------------------------------------------
flow_name = "NSF_CL"
opt_nf = "Adam"
num_layers_nf, hidden_features_nf = 32, 32
num_epochs_nf = 20000
batch_dim_nf = 100
# The learning rate is entered in millionths and converted to its real value.
lr_nf_ = 10
lr_nf = lr_nf_ / 1000000
# Tag string; not referenced by any other visible cell.
_ = "nf"

# --- Variational auto-encoder (VAE) settings ---------------------------------
opt_vae = "Adam"
lr_vae = 0.001
epochs_vae = 100
latent_dims_vae = 20
# Sizes of the hidden layers that follow the input layer.
middle_hidden_dims_vae = [64, 32]

# geo_data_dict = jl_vae.load_geo_data()
# abm_path = "ISP_data_for_ABM/ISP_ABM_up_to_2024_08_0001.csv"
# data = pd.read_csv(jl_vae.path_intermediate + abm_path, index_col = 0)


In [3]:
def read_gz(city, fn):
    """Read a gzip-compressed CSV from a city's Airbnb folder.

    Parameters
    ----------
    city : str
        Name of the city sub-folder below
        ``config.INTERMEDIATE_DATA_PATH + "airbnb/"``.
    fn : str
        Name of the gzip-compressed CSV file inside the city folder.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    path = config.INTERMEDIATE_DATA_PATH + "airbnb/" + city + "/" + fn
    # pandas does not close a file object it did not open itself, so the gzip
    # handle is managed here and released as soon as parsing is done.
    with gzip.open(path, mode='rb') as handle:
        return pd.read_csv(handle)
def read_json(city):
    """Load ``neighbourhoods.geojson`` from a city's Airbnb folder.

    Parameters
    ----------
    city : str
        Name of the city sub-folder below
        ``config.INTERMEDIATE_DATA_PATH + "airbnb/"``.

    Returns
    -------
    The object produced by ``geopandas.read_file`` for that file.
    """
    geojson_path = config.INTERMEDIATE_DATA_PATH + "airbnb/" + city + "/neighbourhoods.geojson"
    return gpd.read_file(geojson_path)


In [51]:
city = "copenhagen"
# Pass a bare file name: read_gz already inserts the "/" after the city folder,
# so a leading slash here would produce ".../copenhagen//listings.csv.gz".
df_full = read_gz(city, fn = "listings.csv.gz")
# gdf_full = read_json(city)
# Each "amenities" cell is a JSON-encoded list; decode it once per row.
df_full["amenities"] = [json.loads(u) for u in df_full["amenities"]]
amenities_vars = ["Air conditioning", "Elevator", "Self check-in", "Pets allowed", "Private living room", "Backyard", "Pool"]
int_vars = ['accommodates', 'bathrooms','bedrooms', 'beds']

# One boolean indicator column per amenity of interest.
for var in amenities_vars:
    df_full[var] = [var in u for u in df_full["amenities"]]

# Keep only the modelling columns, rename the coordinates to x / y and drop
# every row that has a missing value in any of the kept columns.
df = df_full[['id', #'neighbourhood_cleansed', 
              'latitude', 'longitude', #'property_type',
              'room_type', 'accommodates', 'bathrooms',
              'bedrooms', 'beds', 'price',
              'review_scores_rating', 'reviews_per_month'] + amenities_vars].rename(columns = {"latitude": "y", "longitude": "x"}).dropna()
# Truncates fractional bathroom counts (e.g. 1.5 -> 1).
df["bathrooms"] = df["bathrooms"].astype(int)
# Price parsing assumes strings shaped like "$1,234.00": drop the leading
# currency symbol, keep the part before the first ".", remove thousands commas.
df["price"] = (
    df["price"].str[1:]
    .str.split(".").str[0]
    .str.replace(",", "", regex = False)
    .astype(float)
)
# log(0) is -inf and log of a negative number is NaN; either would poison the
# scalers and the VAE downstream, so only strictly positive prices are kept.
df = df[df["price"] > 0].copy()
df["log_price"] = np.log(df["price"])
# One-hot encode room_type (columns prefixed "roomtype_") and index rows by listing id.
df = pd.concat([df.drop(columns = ["room_type"]), pd.get_dummies(df["room_type"], prefix = "roomtype")], axis = 1).set_index("id")
# Min-max scale the coordinates to [-1, 1]; `scaler` is kept for later reuse.
scaler = MinMaxScaler((-1,1))
df[["x_norm", "y_norm"]] = scaler.fit_transform(np.array(df[["x", "y"]]))

In [50]:
# The sigmoid-squashed count columns are stored in their own frame instead of
# overwriting df[int_vars]: the in-place version compounded the transform on
# every re-run, and the integer bedroom counts displayed further down show that
# the recorded results were produced from the raw counts.
df_int_sigmoid = sigmoid(df[int_vars])

In [5]:
# Normalizing-flow hyper-parameters gathered in one dictionary.
# Note that the batch size is stored under the key "batch_dim".
model_settings = dict(
    num_layers_nf = num_layers_nf,
    hidden_features_nf = hidden_features_nf,
    num_epochs_nf = num_epochs_nf,
    lr_nf = lr_nf,
    opt_nf = opt_nf,
    flow_name = flow_name,
    batch_dim = batch_dim_nf,
)


In [6]:
# City and date tag that identify the stored normalizing-flow model
# (the tag is presumably YYMMDD — TODO confirm).
city, date = "copenhagen", "250703"
# NOTE(review): absolute, machine-specific base directory.
airbnb_path = "/data/housing/data/intermediate/jl_pop_synth/airbnb/"
path_nf = "{}nf_models/{}_{}.pkl".format(airbnb_path, city, date)

In [7]:
# Load the stored normalizing-flow bundle; later cells read its "flow" entry.
# NOTE(review): the file has a .pkl extension — if load_nf unpickles it, only
# load files from a trusted source.
nf_dict = load_dict(path_nf)

In [8]:
from jl_synthetic_pop_all_vae_provinces_airbnb import get_df_city_for_nf

In [52]:
# df = get_df_city_for_nf(city)

In [53]:
# Push the [-1, 1]-scaled coordinates through the loaded flow.
coords_norm = torch.tensor(np.array(df[["x_norm", "y_norm"]]), dtype = torch.float32)
inv_coord = nf_dict["flow"].flow.forward(coords_norm)

In [54]:
# inv_coord[0][-1] is presumably the output of the last flow layer (TODO confirm
# against the flow's forward()); a sigmoid maps it into (0, 1) before it is
# stored as the latent coordinates.
latent_xy = torch.sigmoid(inv_coord[0][-1])
df[["x_latent", "y_latent"]] = latent_xy.detach().numpy()

In [90]:
import seaborn as sns

In [124]:
# Min-max scale every column of df to [-1, 1]; the fitted scaler is kept so that
# samples can be mapped back to the original units later on.
scaler_df = MinMaxScaler((-1,1))
scaled_values = scaler_df.fit_transform(np.array(df))
df_scaled = pd.DataFrame(scaled_values, index = df.index, columns = df.columns)

In [41]:
from scipy.special import expit as sigmoid
from scipy.special import logit

In [89]:
# Model input: every column of df except the raw "price" (log_price stays in).
# NOTE(review): despite its name this frame is not rescaled here, and it
# replaces the min-max scaled df_scaled built in an earlier cell.
df_scaled = df.drop("price", axis = "columns").copy()
# df_scaled[int_vars + ["price", "review_scores_rating", "reviews_per_month"]] = sigmoid(df_scaled[int_vars + ["price", "review_scores_rating", "reviews_per_month"]])

In [91]:
# First entry of hidden_dims: the number of columns left once the raw and
# scaled coordinates are removed; the configured middle layers follow.
n_input_features = df_scaled.drop(columns = ["x", "y", "x_norm", "y_norm"]).shape[1]
hidden_dims_vae = [n_input_features, *middle_hidden_dims_vae]

In [92]:
# Build the VAE on the float32 version of the model-input frame.
# NOTE(review): latent_dims is a literal 15 here, while the settings cell
# defines latent_dims_vae = 20, which this call does not use — confirm which
# value is intended.
vae = jl_vae.VariationalAutoencoder(
    full_df = df_scaled.astype(np.float32),
    latent_dims = 15,
    hidden_dims = hidden_dims_vae,
)

# Training options; the trailing comments keep the previously tried values.
train_options = dict(
    epochs = epochs_vae,
    lr = lr_vae,
    hide_tqdm = False,
    verbose = False,
    opt_name = opt_vae,
    batch_size = 100,
    weight_reconstruction = 10, #6,
    weight_kl = 0.1,
    weights_geo = 30, # 20,
    kl_annealing = False,
)
vae.train(**train_options)

100%|██████████| 100/100 [01:14<00:00,  1.35it/s]


In [79]:
# Encode the non-coordinate columns with the trained encoder.
encoder_input = df_scaled.drop(columns = ["x", "y", "x_norm", "y_norm"]).astype(np.float32)
out = vae.encoder(torch.tensor(np.array(encoder_input)))

In [87]:
# Diagnostic: per output column, how many rows fall below -1e8.
out.lt(-100_000_000).sum(dim = 0)

tensor([ 388,  123, 1479,    0,    0,  478,   54, 5307, 3485, 4740, 3377,  484,
        2265,    0, 1321])

In [81]:
# Decode the encoder output again. The recorded output below contains NaN rows
# and magnitudes up to ~1e22, i.e. the round trip was numerically unstable in
# that run.
vae.decoder(out)

tensor([[        nan,         nan,         nan,  ...,         nan,
                 nan,         nan],
        [        nan,         nan,         nan,  ...,         nan,
                 nan,         nan],
        [ 1.6089e+06,  8.7789e+06, -6.5103e+06,  ..., -1.5371e+07,
          7.9808e+06, -4.3822e+06],
        ...,
        [ 9.5992e+21, -8.2911e+21, -4.6615e+21,  ..., -3.4763e+21,
         -1.0228e+22,  9.8333e+21],
        [ 9.2071e+14,  5.0239e+15, -3.7250e+15,  ..., -8.7983e+15,
          4.5696e+15, -2.5091e+15],
        [ 1.5964e+07,  8.7107e+07, -6.4594e+07,  ..., -1.5253e+08,
          7.9200e+07, -4.3490e+07]], grad_fn=<AddmmBackward0>)

In [93]:
# Draw a synthetic sample from the trained VAE; later cells index the result by
# column name (e.g. "bedrooms").
# NOTE(review): no random seed is set in the visible cells, so the sample is
# presumably not reproducible across runs — confirm inside jl_vae.
df_sample_ = vae.get_sample_from_vae()

In [106]:
# Share of each (integer-truncated) bedroom count in the synthetic sample.
sampled_bedrooms = df_sample_[int_vars]["bedrooms"].astype(int)
sampled_bedrooms.value_counts(normalize = True)

bedrooms
1    0.450020
0    0.232083
2    0.170826
3    0.094622
4    0.034566
5    0.012945
6    0.004404
7    0.000400
9    0.000133
Name: proportion, dtype: float64

In [107]:
# Share of each (integer-truncated) bedroom count in the observed listings,
# for comparison with the synthetic sample.
observed_bedrooms = df[int_vars]["bedrooms"].astype(int)
observed_bedrooms.value_counts(normalize = True)

bedrooms
1    0.574095
2    0.244990
3    0.124288
4    0.035944
0    0.012686
5    0.006251
6    0.001471
7    0.000184
8    0.000092
Name: proportion, dtype: float64

In [138]:
# Map the sampled values back to the original units with the scaler fitted on df.
# NOTE(review): scaler_df was fitted on all columns of df, so df_sample_ must
# have the same columns in the same order — confirm.
inv_sample = scaler_df.inverse_transform(df_sample_)
df_sample = pd.DataFrame(inv_sample)
df_sample.columns = df.columns

In [36]:
import seaborn as sns

In [None]:
# Cast the count-like columns of the sample to integers (astype(int) truncates
# towards zero). The column is spelled "accommodates" in the data; the previous
# spelling "accomodates" raised a KeyError.
count_columns = ["accommodates", "bathrooms", "bedrooms", "beds"]
df_sample[count_columns] = df_sample[count_columns].astype(int)

Unnamed: 0,y,x,accommodates,bathrooms,bedrooms,beds,price,review_scores_rating,reviews_per_month,Air conditioning,...,Backyard,Pool,Entire home/apt,Hotel room,Private room,Shared room,x_norm,y_norm,y_latent,x_latent
0,55.628178,12.492668,3.102011,0.507049,0.075355,12.673265,771.743408,0.952903,-0.138725,-0.032224,...,1.000240,-0.007602,-0.001853,0.000134,0.679684,0.708434,-0.166264,-0.281264,0.424970,0.363533
1,55.642300,12.498774,4.490563,0.926947,0.055022,12.538538,732.994934,1.088478,93.107170,1.010842,...,1.002436,-0.007031,0.000046,-0.002292,0.880003,0.895071,0.580282,0.520012,0.788264,0.754004
2,55.637516,12.496356,4.083116,0.773008,0.123996,12.596318,812.294983,1.041429,1.371406,0.001127,...,0.999104,-0.001330,-0.000957,0.003067,0.754288,0.580771,-0.676917,0.017151,0.176469,0.508954
3,55.644543,12.497350,4.636491,0.971065,-0.010907,12.693157,23.995668,0.953632,93.859100,1.014754,...,1.005349,-0.020015,-0.009963,-0.016447,0.835607,0.819201,0.276803,0.342429,0.640581,0.667465
4,55.633369,12.496119,3.688221,0.681465,0.141201,12.620402,1356.391724,1.045996,15.631618,0.002905,...,0.990801,0.007052,-0.000144,0.008761,0.857430,0.579600,-0.681601,0.429719,0.174189,0.710003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619,55.642090,12.497743,4.222061,0.857634,0.126474,12.352087,2399.484131,1.160419,88.709236,0.983852,...,0.991797,0.006941,0.013612,0.010400,0.868479,0.702834,-0.188662,0.473917,0.414070,0.731541
1620,55.652241,12.502990,5.779087,1.294213,0.210562,12.405333,2183.365234,1.077652,17.190065,0.624362,...,0.997362,0.003960,0.004296,0.010585,0.597210,0.882578,0.530313,-0.611159,0.763948,0.202772
1621,55.634197,12.496964,3.862152,0.699840,0.147825,12.579370,1410.565796,1.066888,2.059445,0.007608,...,0.996468,0.003835,0.006077,0.006290,0.940688,0.525480,-0.898080,0.762753,0.068843,0.872294
1622,55.638100,12.496852,4.058230,0.818055,0.110173,12.506594,1651.319702,1.029127,0.955438,0.985603,...,0.995735,-0.002673,0.002451,0.003762,0.613202,0.905813,0.623253,-0.547192,0.809176,0.233944


In [110]:
# Display the previously exported synthetic population for this city / date tag.
synthetic_pop_path = airbnb_path + f"pop_samples/synthetic_pop_{city}_{date}.csv"
pd.read_csv(synthetic_pop_path, index_col = 0)

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,review_scores_rating,reviews_per_month,Air conditioning,Elevator,Self check-in,Pets allowed,...,roomtype_Entire home/apt,roomtype_Hotel room,roomtype_Private room,roomtype_Shared room,y_latent,x_latent,x_norm,y_norm,x,y
1,2.069774,0.966806,1.029504,1.217879,5.141025,3.194059,0.026138,0.200122,0.483326,0.192440,...,True,False,False,False,0.374365,0.562070,0.106442,-0.036400,12.558126,55.671940
2,1.345204,0.872945,0.733440,0.563242,3.948029,2.158900,0.007136,0.164630,0.225772,0.027561,...,False,False,True,False,0.252977,0.495285,-0.073467,-0.047292,12.541697,55.671303
4,3.617707,0.961710,1.644988,2.268407,3.894419,0.872584,0.041048,0.181730,0.280642,0.047198,...,True,False,False,False,0.563936,0.729683,0.385850,0.070440,12.583639,55.678180
7,1.467247,0.776385,0.821175,0.750313,3.853836,3.679423,0.103956,0.221951,0.227547,-0.035672,...,False,False,True,False,0.493160,0.622182,0.248394,-0.005737,12.571088,55.673730
9,2.615838,0.979543,1.155541,1.406724,5.181335,1.120210,0.036340,0.178226,0.249108,0.098058,...,True,False,False,False,0.635039,0.356778,0.027666,0.282533,12.550932,55.690567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6.069078,1.287053,2.673795,3.336741,5.744091,1.078309,0.007226,0.452841,0.445423,0.196759,...,True,False,False,False,0.285316,0.943165,0.227656,-0.393630,12.569195,55.651077
99996,2.404950,1.233630,1.085642,1.105358,5.699028,0.921092,0.013085,0.151212,0.373500,0.164312,...,True,False,False,False,0.555972,0.361802,0.025188,0.244919,12.550707,55.688370
99997,2.337443,1.017903,1.141207,1.084507,4.492704,1.432993,-0.013006,0.187943,0.245530,0.041458,...,True,False,False,False,0.211480,0.540806,-0.086144,-0.067847,12.540540,55.670100
99998,4.933835,1.301402,2.167798,3.256877,5.690404,0.741627,-0.017109,0.021801,0.407036,0.171002,...,True,False,False,False,0.741049,0.804006,0.499901,0.099478,12.594053,55.679874
