In [28]:
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, strip_accents_unicode
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)

from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from skorch.callbacks import EarlyStopping, EpochScoring
from skorch.dataset import ValidSplit
from torch.optim import Adam

from custom_metrics import median_absolute_percentage_error
from custom_transformers import ColumnTransformerUnion, ListColumnExpander
from neural_networks import Module, NeuralNet

In [29]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lfbittencourt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
df = pd.read_json("dataset.json.gz")

df["is_apartment"] = (df["type"] == "Apartamento").astype(int)

df.drop(columns=["type", "images"], inplace=True)

## Data cleansing

In [31]:
df.describe()

Unnamed: 0,id,area,rooms,bathrooms,lat,lng,price,is_apartment
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,40272.228,122.526,2.431,1.967,-30.05275,-51.191685,678021.7,0.759
std,23700.082581,103.443307,0.898914,1.193036,0.040888,0.035456,671251.4,0.427904
min,77.0,20.0,0.0,1.0,-30.20908,-51.261095,128500.0,0.0
25%,19102.25,59.0,2.0,1.0,-30.071449,-51.220507,270000.0,1.0
50%,40529.0,85.0,2.0,2.0,-30.040894,-51.195782,467250.0,1.0
75%,61272.0,150.0,3.0,3.0,-30.02561,-51.167297,800000.0,1.0
max,81563.0,1000.0,7.0,7.0,-29.962873,-51.091757,4900000.0,1.0


In [32]:
def get_outliers_mask(series, threshold=1.5):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1

    return (series < q1 - threshold * iqr) | (series > q3 + threshold * iqr)

In [33]:
# POA bounding box from https://boundingbox.klokantech.com/
min_lng, min_lat, max_lng, max_lat = -51.30344, -30.26945, -51.018852, -29.932474

# Price outliers were already cleaned in the previous notebook, so we are
# cleaning them twice.
to_drop = df[
    (df["lat"] < min_lat)
    | (df["lat"] > max_lat)
    | (df["lng"] < min_lng)
    | (df["lng"] > max_lng)
    | get_outliers_mask(df["price"])
    | get_outliers_mask(df["area"])
    | get_outliers_mask(df["bathrooms"])
    | get_outliers_mask(df["rooms"])
    | ~df["description"].str.contains(r"\w")
]

df.drop(to_drop.index, inplace=True)
pd.set_option("display.float_format", lambda x: "%.2f" % x)
df.describe()

Unnamed: 0,id,area,rooms,bathrooms,lat,lng,price,is_apartment
count,880.0,880.0,880.0,880.0,880.0,880.0,880.0,880.0
mean,41427.43,95.15,2.28,1.73,-30.05,-51.19,501298.36,0.81
std,23493.95,56.08,0.78,0.9,0.04,0.04,318049.32,0.4
min,77.0,23.0,1.0,1.0,-30.21,-51.26,128500.0,0.0
25%,20409.5,57.0,2.0,1.0,-30.07,-51.22,250000.0,1.0
50%,41859.5,76.0,2.0,1.0,-30.04,-51.2,399445.0,1.0
75%,62106.75,118.0,3.0,2.0,-30.02,-51.17,666250.0,1.0
max,81563.0,286.0,4.0,5.0,-29.96,-51.09,1590000.0,1.0


In [34]:
pd.reset_option("display.float_format")

## Location grid

In [35]:
grid_width, grid_height = 28, 38

lat_range = max_lat - min_lat
lng_range = max_lng - min_lng

lat_step = lat_range / grid_height
lng_step = lng_range / grid_width


# Taken from https://www.geeksforgeeks.org/find-excel-column-name-given-number/
def x_index_to_excel_column_name(x_index):
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    if x_index < 26:
        return alpha[x_index - 1]
    else:
        q, r = x_index // 26, x_index % 26

        if r == 0:
            if q == 1:
                return alpha[r - 1]
            else:
                return x_index_to_excel_column_name(q - 1) + alpha[r - 1]
        else:
            return x_index_to_excel_column_name(q) + alpha[r - 1]


def get_location_cell(lat, lng):
    x_index = int((lng - min_lng) / lng_step)
    y_index = int((max_lat - lat) / lat_step)

    return f"{x_index_to_excel_column_name(x_index)}{y_index + 1}"

In [36]:
df["location_cell"] = df.apply(lambda x: get_location_cell(x["lat"], x["lng"]), axis=1)

df["location_cell"].value_counts()

location_cell
G12    32
K13    30
L13    19
J13    19
K10    19
       ..
L19     1
I23     1
K7      1
G4      1
I21     1
Name: count, Length: 159, dtype: int64

In [37]:
# Count values grouped by lat and lon
df.groupby(["lat", "lng"])["id"].count().reset_index(name="count").sort_values(
    by="count", ascending=False
)

Unnamed: 0,lat,lng,count
278,-30.056336,-51.198619,4
312,-30.049088,-51.214419,3
729,-30.013019,-51.162839,3
330,-30.047394,-51.183647,3
85,-30.113945,-51.217168,3
...,...,...,...
283,-30.054739,-51.147234,1
284,-30.054664,-51.222296,1
285,-30.054591,-51.180459,1
286,-30.054516,-51.161557,1


## Dataset stats

In [38]:
df

Unnamed: 0,id,area,rooms,bathrooms,lat,lng,description,price,image_weights,is_apartment,location_cell
1,79099,180,3,2,-30.026097,-51.130486,"Sobrado em condomínio fechado, 3 dormitórios, ...",790000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,Q11
2,35320,126,2,2,-30.063438,-51.208355,"CASA TIPO SOBRADO, com 2 pavimentos com possib...",360500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,I15
3,24136,55,1,1,-30.030183,-51.225697,Apartamento à venda em Porto Alegre no bairro ...,291000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,G12
4,64786,100,2,1,-30.106666,-51.223042,"Casa com 2 dormitórios, 1 banheiro social e 4 ...",350000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,G20
5,24812,170,2,3,-30.132100,-51.223583,Casa em condomínio fechado com baixo valor con...,430000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,G23
...,...,...,...,...,...,...,...,...,...,...,...
995,26419,200,3,3,-30.113767,-51.257435,Ótima residencia próxima a beira do Guaíba com...,1100000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,D21
996,41184,75,2,1,-30.091434,-51.237856,Apartamento bem ventilado de 2 dormitórios com...,350000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,F18
997,13850,49,2,1,-30.081404,-51.220110,Ótimo apartamento com 2 dormitórios á venda no...,169000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,H17
998,66735,61,2,1,-30.108442,-51.248857,Apartamento em ótima localização próximo a tod...,349000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,E20


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 880 entries, 1 to 999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             880 non-null    int64  
 1   area           880 non-null    int64  
 2   rooms          880 non-null    int64  
 3   bathrooms      880 non-null    int64  
 4   lat            880 non-null    float64
 5   lng            880 non-null    float64
 6   description    880 non-null    object 
 7   price          880 non-null    int64  
 8   image_weights  880 non-null    object 
 9   is_apartment   880 non-null    int64  
 10  location_cell  880 non-null    object 
dtypes: float64(2), int64(6), object(3)
memory usage: 82.5+ KB


In [40]:
apartments_ratio = df["is_apartment"].sum() / len(df)

print(f"Ratio of apartments: {apartments_ratio:.2%}")

Ratio of apartments: 80.68%


In [41]:
df.groupby("is_apartment")[["area", "rooms", "bathrooms", "price"]].mean()

Unnamed: 0_level_0,area,rooms,bathrooms,price
is_apartment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,163.311765,2.905882,2.435294,661880.835294
1,78.833803,2.133803,1.56338,462849.035211


In [42]:
df["is_apartment"].value_counts()

is_apartment
1    710
0    170
Name: count, dtype: int64

## Experiments

In [43]:
how_many_samples = -1  # -1 for all
k_folds = 10
verbose = 1
svd_dimensions = 30

nn_learning_rate = 0.001
nn_max_epochs = 100
nn_dropout_rate = 0
nn_verbose = 1

In [44]:
# This is my toolbelt for the pipeline

min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

stripped_stopwords = list(map(strip_accents_unicode, stopwords.words("portuguese")))
vectorizer = TfidfVectorizer(
    binary=True,
    min_df=0.001,
    stop_words=stripped_stopwords,
    strip_accents="unicode",
)

expander = ListColumnExpander()

truncater = TruncatedSVD(n_components=svd_dimensions)
truncater_double = TruncatedSVD(n_components=svd_dimensions * 2)

uniter = ColumnTransformerUnion(
    [
        (expander, "image_weights"),
        (vectorizer, "description"),
    ]
)

encoder = OneHotEncoder(handle_unknown="ignore")

In [45]:
all_modalities = {
    "structural": [
        (["area", "rooms", "bathrooms"], min_max_scaler),
        (["is_apartment"], "passthrough"),
    ],
    "location": [
        (["location_cell"], encoder),
    ],
    "image": [
        ("image_weights", expander),
    ],
    "text": [
        ("description", [vectorizer, truncater, min_max_scaler]),
    ],
    "image/text": [
        (["image_weights", "description"], [uniter, truncater_double, min_max_scaler]),
    ],
}

In [46]:
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

if use_cuda:
    print("Using CUDA")
else:
    print("Using CPU")

Using CPU


In [47]:
metrics = {
    "mape": make_scorer(mean_absolute_percentage_error),
    "mdape": make_scorer(median_absolute_percentage_error),
    "r2": make_scorer(r2_score),
    "mae": make_scorer(mean_absolute_error),
    "mse": make_scorer(mean_squared_error),
}

regressors = [
    RandomForestRegressor(n_jobs=-1, random_state=42),
    ExtraTreesRegressor(n_jobs=-1, random_state=42),
    # XGBRegressor(n_jobs=-1, random_state=42),
    # LGBMRegressor(n_jobs=-1, random_state=42),
    # CatBoostRegressor(random_state=42),
    NeuralNet(
        module=Module,
        optimizer=Adam,
        lr=nn_learning_rate,  # learning rate
        max_epochs=nn_max_epochs,
        device=device,
        verbose=nn_verbose,
        train_split=ValidSplit(k_folds),
        module__dropout_rate=nn_dropout_rate,
        callbacks=[
            *[EpochScoring(scoring, name=name) for name, scoring in metrics.items()],
            EarlyStopping(
                monitor="valid_loss",
                patience=10,
                threshold=0.001,  # 0,1% improvement
                threshold_mode="rel",
                load_best=True,
            ),
        ],
    ),
]

experiments = [
    ["structural"],
    ["location"],
    ["image"],
    ["text"],
    ["structural", "location"],
    ["structural", "image"],
    ["structural", "text"],
    ["structural", "location", "image"],
    ["structural", "location", "text"],
    ["structural", "image", "text"],
    ["structural", "location", "image", "text"],
    # ["structural", "location", "image/text"],
    # ["structural", "image/text"],
]

In [48]:
def get_pipeline(modalities, regressor):
    X_columns = []
    all_transformers = []

    for modality in modalities:
        for index, (columns, transformers) in enumerate(all_modalities[modality], 1):
            if isinstance(columns, list):
                X_columns.extend(columns)
            else:
                X_columns.append(columns)

            modality_name = (
                modality
                if len(all_modalities[modality]) == 1
                else f"{modality}-{index}"
            )

            if isinstance(transformers, list):
                all_transformers.append(
                    (modality_name, make_pipeline(*transformers), columns)
                )
            else:
                all_transformers.append((modality_name, transformers, columns))

    pipeline = Pipeline(
        steps=[
            (
                "transformer",
                ColumnTransformer(all_transformers),
            ),
            (
                "regressor",
                TransformedTargetRegressor(
                    regressor=regressor,
                    transformer=MinMaxScaler(),
                ),
            ),
        ]
    )

    # The set trick is to remove duplicates
    return pipeline, [*set(X_columns)]

In [58]:
pipeline, X_columns = get_pipeline(
    ["structural", "location", "image", "text"],
    regressors[0],
)

pipeline

In [50]:
X_columns

['image_weights',
 'area',
 'description',
 'is_apartment',
 'bathrooms',
 'location_cell',
 'rooms']

In [52]:
pipeline.fit(df.sample(100, random_state=42), df["price"].sample(100, random_state=42))

pipeline.steps[-1][1].n_features_in_

2207

In [53]:
if how_many_samples > 0:
    df = df.sample(how_many_samples)

In [54]:
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
data = {}

for modalities in reversed(experiments):
    for regressor in regressors:
        experiment_name = " + ".join(modalities) + " + " + regressor.__class__.__name__
        pipeline, X_columns = get_pipeline(modalities, regressor)

        print("Running experiment:", experiment_name)

        X = df[X_columns]
        y = df["price"]

        results = cross_validate(
            pipeline,
            X=X,
            y=y,
            cv=kfold,
            n_jobs=-1 if nn_verbose == 0 else None,
            verbose=verbose,
            scoring=metrics,
        )

        data[experiment_name] = [
            value
            for key, value in results.items()
            if key != "fit_time" and key != "score_time"
        ]

        print("Mean MAPE: ", np.mean(results["test_mape"]))

df_results = pd.DataFrame.from_dict(data, orient="index", columns=metrics.keys())

print("Done!")

Running experiment: structural + location + image + text + RandomForestRegressor
Mean MAPE:  0.2877240323998995
Running experiment: structural + location + image + text + ExtraTreesRegressor
Mean MAPE:  0.2774782508410103
Running experiment: structural + location + image + text + NeuralNet
  epoch     mae    mape    mdape     mse       r2    train_loss    valid_loss     dur
-------  ------  ------  -------  ------  -------  ------------  ------------  ------
      1  [36m0.1795[0m  [32m3.7479[0m   [35m0.5787[0m  [31m0.0595[0m  [94m-0.1704[0m        [36m0.0966[0m        [32m0.0595[0m  0.0422
      2  0.1989  7.0855   0.6111  [31m0.0542[0m  -0.0650        [36m0.0526[0m        [32m0.0542[0m  0.0146
      3  [36m0.1687[0m  4.8659   [35m0.5590[0m  [31m0.0469[0m  0.0786        [36m0.0470[0m        [32m0.0469[0m  0.0154
      4  [36m0.1624[0m  4.4003   [35m0.5288[0m  [31m0.0459[0m  0.0983        [36m0.0422[0m        [32m0.0459[0m  0.0156
      5  [36m0

In [55]:
compare_to = []

for metric in metrics.keys():
    agg_func = np.max if metric == "r2" else np.min
    df_results[f"{metric}_mean"] = df_results[metric].apply(lambda x: np.mean(x))
    df_results[f"{metric}_std"] = df_results[metric].apply(lambda x: np.std(x))

    for experiment in compare_to:
        df_results[f"{metric}_over_{experiment}"] = (
            df_results[f"{metric}_mean"] / df_results.loc[experiment, f"{metric}_mean"]
            - 1
        )

df_results.drop(columns=metrics.keys(), inplace=True)

df_results

Unnamed: 0,mape_mean,mape_std,mdape_mean,mdape_std,r2_mean,r2_std,mae_mean,mae_std,mse_mean,mse_std
structural + location + image + text + RandomForestRegressor,0.287724,0.036257,0.221397,0.033514,0.633334,0.06713,134502.668091,18780.784879,36364070000.0,9514310000.0
structural + location + image + text + ExtraTreesRegressor,0.277478,0.034833,0.21831,0.033151,0.647674,0.063199,130545.779398,15569.738784,34667000000.0,8146280000.0
structural + location + image + text + NeuralNet,0.280943,0.031087,0.196981,0.027872,0.664808,0.078557,125034.360707,15569.319618,32555480000.0,7556137000.0
structural + image + text + RandomForestRegressor,0.289919,0.033654,0.231966,0.026627,0.633426,0.061043,135017.54742,17365.677062,36385940000.0,9397394000.0
structural + image + text + ExtraTreesRegressor,0.286634,0.035907,0.230352,0.034032,0.635223,0.071092,133059.337284,15458.112029,35736730000.0,8001548000.0
structural + image + text + NeuralNet,0.254979,0.025451,0.201743,0.032382,0.697514,0.038762,119440.56883,11775.291443,29976430000.0,6656458000.0
structural + location + text + RandomForestRegressor,0.278893,0.033237,0.215025,0.033305,0.641366,0.074223,132172.372568,18955.058799,35450960000.0,9390152000.0
structural + location + text + ExtraTreesRegressor,0.258871,0.033815,0.208052,0.028596,0.671447,0.067263,123783.764136,16777.508323,32462760000.0,8826048000.0
structural + location + text + NeuralNet,0.278058,0.034418,0.209417,0.02341,0.687535,0.081129,123610.00498,14456.581367,30219860000.0,7440885000.0
structural + location + image + RandomForestRegressor,0.328907,0.034469,0.269121,0.026088,0.567953,0.060765,149721.376773,17218.949157,42903960000.0,10774120000.0


In [56]:
df_results.loc[:, df_results.columns.str.startswith("mape")].sort_values(by="mape_mean")

Unnamed: 0,mape_mean,mape_std
structural + image + text + NeuralNet,0.254979,0.025451
structural + location + text + ExtraTreesRegressor,0.258871,0.033815
structural + location + image + text + ExtraTreesRegressor,0.277478,0.034833
structural + location + text + NeuralNet,0.278058,0.034418
structural + location + text + RandomForestRegressor,0.278893,0.033237
structural + text + NeuralNet,0.280233,0.028589
structural + location + image + text + NeuralNet,0.280943,0.031087
structural + text + ExtraTreesRegressor,0.281798,0.032129
structural + text + RandomForestRegressor,0.283999,0.033707
structural + image + text + ExtraTreesRegressor,0.286634,0.035907


In [57]:
for metric in metrics.keys():
    agg_func = np.argmax if metric == "r2" else np.argmin

    print(f"Best {metric}:", df_results.index[agg_func(df_results[f"{metric}_mean"])])

Best mape: structural + image + text + NeuralNet
Best mdape: structural + location + image + text + NeuralNet
Best r2: structural + image + text + NeuralNet
Best mae: structural + image + text + NeuralNet
Best mse: structural + image + text + NeuralNet
