In [1]:
# get rid of the annoying ipython import cache
%load_ext autoreload
%autoreload 2

In [2]:
# Python
from __future__ import annotations
# Internal
from research.core.preprocessing import custom
# External
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Typing
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from typing import List, Tuple, Any

In [3]:
# Load data objects (clean, optimized)
poi: 'pd.DataFrame' = pd.read_pickle('core/loading/poi.pkl.gzip')
listings: 'pd.DataFrame' = pd.read_pickle('core/loading/listings.pkl.gzip')
transformers: 'List[Tuple[str, Any, List[str]]]' = list()

## Missing Values
Check how many missings values do we have.
Looks like the data is quite clean, the only missings are in the 'cheques' (transacted is a target variable).

In [4]:
# Handle missing values
listings.isna().sum()[listings.isna().sum() != 0]

cheques       41557
transacted       12
dtype: int64

In [5]:
poi.isna().sum()[poi.isna().sum() != 0]

Series([], dtype: int64)

## Cheques
Cheques is non-negative, it makes perfect sense to represent absence of data as 0's.

In [6]:
listings['cheques'].describe()

count    18443.000000
mean         2.900233
std          2.236097
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         12.000000
Name: cheques, dtype: float64

Start building the transformation pipeline. That is important to have pipeline object (pretrained)
ready, to use it later for model selection and on the production server.

Here, at first we impute 0's. Then leave the column untouched (to use it as a feature).
Any scaling will be applied AFTER feature engineering step (as on of the latest steps of the pipeline).

In [7]:
transformers.append(('cheques',
                     custom.PipelineWithNames(
                         [('impute', custom.SimpleImputerWithNames(strategy='constant', fill_value=0)),
                          ('untouched', custom.UntouchedTransformer())]
                     ),
                     ['cheques']))
transformers

[('cheques',
  PipelineWithNames(steps=[('impute',
                            SimpleImputerWithNames(fill_value=0,
                                                   strategy='constant')),
                           ('untouched', UntouchedTransformer())]),
  ['cheques'])]

## From-To
Add new features:
- How many days the listing was valid
- Which month the property was added to the list
- Which quarter the property was added to the list

In [8]:
transformers.append(('timing',
                    custom.TimingTransformer(start='valid_from', end='valid_to'),
                    ['valid_from', 'valid_to']))
custom.TimingTransformer(start='valid_from', end='valid_to')\
    .fit_transform(listings[['valid_from', 'valid_to']]).head(5)

Unnamed: 0_level_0,days,quarter,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6406935,8,4,10
5949553,106,4,10
6389258,2,4,10
6467389,45,4,10
6479131,23,4,10


## Offer Type
Offer type is not ordinal (offer type 1 is not worse than type 2)
OneHotEncoding is required

In [9]:
transformers.append(('offer_type',
                     OneHotEncoder(sparse=False, handle_unknown='error'),
                     ['offer_type']))
OneHotEncoder(sparse=False, handle_unknown='error').fit_transform(listings[['offer_type']])

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

## Bedrooms & Bathrooms
Bedrooms and bathrooms are definitely ordinal (2 is better than 1)
Add 3 features:
- Bathrooms
- Bedrooms
- Bathrooms/Bedrooms ratio

In [10]:
transformers.append(('bedrooms',
                     custom.UntouchedTransformer(),
                     ['bedrooms']))
transformers.append(('bathrooms',
                     custom.UntouchedTransformer(),
                     ['bathrooms']))
transformers.append(('bathrooms_bedrooms',
                     custom.BathBedRatioTransformer('bathrooms', 'bedrooms'),
                     ['bathrooms', 'bedrooms']))
custom.BathBedRatioTransformer('bathrooms', 'bedrooms')\
    .fit_transform(listings[['bathrooms', 'bedrooms']]).head(5)

Unnamed: 0_level_0,ratio
id,Unnamed: 1_level_1
6406935,1.333333
5949553,1.0
6389258,1.0
6467389,1.5
6479131,1.5


## SqFt
The feature stays unchanged

In [11]:
transformers.append(('sqft',
                     custom.UntouchedTransformer(),
                     ['sqft']))

## GeoSpatial
Area, most likely, affects the price.
Use Uber's H3 hexagons to split by area. (More on h3: https://github.com/uber/h3)
Add 4 new features:
- Hex-Resolution 6
- Hex-Resolution 7
- Hex-Resolution 8
- Hex-Resolution 9
Example of the hexagons: https://observablehq.com/@sw1227/h3-index-visualizer

In [12]:
transformers.append(('h3_areas',
                     custom.H3AreasTransformer(),
                     ['latitude', 'longitude']))
custom.H3AreasTransformer().fit_transform(listings[['latitude', 'longitude']]).head(5)

Unnamed: 0_level_0,hexagon_6,hexagon_7,hexagon_8,hexagon_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6406935,604672105132326911,609175704692588543,613679304317861887,618182903943397375
5949553,604672104863891455,609175704407375871,613679304030552063,618182903656611839
6389258,604672103790149631,609175703367188479,613679302990364671,618182902617473023
6467389,604672900909236223,609176500435943423,613680100061216767,618183699687276543
6479131,604672105266544639,609175704776474623,613679304401747967,618182904027283455


## Price per SqFt per Area
! Attention ! Here we must be very very careful. Aggregations with target variables is
one of the MOST frequent reasons of Data Leaks.
For example in the pipeline we compute Global Average for the whole Training Set. Then, during
the Cross Validation, the information about the validation sets is gonna be leaked (coz it's already stored
in the aggregation - average).
I've seen this mistake so many times on Kaggle.

In [13]:
transformers.append(('h3_ppa',
                     custom.H3PricePerAreaTransformer('price', 'sqft'),
                     ['latitude', 'longitude', 'price', 'sqft']))
custom.H3PricePerAreaTransformer('price', 'sqft')\
    .fit_transform(listings[['latitude', 'longitude', 'price', 'sqft']]).head(5)

Unnamed: 0_level_0,hexagon_6_ppa,hexagon_7_ppa,hexagon_8_ppa,hexagon_9_ppa
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6406935,1163.0,1120.0,1184.0,2278.0
5949553,4321.0,5716.0,18031.0,1150.0
6389258,542.0,445.0,509.0,511.0
6467389,1141.0,898.0,937.0,566.0
6479131,1225.0,856.0,1477.0,1498.0


## Objects Nearby
We have list of important objects (metro, tram etc).
Add new features: how many important objects are nearby (in the same Hexagon with the property)

In [14]:
transformers.append(('h3_nearby',
                     custom.H3NearbyTransformer(),
                     ['latitude', 'longitude']))
custom.H3NearbyTransformer().fit_transform(listings[['latitude', 'longitude']]).head(5)

Unnamed: 0_level_0,landmark_hexagon_6,metro_hexagon_6,tram_hexagon_6,landmark_hexagon_7,metro_hexagon_7,tram_hexagon_7,landmark_hexagon_8,metro_hexagon_8,tram_hexagon_8,landmark_hexagon_9,metro_hexagon_9,tram_hexagon_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6406935,3,5,0,1,0,0,0,0,0,0,0,0
5949553,0,0,0,0,0,0,0,0,0,0,0,0
6389258,0,0,0,0,0,0,0,0,0,0,0,0
6467389,0,4,0,0,0,0,0,0,0,0,0,0
6479131,2,3,0,0,0,0,0,0,0,0,0,0


## Pipeline

#### Transformers

In [15]:
transformers

[('cheques',
  PipelineWithNames(steps=[('impute',
                            SimpleImputerWithNames(fill_value=0,
                                                   strategy='constant')),
                           ('untouched', UntouchedTransformer())]),
  ['cheques']),
 ('timing',
  TimingTransformer(end='valid_to', start='valid_from'),
  ['valid_from', 'valid_to']),
 ('offer_type', OneHotEncoder(sparse=False), ['offer_type']),
 ('bedrooms', UntouchedTransformer(), ['bedrooms']),
 ('bathrooms', UntouchedTransformer(), ['bathrooms']),
 ('bathrooms_bedrooms',
  BathBedRatioTransformer(bathrooms='bathrooms', bedrooms='bedrooms'),
  ['bathrooms', 'bedrooms']),
 ('sqft', UntouchedTransformer(), ['sqft']),
 ('h3_areas', H3AreasTransformer(), ['latitude', 'longitude']),
 ('h3_ppa',
  H3PricePerAreaTransformer(price='price', sqft='sqft'),
  ['latitude', 'longitude', 'price', 'sqft']),
 ('h3_nearby', H3NearbyTransformer(), ['latitude', 'longitude'])]

#### Final Pipeline & Feature Names

In [16]:
pipeline = ColumnTransformer(transformers, remainder='drop', n_jobs=-1)
pipeline.fit(listings)
pipeline.get_feature_names()

['cheques__untouched',
 'timing__days',
 'timing__quarter',
 'timing__month',
 'offer_type__x0_1',
 'offer_type__x0_2',
 'bedrooms__untouched',
 'bathrooms__untouched',
 'bathrooms_bedrooms__ratio',
 'sqft__untouched',
 'h3_areas__hexagon_6',
 'h3_areas__hexagon_7',
 'h3_areas__hexagon_8',
 'h3_areas__hexagon_9',
 'h3_ppa__hexagon_6_ppa',
 'h3_ppa__hexagon_7_ppa',
 'h3_ppa__hexagon_8_ppa',
 'h3_ppa__hexagon_9_ppa',
 'h3_nearby__landmark_hexagon_6',
 'h3_nearby__metro_hexagon_6',
 'h3_nearby__tram_hexagon_6',
 'h3_nearby__landmark_hexagon_7',
 'h3_nearby__metro_hexagon_7',
 'h3_nearby__tram_hexagon_7',
 'h3_nearby__landmark_hexagon_8',
 'h3_nearby__metro_hexagon_8',
 'h3_nearby__tram_hexagon_8',
 'h3_nearby__landmark_hexagon_9',
 'h3_nearby__metro_hexagon_9',
 'h3_nearby__tram_hexagon_9']

#### Example of features

In [17]:
pd.DataFrame(pipeline.transform(listings), columns=pipeline.get_feature_names())

Unnamed: 0,cheques__untouched,timing__days,timing__quarter,timing__month,offer_type__x0_1,offer_type__x0_2,bedrooms__untouched,bathrooms__untouched,bathrooms_bedrooms__ratio,sqft__untouched,...,h3_nearby__tram_hexagon_6,h3_nearby__landmark_hexagon_7,h3_nearby__metro_hexagon_7,h3_nearby__tram_hexagon_7,h3_nearby__landmark_hexagon_8,h3_nearby__metro_hexagon_8,h3_nearby__tram_hexagon_8,h3_nearby__landmark_hexagon_9,h3_nearby__metro_hexagon_9,h3_nearby__tram_hexagon_9
0,0.0,8.0,4.0,10.0,1.0,0.0,3.0,4.0,1.333333,1362.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,106.0,4.0,10.0,1.0,0.0,7.0,7.0,1.000000,16000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,2.0,4.0,10.0,1.0,0.0,6.0,6.0,1.000000,3600.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.0,45.0,4.0,10.0,0.0,1.0,2.0,3.0,1.500000,981.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,23.0,4.0,10.0,1.0,0.0,2.0,3.0,1.500000,860.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0.0,8.0,4.0,10.0,1.0,0.0,6.0,5.0,0.833333,4909.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59996,0.0,23.0,4.0,10.0,0.0,1.0,4.0,4.0,1.000000,1900.0,...,7.0,0.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,0.0
59997,0.0,20.0,4.0,10.0,0.0,1.0,2.0,2.0,1.000000,1603.0,...,7.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
59998,0.0,4.0,4.0,10.0,0.0,1.0,3.0,4.0,1.333333,1670.0,...,7.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Dump

In [18]:
joblib.dump(pipeline, 'core/preprocessing/pipeline.joblib', protocol=4)

['core/preprocessing/pipeline.joblib']

## What else could be done here?
Possible features:
- Address
- Decompositors: PCA, ICA, tSVD, GRP
- Distance in KM to important objects (metro etc). However in this case input space will grow
to O(nxm), where n - properties, m - important objects
- Distance (travel time) by car/foot/bike/public_transport to important objects.
Travel time can be obtained from public API's.
- It feels like there is something more to do with Timing (valid_from valid_to)
- More Hexagons...  

Possible improvements:
- Optimize pipeline time
- Refactor code in 'custom'