In [1]:
from warnings import simplefilter

import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

from sklearn_helpers import (
    ResultContainer,
    fit_models,
    get_column_transformer,
    get_models,
    get_preprocessor,
    show_coefficients,
)

# Suppress FutureWarning messages so they do not clutter the cell outputs.
simplefilter(action="ignore", category=FutureWarning)

# Show three decimal places in rendered DataFrames.
# The fully qualified key is required: the abbreviated "precision" pattern also
# matches "styler.format.precision" on newer pandas releases, which makes
# set_option raise an OptionError because the key is ambiguous.
pd.set_option("display.precision", 3)


In [2]:
# NOTE: This notebook is for experimentation, so models are trained on the entire
# data set; it is not split into a training and a test set here.
# (Pickle files can execute code when loaded - only read files from a trusted source.)
listings_extended = pd.read_pickle("../data-clean/listings_extended.pkl")

# Separate the target column ("price") from the remaining columns used as features.
y = listings_extended["price"]
X = listings_extended.drop(columns=["price"])


In [3]:
# BOOKMARK: Hyperparameters
# Whether the target is log-transformed; forwarded to get_models and fit_models.
log_y: bool = True
# Number of folds handed to fit_models.
n_folds: int = 10
# Seed forwarded to get_models and fit_models.
random_state: int = 42

# Candidate values for n_features_to_select in RFE.
# 131 total encoded features in listings_extended
num_features_list = [10, 20, 50, 100, 131]


In [4]:
# Build the column transformer once; the same object is passed to get_preprocessor
# in the performance loop and in the coefficient analysis further down.
column_transformer = get_column_transformer()


In [5]:
# SUBSECTION: Analyze Performance for different values of num_features
# One results table is collected per candidate feature count and the tables are
# concatenated after the loop.
# NOTE: in the recorded run each setting took roughly one to two and a half minutes.
result_list = []
for num_features in num_features_list:
    # Recursive feature elimination driven by a linear-kernel SVR.
    rfe = RFE(
        estimator=SVR(kernel="linear"),
        n_features_to_select=num_features,
        step=0.5,
    )
    preprocessor = get_preprocessor(column_transformer, rfe)
    models = get_models(
        preprocessor,
        models=["linear"],
        random_state=random_state,
        log_y=log_y,
    )

    # A fresh ResultContainer is created for every setting.
    result_container = ResultContainer()
    result = fit_models(
        X, y, models, result_container, n_folds, random_state=random_state, log_y=log_y
    )
    result_list += [result.display_df()]

collected_results = pd.concat(result_list)


Fitting LinearRegression
Finished training in 121.21 seconds
Fitting LinearRegression
Finished training in 128.92 seconds
Fitting LinearRegression
Finished training in 147.87 seconds
Fitting LinearRegression
Finished training in 112.93 seconds
Fitting LinearRegression


  return func(X, **(kw_args if kw_args else {}))
Traceback (most recent call last):
  File "C:\Users\admin\miniconda3\envs\airbnb\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\admin\miniconda3\envs\airbnb\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\admin\miniconda3\envs\airbnb\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\admin\miniconda3\envs\airbnb\lib\site-packages\sklearn\metrics\_regression.py", line 789, in r2_score
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "C:\Users\admin\miniconda3\envs\airbnb\lib\site-packages\sklearn\metrics\_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  Fi

Finished training in 72.54 seconds


In [6]:
# Display the concatenated per-setting results from the loop above as one table.
collected_results


Unnamed: 0,mae_train,mae_val,r2_train,r2_val,mse_train,mse_val,hyperparam_keys,hyperparam_values,num_features,feature_selector,log_y
LinearRegression,472.063,476.466,0.215,0.225,2453000.0,2529000.0,,,10,RFE,True
LinearRegression,462.616,470.317,0.233,0.301,2397000.0,2412000.0,,,20,RFE,True
LinearRegression,446.691,458.079,0.25,0.31,2343000.0,2396000.0,,,50,RFE,True
LinearRegression,442.66,465.907,0.27,0.236,2282000.0,2514000.0,,,100,RFE,True
LinearRegression,431.583,,0.32,,2124000.0,,,,131,RFE,True


In [7]:
# SUBSECTION: Analyze Coefficients for different values of num_features
# Only one setting is fitted per run of this cell; change num_features to inspect
# a different one.
num_features = 20
rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=num_features, step=0.5)
preprocessor = get_preprocessor(column_transformer, rfe)
model = LinearRegression()
pipeline = make_pipeline(preprocessor, model)

# The pipeline is fitted on log(y) and predictions are mapped back with exp.
# fit() returns the fitted estimator itself, so the call can be chained.
log_transform = TransformedTargetRegressor(
    regressor=pipeline, func=np.log, inverse_func=np.exp
).fit(X, y)
show_coefficients(log_transform)


Unnamed: 0,feature,coefficient
0,property_type_Houseboat,3.335
1,bathrooms_text_4.5 baths,2.022
2,bathrooms_text_10 baths,1.748
3,property_type_Private room in boat,1.031
4,property_type_Private room in guest suite,0.976
5,maximum_nights_avg_ntm,0.9
6,bathrooms_text_2.5 baths,0.809
7,property_type_Entire villa,0.759
8,property_type_Entire chalet,0.743
9,room_type_Entire home/apt,0.637
