In [1]:
from warnings import simplefilter

import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

from sklearn_helpers import (
    ResultContainer,
    fit_models,
    get_column_transformer,
    get_models,
    get_preprocessor,
    show_coefficients,
)

# Silence FutureWarning messages so they do not clutter the cell outputs.
simplefilter(action="ignore", category=FutureWarning)

# Use fully qualified option names: the abbreviated pattern "precision" also
# matches "styler.format.precision" in pandas >= 1.4 and then raises
# OptionError ("Pattern matched multiple keys").
pd.set_option("display.precision", 3)
pd.set_option("display.max_columns", 100)


In [2]:
# NOTE: For experimentation the models are trained on the entire data set,
# i.e. without splitting it into a training and a test set.
# read_pickle can execute arbitrary code, so only load files from a trusted source.
listings_extended = pd.read_pickle("../data-clean/listings_extended.pkl")

# Target is the "price" column; every remaining column is a feature.
y = listings_extended["price"]
X = listings_extended.drop(columns=["price"])

X.shape

(2601, 39)

In [3]:
# BOOKMARK: Hyperparameters
# All three values are forwarded to get_models / fit_models in the cells below.
log_y = True  # presumably: model log(price) instead of price -- TODO confirm in sklearn_helpers
n_folds = 10  # presumably the number of cross-validation folds -- TODO confirm
random_state = 42

# listings_extended has 113 encoded features in total.
# Fitting with all 113 features leads to an error when evaluating the metrics,
# so only smaller feature counts are tried.
num_features_list = [10, 25, 50, 75]


In [4]:
# Shared column transformer from sklearn_helpers; it is reused by every
# preprocessor built in the cells below (with and without RFE).
column_transformer = get_column_transformer()


In [5]:
# SUBSECTION: Analyze Performance for different values of num_features
# Fits the "linear" model once per entry of num_features_list and stacks the
# per-setting result tables into collected_results.
result_list = []
for num_features in num_features_list:
    # Start from the plain column transformer; feature selection is only
    # added when a feature count is given (None means "no RFE").
    preprocessor = column_transformer
    if num_features is not None:
        # A float step in (0, 1) is the fraction of features eliminated per
        # RFE iteration (see the scikit-learn RFE documentation).
        rfe = RFE(
            estimator=SVR(kernel="linear"),
            n_features_to_select=num_features,
            step=0.5,
        )
        preprocessor = get_preprocessor(column_transformer, rfe)

    models = get_models(
        preprocessor,
        models=["linear"],
        random_state=random_state,
        log_y=log_y,
    )

    # A new container is created for every setting.
    result_container = ResultContainer()
    result = fit_models(
        X, y, models, result_container, n_folds,
        random_state=random_state, log_y=log_y,
    )
    result_list.append(result.display_df())

collected_results = pd.concat(result_list)


Fitting LinearRegression
Finished training in 98.04 seconds
Fitting LinearRegression
Finished training in 47.95 seconds
Fitting LinearRegression
Finished training in 52.08 seconds
Fitting LinearRegression
Finished training in 30.44 seconds


In [6]:
# Rank the settings by validation MAE (ascending, i.e. best setting first).
collected_results.sort_values(by="mae_val")


Unnamed: 0,mae_train,mae_val,r2_train,r2_val,mse_train,mse_val,hyperparam_keys,hyperparam_values,num_features,feature_selector,log_y
LinearRegression,463.118,480.451,0.221,0.128,2666000.0,2877000.0,,,50,RFE,True
LinearRegression,462.224,482.792,0.213,0.073,2694000.0,2949000.0,,,75,RFE,True
LinearRegression,473.546,487.056,0.219,0.122,2672000.0,2864000.0,,,25,RFE,True
LinearRegression,485.634,493.983,0.215,0.188,2686000.0,2808000.0,,,10,RFE,True


In [10]:
# SUBSECTION: Analyze Coefficients for different values of num_features
# Only a single setting (100 selected features) is fitted here, on the full data.
num_features = 100
model = LinearRegression()
rfe = RFE(
    estimator=SVR(kernel="linear"),
    n_features_to_select=num_features,
    step=0.5,
)
preprocessor = get_preprocessor(column_transformer, rfe)
pipeline = make_pipeline(preprocessor, model)

# The pipeline is fitted on log(y); predictions are mapped back with exp.
log_transform = TransformedTargetRegressor(
    regressor=pipeline,
    func=np.log,
    inverse_func=np.exp,
)
log_transform.fit(X, y)

# NOTE(review): the stored output shows coefficients of order 1e12 that are
# identical across all levels of a one-hot encoded column (e.g. room_type_*).
# This looks like perfectly collinear dummy columns -- confirm before
# interpreting individual coefficients.
coefs = show_coefficients(log_transform)


Unnamed: 0,feature,coefficient
0,host_total_listings_count,5.075e+12
1,neighbourhood_Frogner,1.856e+12
2,room_type_Shared room,9.399e+11
3,room_type_Entire home/apt,9.399e+11
4,room_type_Private room,9.399e+11
...,...,...
95,neighbourhood_Grünerløkka,-4.009e+11
96,host_identity_verified_t,-5.966e+11
97,host_identity_verified_f,-5.966e+11
98,neighbourhood_cleansed_Frogner,-1.856e+12


In [13]:
# Coefficient assigned to the CNN-based price predictions.
# The original assessment was that these predictions appear completely useless.
# NOTE(review): a small coefficient alone does not show that; its size depends
# on feature scaling and on the other fitted coefficients -- confirm with a
# validation metric.
coefs[coefs["feature"].eq("cnn_predictions")]

Unnamed: 0,feature,coefficient
47,cnn_predictions,0.01


In [8]:
# Frequency of each property type, most common first.
X.loc[:, "property_type"].value_counts()


Entire rental unit                     1210
Entire condominium (condo)              567
Private room in rental unit             349
Entire residential home                 111
Private room in condominium (condo)      89
Entire townhouse                         58
Entire loft                              52
Private room in residential home         34
Entire villa                             21
Private room in loft                     15
Shared room in rental unit               14
Entire serviced apartment                12
Private room                             10
Entire guest suite                        7
Entire guesthouse                         6
Private room in villa                     6
Private room in townhouse                 6
Shared room in condominium (condo)        6
Private room in bed and breakfast         5
Entire cabin                              4
Tiny house                                4
Camper/RV                                 3
Private room in boat            

In [9]:
# Show all rows whose property type is exactly "Houseboat".
listings_extended[listings_extended["property_type"].eq("Houseboat")]

Unnamed: 0_level_0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,number_of_reviews_ltm,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,property_type,accommodates,bedrooms,beds,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,number_of_reviews_l30d,instant_bookable,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,number_bathrooms,shared_bathrooms,host_gender,number_amenities,number_front_page_pictures,cnn_predictions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
24616982,Frogner,Entire home/apt,30000,2,0,18,180,0,f,20,20,t,t,Frogner,Houseboat,8,4,6,5,2,2,5,5,2.0,5.0,t,30,60,90,0,f,18,0,0,5.0,False,male,16,5,981.03
