In [27]:
import pandas as pd
import numpy as np
from model.preprocess import clean_and_engineer, split_xy

In [28]:
df = pd.read_csv("../data/listings_dec18.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36662 entries, 0 to 36661
Data columns (total 96 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                36662 non-null  int64  
 1   listing_url                       36662 non-null  object 
 2   scrape_id                         36662 non-null  int64  
 3   last_scraped                      36662 non-null  object 
 4   name                              36650 non-null  object 
 5   summary                           35641 non-null  object 
 6   space                             25052 non-null  object 
 7   description                       36115 non-null  object 
 8   experiences_offered               36662 non-null  object 
 9   neighborhood_overview             21617 non-null  object 
 10  notes                             14907 non-null  object 
 11  transit                           22096 non-null  object 
 12  acce

  df = pd.read_csv("../data/listings_dec18.csv")


In [29]:
cleaned = clean_and_engineer(df)
X, y = split_xy(cleaned)
X.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,minimum_nights,availability_365,number_of_reviews,review_scores_rating,reviews_per_month,room_type,property_type,neighbourhood_cleansed,cancellation_policy,instant_bookable
0,2,1.0,1.0,1.0,2,187,493,95.0,4.83,Private room,Townhouse,Sydney,strict_14_with_grace_period,f
1,6,3.0,3.0,3.0,5,321,1,100.0,0.03,Entire home/apt,House,Manly,strict_14_with_grace_period,f
2,2,1.0,1.0,1.0,2,316,300,88.0,3.63,Private room,Apartment,Sydney,strict_14_with_grace_period,t
3,8,2.0,4.0,4.0,7,69,15,96.0,0.18,Entire home/apt,House,Leichhardt,strict_14_with_grace_period,f
4,2,1.0,1.0,1.0,1,140,42,94.0,0.45,Private room,Apartment,Woollahra,moderate,f


In [30]:
y.head()

0    4.615121
1    6.156979
2    4.700480
3    6.111467
4    4.143135
Name: log_price, dtype: float64

In [32]:
X.columns.to_list()

['accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'minimum_nights',
 'availability_365',
 'number_of_reviews',
 'review_scores_rating',
 'reviews_per_month',
 'room_type',
 'property_type',
 'neighbourhood_cleansed',
 'cancellation_policy',
 'instant_bookable']

In [36]:
df["property_type"].value_counts().shape[0]

40

In [37]:
df["neighbourhood_cleansed"].nunique()

38

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

from model.preprocess import NUM_FEATURES, CAT_FEATURES

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", NUM_FEATURES),
        ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES),
    ]
)

In [43]:
model = RandomForestRegressor(
    n_estimators=400,
    max_depth=20,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ]
)

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE (log scale): {rmse:.4f}")

RMSE (log scale): 0.4246




In [46]:
from xgboost import XGBRegressor


model = XGBRegressor(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.1,
    random_state=42,
    tree_method="hist",
    n_jobs=-1
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ]
)

pipeline.fit(X_train, y_train)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

print("CV RMSE:", -scores.mean())
preds = pipeline.predict(X_test)
rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE (log scale): {rmse:.4f}")

CV RMSE: 0.41241986465882385
RMSE (log scale): 0.4065




In [47]:
type(X)

pandas.core.frame.DataFrame

In [48]:
type(y)

pandas.core.series.Series