# Airbnb

Forecasting Airbnb rental prices from listing location, guest capacity and minimum-stay length.

## Library

In [1]:
from re import sub

from numpy import sqrt, mean

from pandas import read_csv, DataFrame

from matplotlib import pyplot

from sklearn import model_selection, metrics

from catboost import CatBoostRegressor, Pool

## Data

Data exploration, transformation and selection.

In [2]:
def _parse_price(value):
    """Convert a raw price string from the CSV into a float.

    Every character other than digits, '-' and '.' is stripped before the
    conversion. If nothing is left after stripping (for example a blank
    cell), NaN is returned instead of letting ``float('')`` raise
    ValueError, so the row can be treated as missing rather than aborting
    the whole load.
    """
    cleaned = sub(r'[^\d\-.]', '', value)
    return float(cleaned) if cleaned else float('nan')


# Load only the columns used for modelling; `price` is parsed while reading.
data = read_csv(
    './data/airbnb.csv',
    usecols=[
        'latitude',
        'longitude',
        'accommodates',
        'minimum_minimum_nights',
        'price',
    ],
    converters={
        'price': _parse_price,
    },
)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,latitude,longitude,accommodates,price,minimum_minimum_nights
0,-22.96599,-43.1794,5,350.0,5
1,-22.98405,-43.20189,2,296.0,3
2,-22.97735,-43.19105,3,387.0,2
3,-22.98839,-43.19232,2,172.0,2
4,-22.98107,-43.19136,2,260.0,3


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,latitude,longitude,accommodates,price,minimum_minimum_nights
count,24549.0,24549.0,24549.0,24549.0,24549.0
mean,-22.966484,-43.247681,4.175445,1029.592285,4.933643
std,0.035107,0.096663,2.488227,5403.742297,20.130429
min,-23.07267,-43.69871,0.0,0.0,1.0
25%,-22.98473,-43.29998,2.0,250.0,2.0
50%,-22.97239,-43.19579,4.0,479.0,2.0
75%,-22.95328,-43.18598,5.0,900.0,4.0
max,-22.74969,-43.1044,16.0,650476.0,1000.0


In [5]:
# Discard, in place, every row that has a missing value in any column.
data.dropna(axis='index', how='any', inplace=True)

In [6]:
# Feature matrix: every loaded column except the target.
feature_columns = [
    'latitude',
    'longitude',
    'accommodates',
    'minimum_minimum_nights',
]
train = data[feature_columns]

# Regression target.
labels = data['price']

In [7]:
# Hold out 25% of the rows for evaluation; the remaining 75% are used for
# training. The fixed seed makes the split reproducible across runs, which
# an unseeded shuffle is not.
x, x_test, y, y_test = model_selection.train_test_split(
    train,
    labels,
    test_size=0.25,
    random_state=42,
)

## Model

CatBoost regression model tuned with a randomized hyperparameter search.

In [8]:
# Gradient-boosted regressor; training is requested on the GPU and
# per-iteration logging is switched off.
model = CatBoostRegressor(verbose=False, task_type='GPU')

In [None]:
%%capture

# Candidate values sampled by the randomized search.
grid = dict(
    learning_rate=[0.001, 0.04, 0.1, 0.4],
    depth=[4, 6, 10, 14],
    l2_leaf_reg=[1, 3, 5, 7],
)

# Search the grid on the training split, with logging and plotting disabled.
model.randomized_search(grid, X=x, y=y, plot=False, verbose=False)

bestTest = 3615.644635
bestIteration = 46
bestTest = 3640.953472
bestIteration = 38
bestTest = 3673.441804
bestIteration = 999
bestTest = 3620.98415
bestIteration = 69
bestTest = 3603.697009
bestIteration = 7
bestTest = 3627.915732
bestIteration = 7
bestTest = 3614.059737
bestIteration = 93
bestTest = 3615.78351
bestIteration = 46
bestTest = 3668.456053
bestIteration = 999


## Metric

In [None]:
# Predict prices for the held-out rows.
predictions = model.predict(data=x_test)

In [None]:
# Pair each importance score with its feature name, most important first.
importance_table = DataFrame({
    'feature_importance': model.get_feature_importance(),
    'feature_names': x.columns,
})
importance_table.sort_values(by='feature_importance', ascending=False)

In [None]:
# Wrap the training split in a CatBoost Pool and keep only the rows at
# positions 1 and 4, which are the two objects compared below.
train_pool = Pool(data=x, label=y)
train_pool_slice = train_pool.slice([1, 4])

# Per-feature 'PredictionDiff' importance for the two selected rows,
# returned as a table because `prettified=True`.
prediction_diff = model.get_feature_importance(
    data=train_pool_slice,
    type='PredictionDiff',
    verbose=False,
    prettified=True,
)

# Plot predictions for the same two rows, varying the features listed in
# the "Feature Id" column of the table above.
model.plot_predictions(
    data=train_pool_slice,
    features_to_change=prediction_diff["Feature Id"],
    plot=True,
)

In [None]:
# Root-mean-squared error between the held-out targets and the predictions.
rmse = sqrt(metrics.mean_squared_error(y_test, predictions))

# Lower values mean the predictions sit closer to the true targets. `rmse`
# is already a single number, so it is printed directly rather than being
# averaged again.
print("RMSE in Y units:", rmse)