<a href="https://colab.research.google.com/github/jeremysb1/xgboost/blob/main/xgboost_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I am building a model to predict prices using a regression-based approach with XGBoost.

In [1]:
import pandas as pd
# Read the summary listings CSV from the notebook's /content/drive folder
# (presumably a Colab Google Drive mount — adjust the path when running elsewhere).
listings_csv_path = "/content/drive/MyDrive/XGBoost/listings.csv"
summary_listings = pd.read_csv(listings_csv_path)

In [2]:
# Show the dtype pandas inferred for every column of the listings table.
summary_listings.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group               float64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
license                            object
dtype: object

In [3]:
# Look at the first raw listing name to see which details it encodes.
summary_listings['name'].iat[0]

'Rental unit in Sumida · ★4.77 · 1 bedroom · 2 beds · 1 bath'

Extracting information from text descriptions:

In [4]:
import re

# Accommodation labels that are searched for (case-insensitively) in the
# listing name to derive the `type_of_accommodation` feature.
classification_list = [
    'aparthotel', 'barn', 'bed and breakfast', 'boutique hotel', 'bungalow',
    'cabin', 'camper/rv', 'chalet', 'condo', 'cottage',
    'earthen home',
    'farm stay',
    'guest suite', 'guesthouse',
    'home', 'hostel', 'hotel', 'houseboat', 'hut',
    'loft',
    'place to stay',
    'rental unit', 'resort', 'ryokan',
    'serviced apartment',
    'tiny home', 'townhouse', 'treehouse',
    'vacation home', 'villa',
]

# Parse the structured listing name (for example
# 'Rental unit in Sumida · ★4.77 · 1 bedroom · 2 beds · 1 bath') into
# separate feature columns. Every pattern is matched case-insensitively and
# `expand=False` makes each extraction return a plain Series.
listing_names = summary_listings['name']

# Escape each label so that any regex metacharacter in it is matched literally.
accommodation_pattern = (
    "(" + "|".join(re.escape(label) for label in classification_list) + ")"
)

summary_listings = summary_listings.assign(
    # First accommodation label found in the name; NaN when none matches.
    type_of_accommodation=listing_names.str.extract(
        accommodation_pattern, flags=re.IGNORECASE, expand=False),
    # Text between the word 'in' and the next ' ·' separator. The leading \b
    # stops 'in' from matching the tail of a word: without it
    # 'Cabin in Shibuya ·' was parsed as the area 'in Shibuya'.
    area_of_tokyo=listing_names.str.extract(
        r'\bin\s(.*?)\s·', flags=re.IGNORECASE, expand=False),
    # Decimal rating that follows the star; names without one stay NaN.
    score=(listing_names
           .str.extract(r'★(\d+\.\d+)', flags=re.IGNORECASE, expand=False)
           .astype(float)),
    # Counts default to 0 when the name does not mention them.
    number_of_bedrooms=(listing_names
                        .str.extract(r'(\d+)\s*bedrooms?',
                                     flags=re.IGNORECASE, expand=False)
                        .fillna(0)
                        .astype(int)),
    # \b after 'beds?' keeps '1 bedroom' from being counted as a bed.
    number_of_beds=(listing_names
                    .str.extract(r'(\d+)\s+beds?\b',
                                 flags=re.IGNORECASE, expand=False)
                    .fillna(0)
                    .astype(int)),
    # The count may be fractional ('1.5 baths'). Capturing only the digits
    # next to 'bath' used to turn '1.5 baths' into 5, so the optional decimal
    # part is captured too and the column is stored as float.
    number_of_baths=(listing_names
                     .str.extract(
                         r'(\d+(?:\.\d+)?)\s*(?:shared\s+)?(?:half-)?baths?\b',
                         flags=re.IGNORECASE, expand=False)
                     .astype(float)
                     .fillna(0)),
)

In [5]:
import numpy as np
from datetime import datetime

# Binary (0/1) flags telling whether a keyword appears anywhere in the
# listing name. The flags are generated from one mapping instead of four
# copy-pasted chains; the match is case-insensitive via `case=False`, so no
# separate lower-casing step is needed.
listing_names = summary_listings['name']
keyword_flags = {
    'is_new': 'new',
    'is_studio': 'studio',
    'has_shared_bath': 'shared',
    'has_half_bath': 'half',
}

summary_listings = summary_listings.assign(**{
    # na=False: a missing name counts as "keyword absent" rather than
    # producing NaN, which would make .astype(int) raise.
    column: (listing_names
             .str.contains(keyword, case=False, na=False)
             .astype(int))
    for column, keyword in keyword_flags.items()
})

# Whole days between each listing's last review and the most recent review in
# the data set (so the freshest listing gets 0). Anchoring on the latest
# review date directly, rather than going through datetime.today() and then
# subtracting the minimum, keeps the feature independent of when the notebook
# is run. Listings without a last_review date get NaN.
last_review_dates = pd.to_datetime(summary_listings['last_review'])
summary_listings['days_since_last_review'] = (
    last_review_dates.max() - last_review_dates
).dt.days

# Replace number_of_reviews_ltm with its share of all reviews
# (number_of_reviews_ltm / number_of_reviews); listings with zero reviews get
# 0 instead of the undefined 0/0 result.
# NOTE: this overwrites the column in place, so re-running the cell without
# reloading the data divides the ratio by the review count again.
zero_reviews = summary_listings['number_of_reviews'].eq(0)
ratio = summary_listings['number_of_reviews_ltm'].div(
    summary_listings['number_of_reviews'])
summary_listings['number_of_reviews_ltm'] = ratio.mask(zero_reviews, 0)

Creating the target from daily listings:

In [6]:
# Daily calendar table, read from the same Drive folder as the listings.
calendar = pd.read_csv("/content/drive/MyDrive/XGBoost/calendar.csv")

# Prices arrive as text such as '$12,000.00': strip the currency symbol and
# the thousands separators, then convert to float. The vectorised .str
# accessor leaves missing prices as NaN, whereas a per-row Python lambda
# raises on them.
calendar["adjusted_price"] = (calendar["adjusted_price"]
                              .str.replace(r"[$,]", "", regex=True)
                              .astype(float))

# Mean, minimum and maximum adjusted price per listing, indexed by listing_id.
price_stats = (calendar.groupby('listing_id')['adjusted_price']
                       .agg(['mean', 'min', 'max']))


In [7]:
# Preview the first five rows of the per-listing price statistics.
price_stats.head(n=5)

Unnamed: 0_level_0,mean,min,max
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
197677,12000.0,12000.0,12000.0
776070,8902.0,8902.0,8902.0
905944,24680.221918,23740.0,29675.0
1016831,20378.082192,15000.0,25000.0
1196177,22127.671233,4500.0,30000.0


Creating a high cardinality geographical feature:

In [8]:
def bin_2_cat(feature, bins=32):
    """Discretise a numeric feature into `bins` equal-width integer codes.

    Parameters
    ----------
    feature : pandas.Series or numpy.ndarray
        Numeric values to bin.
    bins : int, default 32
        Number of equal-width bins spanning [feature.min(), feature.max()].

    Returns
    -------
    Same container type as `feature`, holding integer codes in 0 .. bins - 1.
    """
    min_value = feature.min()
    bin_size = (feature.max() - min_value) / bins
    if bin_size == 0:
        # Constant feature: there is no spread to divide, so every value
        # belongs to bin 0 (and the 0/0 division below is avoided).
        return (feature - min_value).astype(int)
    codes = ((feature - min_value) / bin_size).astype(int)
    # The maximum sits exactly on the upper edge and would get code `bins`,
    # creating an extra (bins + 1)-th category; fold it into the last bin.
    return np.minimum(codes, bins - 1)

# Combine the binned latitude and longitude into one grid-cell id: the
# latitude bin is multiplied by 1000 and the longitude bin is added to it.
latitude_bin = bin_2_cat(summary_listings['latitude'])
longitude_bin = bin_2_cat(summary_listings['longitude'])
summary_listings['coordinates'] = latitude_bin * 1000 + longitude_bin

# Number of distinct grid cells that actually contain listings.
print(summary_listings['coordinates'].nunique())

325


Creating a feature that compares the location of each Airbnb accommodation with the area of the Imperial Palace, because of its importance for real estate valuations.

In [9]:
# Reference point used for the Imperial Palace (latitude, longitude), compared
# against the listings' latitude/longitude columns.
imperial_palace_lat, imperial_palace_lon = 35.6841822633, 139.751471994

def degrees_to_meters(distance_degrees, latitude):
    """Convert a distance expressed in degrees into metres.

    The distance is multiplied by a fixed factor of 111000 (metres per
    degree, as used by this notebook) and by cos(latitude), with `latitude`
    given in degrees. Scalars and array-likes are both accepted; the
    arithmetic is element-wise.
    """
    meters_per_degree = 111000
    latitude_scale = np.cos(np.radians(latitude))
    return distance_degrees * meters_per_degree * latitude_scale

# Manhattan (north-south plus east-west) distance, in metres, from each
# listing to the Imperial Palace reference point.
lat_offset_degrees = np.abs(summary_listings['latitude'] - imperial_palace_lat)
lon_offset_degrees = np.abs(summary_listings['longitude'] - imperial_palace_lon)

# Only a degree of longitude shrinks with cos(latitude); a degree of latitude
# spans the same distance everywhere. Scaling the summed offsets by
# cos(latitude) therefore under-stated the north-south component, so each
# axis is converted on its own. Passing latitude 0 (cos = 1) applies just the
# metres-per-degree factor to the north-south offset.
summary_listings['imperial_palace_distance'] = (
    degrees_to_meters(lat_offset_degrees, 0.0)
    + degrees_to_meters(lon_offset_degrees, summary_listings['latitude'])
)