In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [39]:
df = pd.read_csv('listings_summary.csv')

# checking shape
print("The dataset has {} rows and {} columns.".format(*df.shape))

# ... and duplicates
print("It contains {} duplicates.".format(df.duplicated().sum()))

The dataset has 22552 rows and 96 columns.
It contains 0 duplicates.


In [40]:
columns_to_keep = ['neighbourhood_group_cleansed', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities',
                   'property_type', 'room_type', 'price', 'cleaning_fee', 'extra_people', 'security_deposit']

df = df[columns_to_keep]
print("The dataset has {} rows and {} columns - after dropping irrelevant columns.".format(*df.shape))

The dataset has 22552 rows and 12 columns - after dropping irrelevant columns.


In [41]:
df.dtypes

neighbourhood_group_cleansed     object
bathrooms                       float64
bedrooms                        float64
beds                            float64
bed_type                         object
amenities                        object
property_type                    object
room_type                        object
price                            object
cleaning_fee                     object
extra_people                     object
security_deposit                 object
dtype: object

In [42]:
df.isnull().sum()

neighbourhood_group_cleansed       0
bathrooms                         32
bedrooms                          18
beds                              40
bed_type                           0
amenities                          0
property_type                      0
room_type                          0
price                              0
cleaning_fee                    7146
extra_people                       0
security_deposit                9361
dtype: int64

In [43]:
# For cleaning fee and security deposit, replace nulls with 0s.
df.cleaning_fee.fillna('$0.00', inplace=True)
df.security_deposit.fillna('$0.00', inplace=True)

In [44]:
# drop the ~40 or so listings which don't include key info (bathrooms, bedrooms, beds)
df = df.dropna()

In [45]:
df['neighbourhood_group_cleansed'].value_counts().sort_values()

Spandau                      122
Marzahn - Hellersdorf        141
Reinickendorf                245
Steglitz - Zehlendorf        436
Treptow - Köpenick           593
Lichtenberg                  677
Tempelhof - Schöneberg      1554
Charlottenburg-Wilm.        1581
Neukölln                    3486
Pankow                      3532
Mitte                       4620
Friedrichshain-Kreuzberg    5485
Name: neighbourhood_group_cleansed, dtype: int64

In [46]:
from collections import Counter

In [47]:
results = Counter()
df['amenities'].str.strip('{}')\
               .str.replace('"', '')\
               .str.lstrip('\"')\
               .str.rstrip('\"')\
               .str.split(',')\
               .apply(results.update)

results.most_common(30)

[('Wifi', 21442),
 ('Kitchen', 21276),
 ('Heating', 21129),
 ('Essentials', 20275),
 ('Washer', 18449),
 ('Hair dryer', 14855),
 ('Laptop friendly workspace', 14422),
 ('Hangers', 13881),
 ('Iron', 11184),
 ('Shampoo', 10631),
 ('TV', 10092),
 ('Hot water', 9717),
 ('Family/kid friendly', 8120),
 ('Internet', 7889),
 ('Host greets you', 6656),
 ('Smoke detector', 6431),
 ('Buzzer/wireless intercom', 6091),
 ('Lock on bedroom door', 5283),
 ('Refrigerator', 5200),
 ('Free street parking', 5182),
 ('Dishes and silverware', 5070),
 ('Elevator', 4862),
 ('Bed linens', 4838),
 ('Cooking basics', 4827),
 ('Stove', 4804),
 ('Smoking allowed', 4578),
 ('Oven', 4306),
 ('translation missing: en.hosting_amenity_50', 4176),
 ('First aid kit', 3990),
 ('Cable TV', 3699)]

In [48]:
# Performing the manual equivalent of one-hot encoding for the top 20 most popular amenities.
df['Wifi'] = df['amenities'].str.contains('Wifi')
df['Kitchen'] = df['amenities'].str.contains('Kitchen')
df['Heating'] = df['amenities'].str.contains('Heating')
df['Essentials'] = df['amenities'].str.contains('Essentials')
df['Hair dryer'] = df['amenities'].str.contains('Hair dryer')
df['Laptop friendly workspace'] = df['amenities'].str.contains('Laptop friendly workspace')
df['Hangers'] = df['amenities'].str.contains('Hangers')
df['Iron'] = df['amenities'].str.contains('Iron')
df['Shampoo'] = df['amenities'].str.contains('Shampoo')
df['TV'] = df['amenities'].str.contains('TV')
df['Hot water'] = df['amenities'].str.contains('Hot water')
df['Internet'] = df['amenities'].str.contains('Internet')
df['Host greets you'] = df['amenities'].str.contains('Host greets you')
df['Smoke detector'] = df['amenities'].str.contains('Smoke detector')
df['Buzzer/wireless intercom'] = df['amenities'].str.contains('Buzzer/wireless intercom')
df['Lock on bedroom door'] = df['amenities'].str.contains('Lock on bedroom door')
df['Buzzer/wireless intercom'] = df['amenities'].str.contains('Buzzer/wireless intercom')
df['Refrigerator'] = df['amenities'].str.contains('Refrigerator')
df['Free street parking'] = df['amenities'].str.contains('Free street parking')
df['Dishes and silverware'] = df['amenities'].str.contains('Dishes and silverware')

In [49]:
df.dtypes

neighbourhood_group_cleansed     object
bathrooms                       float64
bedrooms                        float64
beds                            float64
bed_type                         object
amenities                        object
property_type                    object
room_type                        object
price                            object
cleaning_fee                     object
extra_people                     object
security_deposit                 object
Wifi                               bool
Kitchen                            bool
Heating                            bool
Essentials                         bool
Hair dryer                         bool
Laptop friendly workspace          bool
Hangers                            bool
Iron                               bool
Shampoo                            bool
TV                                 bool
Hot water                          bool
Internet                           bool
Host greets you                    bool


In [53]:
df = df.drop(columns=['amenities'])

In [50]:
# removing the dollar sign and comma, changing the type to float for price, cleaning_fee, and security_deposit
df.price = df.price.str.replace('$', '').str.replace(',', '').astype(float)
df.cleaning_fee = df.cleaning_fee.str.replace('$', '').str.replace(',', '').astype(float)
df.security_deposit = df.security_deposit.str.replace('$', '').str.replace(',', '').astype(float)
df.extra_people = df.extra_people.str.replace('$', '').str.replace(',', '').astype(float)

In [54]:
from sklearn.model_selection import train_test_split

# Split train into train & val
train, val = train_test_split(df, train_size=0.80, test_size=0.20, 
                              random_state=42)

In [55]:
target = 'price'
X_train = train.drop(columns=target)
y_train = train[target]
X_val = train.drop(columns=target)
y_val = train[target]

In [56]:
import category_encoders as ce
# from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [62]:
pipeline = make_pipeline(
    ce.OneHotEncoder(),
    RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
)

# Fit on train, score on val
pipeline.fit(X_train, y_train)
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.9172502179602054
