In [1]:
import pandas as pd
import numpy as np
import datetime
import ast
from sklearn.preprocessing import  MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error

In [149]:
calendar = pd.read_csv('calendar.csv')
reviews = pd.read_csv('reviews.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [150]:
data = train

In [151]:
data = data.drop(['summary', 'space', 'description', 'experiences_offered', 'host_since', 
                  'neighborhood_overview', 'notes', 'transit', 'access', 'price', 'cancellation_policy',
                  'interaction', 'house_rules', 'host_id', 'host_about', 'square_feet', 
                  'host_response_time', 'neighbourhood_cleansed', 'zipcode'], axis = 1)
Y = train.price

In [152]:
data.host_response_rate = data.host_response_rate.str.rstrip('%').astype(float)
data.host_response_rate = data.host_response_rate.fillna(50)

In [153]:
d = {'t': 1, 'f': 0}

In [154]:
data.host_is_superhost = data.host_is_superhost.map(d).fillna(0)
data.require_guest_phone_verification = data.require_guest_phone_verification.map(d).fillna(0)
data.require_guest_profile_picture = data.require_guest_profile_picture.map(d).fillna(0)
data.host_has_profile_pic = data.host_has_profile_pic.map(d).fillna(0)
data.host_identity_verified = data.host_identity_verified.map(d).fillna(0)
data.is_location_exact = data.is_location_exact.map(d).fillna(0)

In [155]:
property_type_encoder = OneHotEncoder(sparse=False).fit(train.append(test).property_type.values.reshape(-1, 1))
new_feature = property_type_encoder.transform(data.property_type.values.reshape(-1, 1))
data = data.drop(['property_type'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['property_type'+str(i) for i in range(new_feature.shape[1])])
data = pd.concat([data,tmp], axis = 1)

In [156]:
room_type_encoder = OneHotEncoder(sparse=False).fit(train.append(test).room_type.values.reshape(-1, 1))
new_feature = room_type_encoder.transform(data.room_type.values.reshape(-1, 1))
data = data.drop(['room_type'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['room_type'+str(i) for i in range(new_feature.shape[1])])
data = pd.concat([data,tmp], axis = 1)

In [157]:
bed_type_encoder = OneHotEncoder(sparse=False).fit(train.append(test).bed_type.values.reshape(-1, 1))
new_feature = bed_type_encoder.transform(data.bed_type.values.reshape(-1, 1))
data = data.drop(['bed_type'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['bed_type'+str(i) for i in range(new_feature.shape[1])])
data = pd.concat([data,tmp], axis = 1)

In [158]:
data.bathrooms = data.bathrooms.fillna(0)
data.bedrooms = data.bedrooms.fillna(0)
data.beds = data.beds.fillna(0)

In [159]:
data.amenities = data.amenities.replace('[{"}]', '', regex=True)
data.amenities = data.amenities.apply(lambda x: x[:].split(','))

In [160]:
amenities_encoder = MultiLabelBinarizer().fit(train.append(test).amenities)
new_feature = amenities_encoder.transform(data.amenities)
data = data.drop(['amenities'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['amenities'+str(i) for i in range(new_feature.shape[1])])
data = pd.concat([data,tmp], axis = 1)

  .format(sorted(unknown, key=str)))


In [161]:
data.security_deposit = data.security_deposit.fillna(0)
data.cleaning_fee = data.cleaning_fee.fillna(0)

In [162]:
test.name = test.name.fillna('')
train.name = train.name.fillna('')
name_encoder = TfidfVectorizer(stop_words={'english'}).fit(train.append(test))
data.name = data.name.fillna('')
name_feature = name_encoder.transform(data.name)
truncater = TruncatedSVD(n_components=30).fit(name_feature)
name_feature = truncater.transform(name_feature)
data = data.drop(['name'], axis = 1)
tmp = pd.DataFrame(name_feature, columns=['name'+str(i) for i in range(name_feature.shape[1])])
data = pd.concat([data,tmp], axis = 1)

In [163]:
days = calendar
days.available = days.available.map(d)

In [164]:
days_per_id = days.groupby(by = ['listing_id'])['available'].agg(['sum'])
days_per_id.reset_index(drop = False, inplace = True)
days_per_id = days_per_id.rename(columns={"listing_id":"id", "sum":"days"})

In [165]:
data = data.merge(days_per_id, on='id')

In [166]:
data = data.drop(['id'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(data, Y, test_size = 0.2, random_state = 42)

In [167]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    y_true = np.log(np.add(y_true,1))
    y_pred = np.log(np.add(y_pred,1))
    return mean_absolute_error(y_true, y_pred)

In [183]:
for lr in np.arange(0.05, 0.055, 0.01):
    for depth in range(20,21,1):
      for estims in range(50, 75, 10):
        model_xgb = XGBRegressor(learning_rate=lr, max_depth=depth, n_estimators=estims, nthread=-1,silent=True)
        model_xgb.fit(x_train, y_train)
        y_pred = model_xgb.predict(x_test)
        y_pred = np.where(y_pred<0, 0, y_pred)
        print(mean_absolute_percentage_error(y_test, y_pred), ' ', depth, ' ',estims, ' ', lr)

0.27566049715876884   20   50   0.05
0.27428957446315166   20   60   0.05
0.274888470601494   20   70   0.05


In [168]:
model = XGBRegressor(learning_rate=0.05, max_depth=20, n_estimators=60, nthread=-1,silent=True)
model.fit(data, Y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.05, max_delta_step=0,
             max_depth=20, min_child_weight=1, missing=None, n_estimators=60,
             n_jobs=1, nthread=-1, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1, verbosity=1)

In [169]:
data_test = test

In [170]:
data_test = data_test.drop(['summary', 'space', 'description', 'experiences_offered', 'host_since', 
                  'neighborhood_overview', 'notes', 'transit', 'access', 'cancellation_policy',
                  'interaction', 'house_rules', 'host_id', 'host_about', 'square_feet', 
                  'host_response_time', 'neighbourhood_cleansed', 'zipcode'], axis = 1)
data_test.host_response_rate = data_test.host_response_rate.str.rstrip('%').astype(float)
data_test.host_response_rate = data_test.host_response_rate.fillna(50)
data_test.host_is_superhost = data_test.host_is_superhost.map(d).fillna(0)
data_test.require_guest_phone_verification = data_test.require_guest_phone_verification.map(d).fillna(0)
data_test.require_guest_profile_picture = data_test.require_guest_profile_picture.map(d).fillna(0)
data_test.host_has_profile_pic = data_test.host_has_profile_pic.map(d).fillna(0)
data_test.host_identity_verified = data_test.host_identity_verified.map(d).fillna(0)
data_test.is_location_exact = data_test.is_location_exact.map(d).fillna(0)

new_feature = property_type_encoder.transform(data_test.property_type.values.reshape(-1, 1))
data_test = data_test.drop(['property_type'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['property_type'+str(i) for i in range(new_feature.shape[1])])
data_test = pd.concat([data_test,tmp], axis = 1)

new_feature = room_type_encoder.transform(data_test.room_type.values.reshape(-1, 1))
data_test = data_test.drop(['room_type'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['room_type'+str(i) for i in range(new_feature.shape[1])])
data_test = pd.concat([data_test,tmp], axis = 1)

new_feature = bed_type_encoder.transform(data_test.bed_type.values.reshape(-1, 1))
data_test = data_test.drop(['bed_type'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['bed_type'+str(i) for i in range(new_feature.shape[1])])
data_test = pd.concat([data_test,tmp], axis = 1)

data_test.bathrooms = data_test.bathrooms.fillna(0)
data_test.bedrooms = data_test.bedrooms.fillna(0)
data_test.beds = data_test.beds.fillna(0)

data_test.amenities = data_test.amenities.replace('[{"}]', '', regex=True)
data_test.amenities = data_test.amenities.apply(lambda x: x[:].split(','))

new_feature = amenities_encoder.transform(data_test.amenities)
data_test = data_test.drop(['amenities'], axis = 1)
tmp = pd.DataFrame(new_feature, columns=['amenities'+str(i) for i in range(new_feature.shape[1])])
data_test = pd.concat([data_test,tmp], axis = 1)

data_test.security_deposit = data_test.security_deposit.fillna(0)
data_test.cleaning_fee = data_test.cleaning_fee.fillna(0)

data_test.name = data_test.name.fillna('')
name_feature = name_encoder.transform(data_test.name)
name_feature = truncater.transform(name_feature)
data_test = data_test.drop(['name'], axis = 1)
tmp = pd.DataFrame(name_feature, columns=['name'+str(i) for i in range(name_feature.shape[1])])
data_test = pd.concat([data_test,tmp], axis = 1)

data_test = data_test.merge(days_per_id, on='id')

Y_id = data_test.id
data_test = data_test.drop(['id'], axis = 1)

  .format(sorted(unknown, key=str)))


In [171]:
Y_pred = model.predict(data_test)

In [172]:
Y_pred = np.where(Y_pred<0, 0, Y_pred)

In [173]:
tmp = pd.DataFrame(Y_pred, columns=['price'])

In [178]:
out = pd.concat([Y_id,tmp], axis = 1)

In [182]:
out.to_csv('xg_res.csv', index=False)

In [180]:
data_test

Unnamed: 0,host_response_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,latitude,longitude,is_location_exact,accommodates,bathrooms,bedrooms,...,name21,name22,name23,name24,name25,name26,name27,name28,name29,days
0,100.0,1.0,1.0,0.0,51.587767,-0.105666,0,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,329
1,75.0,0.0,1.0,0.0,51.515645,-0.314508,1,2,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,100.0,0.0,1.0,0.0,51.568017,-0.111208,1,2,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365
3,50.0,1.0,1.0,1.0,51.520982,-0.140024,1,6,2.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,361
4,100.0,0.0,1.0,1.0,51.472981,-0.163764,1,4,1.5,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22995,50.0,0.0,1.0,1.0,51.560545,-0.055962,1,2,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
22996,100.0,0.0,1.0,1.0,51.528993,-0.142214,0,3,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32
22997,50.0,0.0,1.0,0.0,51.466093,-0.159151,1,6,1.5,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
22998,50.0,0.0,1.0,1.0,51.626713,-0.129613,1,5,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
