# Import

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import re
from datetime import datetime

In [2]:
# import sklearn modules
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# import gradient boosting models
import lightgbm
from lightgbm import LGBMRegressor

In [3]:
# import scipy modules
from scipy.stats import randint
from scipy.stats import uniform

In [4]:
# utils function
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [82]:
# import data
_date_cols = ['host_since', 'first_review', 'last_review']
df = pd.read_csv('./data/train.csv', low_memory=False, parse_dates=_date_cols)
test_df = pd.read_csv('./data/test.csv', low_memory=False, parse_dates=_date_cols)

# Data Preprocessing

In [83]:
DF_LEN = 33538
TESTDF_LEN = 17337
# combine df and test_df for feature engineering
def combine_df(df, test_df):
    entire_df = pd.concat([df, test_df], ignore_index=True)
    return entire_df

def split(entire_df):
    df = entire_df.iloc[:DF_LEN].copy()
    test_df = entire_df.iloc[DF_LEN:].copy()
    test_df.drop('price', axis=1, inplace=True)
    return df, test_df

entire_df = combine_df(df, test_df)

In [84]:
# drop useless columns
def pre_drop_cols(df):
    useless_col = [
        'experiences_offered', 'host_acceptance_rate',
        'is_business_travel_ready', "square_feet", 'country_code'
    ]
    unique_id = ['id', 'host_id', 'name', 'host_name']
    df.drop(useless_col + unique_id, axis=1, inplace=True)
    
    return df

entire_df = pre_drop_cols(entire_df)

In [85]:
# clean extra people columns
def clean_extra_people(df):
    df['extra_people'] = df['extra_people'].str.lstrip('$').astype(float)
    return df

entire_df = clean_extra_people(entire_df)

In [86]:
# define colnames
num_cols = [
    'calculated_host_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
    'beds', 'guests_included', 'minimum_nights', 'number_of_reviews',
    'review_scores_rating', 'review_scores_cleanliness',
    'review_scores_location', 'extra_people',
]

cate_cols = [
    'host_is_superhost', 'property_type', 'room_type', 'bed_type',
    'neighbourhood_group_cleansed', 'transit', 
]

date_cols = ['host_since', 'first_review', 'last_review']
label = ['price']

In [87]:
# group by column and compute statistics
def featengi_bycol(df, colname, feat_lst): 
    gourp_cols = num_cols + [colname]
    
    if 'mean' in feat_lst:
        temp_mean = df[gourp_cols].groupby([colname]).transform(lambda x: x.mean())
        temp_mean = temp_mean.add_prefix(colname[:5]+'_mean_')
        df = pd.concat([df, temp_mean], axis=1)
        num_cols.extend(temp_mean.columns.values.tolist())
    if 'max' in feat_lst:
        temp_max = df[gourp_cols].groupby([colname]).transform(lambda x: x.max())
        temp_max = temp_max.add_prefix(colname[:5] + '_max_')
        df = pd.concat([df, temp_max], axis=1)
        num_cols.extend(temp_max.columns.values.tolist())
    return df

entire_df = featengi_bycol(entire_df, 'neighbourhood_cleansed', ['mean','max'])

In [88]:
def calculated_features(df):
    # number of accommdates per beds
    df['accom_per_beds'] = df['accommodates'] / df['beds']
    df['accom_per_beds'] = df['accom_per_beds'].apply(lambda x: 4 if x == float('inf') else x)
    if 'accom_per_beds' not in num_cols:
        num_cols.append('accom_per_beds')
        
    # number of accommdates per bath
    df['accom_per_bath'] = df['accommodates'] / df['bathrooms']
    df['accom_per_bath'] = df['accom_per_bath'].apply(lambda x: 4 if x == float('inf') else x)
    if 'accom_per_bath' not in num_cols:
        num_cols.append('accom_per_bath')
    
    # clean transit
    df['transit'] = df['transit'].str.contains('Subway|Train|train|subway|Buses|buses').astype('str')
    
    # 1 if ratio <=1 else 0
    df['accom_per_beds_ratio'] = df['accom_per_beds'].apply(lambda x: '1' if x <= 1. else '0')
    if 'accom_per_beds_ratio' not in cate_cols:
        cate_cols.append('accom_per_beds_ratio')
        
    bath_per_bed = df['bathrooms'] / df['beds']
    df['bath_per_bed'] = bath_per_bed.apply(lambda x: '1' if x >= 1. else '0')
    if 'bath_per_bed' not in cate_cols:
        cate_cols.append('bath_per_bed')
    
    # extra people
    df['extra_ppl'] = df['accommodates'] - df['guests_included']
    if 'extra_ppl' not in num_cols:
        num_cols.append('extra_ppl')
    
    # night duration
    df['nights_duration'] = df['maximum_nights'] - df['minimum_nights']
    if 'nights_duration' not in num_cols:
        num_cols.append('nights_duration')
    
    return df

entire_df = calculated_features(entire_df)

In [89]:
# fill number of review with 0
def fill_null(df):
    # fill number of review 0
    df['number_of_reviews'] = df['number_of_reviews'].fillna(0)
    return df
entire_df = fill_null(entire_df)

# fill with mean grouoby neighbourhood_group_cleansed
entire_df[num_cols] = entire_df[num_cols].fillna(
    entire_df[num_cols + ['neighbourhood_group_cleansed']].groupby(
        ['neighbourhood_group_cleansed'])[num_cols].transform('mean'))

In [90]:
# fill na in date cols
def fillna_datetime(df):
    temp = df.copy()
    for col in date_cols:
        df[col] = df[col].fillna(min(df[col]))
    return temp

entire_df = fillna_datetime(entire_df)

In [91]:
# split entire df
df, test_df = split(entire_df)

# Model training

In [92]:
# construct pipeline
numeric_transformer = Pipeline(
    steps=[('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='_')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

column_transfomer = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cate_cols),
    ('date', MinMaxScaler(), date_cols),
])

In [93]:
# train test split
X = df[num_cols + cate_cols + date_cols]  
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=10)

X_train = column_transfomer.fit_transform(X_train)
X_test = column_transfomer.transform(X_test)

In [112]:
X.shape, X_train.shape

((33538, 51), (30184, 95))

In [94]:
# random search cv
def random_search_cv(reg, params):
    random_reg = RandomizedSearchCV(reg,
                              params,
                              verbose=1,
                              n_jobs=-1,
                              scoring="neg_root_mean_squared_error",
                              cv=5,
                              n_iter=10,
                              refit=True,
                              random_state=42)
    return random_reg

In [105]:
def lgbm_reg(X_train, y_train, X_test, y_test):
    from numpy.random import randint
    lgbm = LGBMRegressor(verbose=0, n_estimators=10000, learning_rate=0.01)

    params = {
        "num_leaves": np.arange(10, 50),
        'min_child_samples': np.arange(10, 50),
        'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
        'max_depth': np.arange(3, 10),
        'subsample': uniform(loc=0.2, scale=0.8),
        'colsample_bytree': uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }

    random_lgbm = random_search_cv(lgbm, params)
    random_lgbm.fit(X_train,
                    y_train,
                    eval_set=(X_test, y_test),
                    eval_metric='rmse',
                    early_stopping_rounds=100,
                    verbose=0)

    return random_lgbm

In [96]:
def eval_model(model):
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_test)
    print("RMSE")
    print("Train RMSE: " + str(rmse(y_train, y_train_pred)))
    print("Test RMSE: " + str(rmse(y_test, y_val_pred)))
    print("Train R2: " + str(r2_score(y_train, y_train_pred)))
    print("Test R2: " + str(r2_score(y_test, y_val_pred)))
    print("Best Params: "+ str(model.best_params_))
    print("Best Score: " + str(model.best_score_))

In [None]:
# train model
randSearch_lgbm = lgbm_reg(X_train, y_train, X_test, y_test)

In [98]:
# model evaluation
eval_model(randSearch_lgbm)

RMSE
Train RMSE: 68.88395264735065
Test RMSE: 73.40196036774593
Train R2: 0.7369847202229826
Test R2: 0.6345304094582379
Best Params: {'colsample_bytree': 0.5752867891211308, 'max_depth': 9, 'min_child_samples': 12, 'min_child_weight': 10.0, 'num_leaves': 16, 'reg_alpha': 5, 'reg_lambda': 0, 'subsample': 0.25204127438822366}
Best Score: -85.41377298819543


In [111]:
# average prediction 
make_submission(y_preds_final.mean(axis=1))

Unnamed: 0,Id,Predicted
0,19307997,449.282512
1,20176193,124.135834
2,19485371,59.415151
3,13079990,54.669028
4,22339757,52.376256
...,...,...
17332,22325617,66.681956
17333,8372650,298.019027
17334,3812554,94.343117
17335,18891508,56.763716


# submission

In [99]:
X_final = column_transfomer.transform(test_df[num_cols + cate_cols + date_cols])
make_submission(randSearch_lgbm.predict(X_final))

Unnamed: 0,Id,Predicted
0,19307997,444.670801
1,20176193,124.982873
2,19485371,51.713023
3,13079990,55.436246
4,22339757,52.005051
...,...,...
17332,22325617,69.390902
17333,8372650,317.897040
17334,3812554,114.830424
17335,18891508,58.494418


In [28]:
def make_submission(y_test_pred):
    from datetime import datetime
    test_df = pd.read_csv('./data/test.csv', low_memory=False)
    sub_df = pd.DataFrame({'Id': test_df['id'], 'Predicted': y_test_pred})
    sub_path = './submission/sub_'+str(datetime.now())+'.csv'
    sub_df.to_csv(sub_path, index=False)
    return sub_df