# Import

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import re
from datetime import datetime

In [2]:
# import sklearn modules
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# import gradient boosting models
import lightgbm
from lightgbm import LGBMRegressor

In [3]:
# import scipy modules
from scipy.stats import randint
from scipy.stats import uniform

In [4]:
# utils function
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [82]:
# import data
_date_cols = ['host_since', 'first_review', 'last_review']
df = pd.read_csv('./data/train.csv', low_memory=False, parse_dates=_date_cols)
test_df = pd.read_csv('./data/test.csv', low_memory=False, parse_dates=_date_cols)

# Data Preprocessing

In [83]:
DF_LEN = 33538
TESTDF_LEN = 17337
# combine df and test_df for feature engineering
def combine_df(df, test_df):
    entire_df = pd.concat([df, test_df], ignore_index=True)
    return entire_df

def split(entire_df):
    df = entire_df.iloc[:DF_LEN].copy()
    test_df = entire_df.iloc[DF_LEN:].copy()
    test_df.drop('price', axis=1, inplace=True)
    return df, test_df

entire_df = combine_df(df, test_df)

In [84]:
# drop useless columns
def pre_drop_cols(df):
    useless_col = [
        'experiences_offered', 'host_acceptance_rate',
        'is_business_travel_ready', "square_feet", 'country_code'
    ]
    unique_id = ['id', 'host_id', 'name', 'host_name']
    df.drop(useless_col + unique_id, axis=1, inplace=True)
    
    return df

entire_df = pre_drop_cols(entire_df)

In [85]:
# clean extra people columns
def clean_extra_people(df):
    df['extra_people'] = df['extra_people'].str.lstrip('$').astype(float)
    return df

entire_df = clean_extra_people(entire_df)

In [86]:
# define colnames
num_cols = [
    'calculated_host_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
    'beds', 'guests_included', 'minimum_nights', 'number_of_reviews',
    'review_scores_rating', 'review_scores_cleanliness',
    'review_scores_location', 'extra_people',
]

cate_cols = [
    'host_is_superhost', 'property_type', 'room_type', 'bed_type',
    'neighbourhood_group_cleansed', 'transit', 
]

date_cols = ['host_since', 'first_review', 'last_review']
label = ['price']

In [87]:
# group by column and compute statistics
def featengi_bycol(df, colname, feat_lst): 
    gourp_cols = num_cols + [colname]
    
    if 'mean' in feat_lst:
        temp_mean = df[gourp_cols].groupby([colname]).transform(lambda x: x.mean())
        temp_mean = temp_mean.add_prefix(colname[:5]+'_mean_')
        df = pd.concat([df, temp_mean], axis=1)
        num_cols.extend(temp_mean.columns.values.tolist())
    if 'max' in feat_lst:
        temp_max = df[gourp_cols].groupby([colname]).transform(lambda x: x.max())
        temp_max = temp_max.add_prefix(colname[:5] + '_max_')
        df = pd.concat([df, temp_max], axis=1)
        num_cols.extend(temp_max.columns.values.tolist())
    return df

entire_df = featengi_bycol(entire_df, 'neighbourhood_cleansed', ['mean','max'])

In [88]:
def calculated_features(df):
    # number of accommdates per beds
    df['accom_per_beds'] = df['accommodates'] / df['beds']
    df['accom_per_beds'] = df['accom_per_beds'].apply(lambda x: 4 if x == float('inf') else x)
    if 'accom_per_beds' not in num_cols:
        num_cols.append('accom_per_beds')
        
    # number of accommdates per bath
    df['accom_per_bath'] = df['accommodates'] / df['bathrooms']
    df['accom_per_bath'] = df['accom_per_bath'].apply(lambda x: 4 if x == float('inf') else x)
    if 'accom_per_bath' not in num_cols:
        num_cols.append('accom_per_bath')
    
    # clean transit
    df['transit'] = df['transit'].str.contains('Subway|Train|train|subway|Buses|buses').astype('str')
    
    # 1 if ratio <=1 else 0
    df['accom_per_beds_ratio'] = df['accom_per_beds'].apply(lambda x: '1' if x <= 1. else '0')
    if 'accom_per_beds_ratio' not in cate_cols:
        cate_cols.append('accom_per_beds_ratio')
        
    bath_per_bed = df['bathrooms'] / df['beds']
    df['bath_per_bed'] = bath_per_bed.apply(lambda x: '1' if x >= 1. else '0')
    if 'bath_per_bed' not in cate_cols:
        cate_cols.append('bath_per_bed')
    
    # extra people
    df['extra_ppl'] = df['accommodates'] - df['guests_included']
    if 'extra_ppl' not in num_cols:
        num_cols.append('extra_ppl')
    
    # night duration
    df['nights_duration'] = df['maximum_nights'] - df['minimum_nights']
    if 'nights_duration' not in num_cols:
        num_cols.append('nights_duration')
    
    return df

entire_df = calculated_features(entire_df)

In [89]:
# fill number of review with 0
def fill_null(df):
    # fill number of review 0
    df['number_of_reviews'] = df['number_of_reviews'].fillna(0)
    return df
entire_df = fill_null(entire_df)

# fill with mean grouoby neighbourhood_group_cleansed
entire_df[num_cols] = entire_df[num_cols].fillna(
    entire_df[num_cols + ['neighbourhood_group_cleansed']].groupby(
        ['neighbourhood_group_cleansed'])[num_cols].transform('mean'))

In [90]:
# fill na in date cols
def fillna_datetime(df):
    temp = df.copy()
    for col in date_cols:
        df[col] = df[col].fillna(min(df[col]))
    return temp

entire_df = fillna_datetime(entire_df)

In [91]:
# split entire df
df, test_df = split(entire_df)

# Model training

In [92]:
# construct pipeline
numeric_transformer = Pipeline(
    steps=[('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='_')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

column_transfomer = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cate_cols),
    ('date', MinMaxScaler(), date_cols),
])

In [93]:
# train test split
X = df[num_cols + cate_cols + date_cols]  
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=10)

X_train = column_transfomer.fit_transform(X_train)
X_test = column_transfomer.transform(X_test)

In [112]:
X.shape, X_train.shape

((33538, 51), (30184, 95))

In [94]:
# random search cv
def random_search_cv(reg, params):
    random_reg = RandomizedSearchCV(reg,
                              params,
                              verbose=1,
                              n_jobs=-1,
                              scoring="neg_root_mean_squared_error",
                              cv=5,
                              n_iter=10,
                              refit=True,
                              random_state=42)
    return random_reg

In [105]:
def lgbm_reg(X_train, y_train, X_test, y_test):
    from numpy.random import randint
    lgbm = LGBMRegressor(verbose=0, n_estimators=10000, learning_rate=0.01)

    params = {
        "num_leaves": np.arange(10, 50),
        'min_child_samples': np.arange(10, 50),
        'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
        'max_depth': np.arange(3, 10),
        'subsample': uniform(loc=0.2, scale=0.8),
        'colsample_bytree': uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }

    random_lgbm = random_search_cv(lgbm, params)
    random_lgbm.fit(X_train,
                    y_train,
                    eval_set=(X_test, y_test),
                    eval_metric='rmse',
                    early_stopping_rounds=100,
                    verbose=0)

    return random_lgbm

In [96]:
def eval_model(model):
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_test)
    print("RMSE")
    print("Train RMSE: " + str(rmse(y_train, y_train_pred)))
    print("Test RMSE: " + str(rmse(y_test, y_val_pred)))
    print("Train R2: " + str(r2_score(y_train, y_train_pred)))
    print("Test R2: " + str(r2_score(y_test, y_val_pred)))
    print("Best Params: "+ str(model.best_params_))
    print("Best Score: " + str(model.best_score_))

In [97]:
# train model
randSearch_lgbm = lgbm_reg(X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.7min finished


[1]	valid_0's rmse: 120.86	valid_0's l2: 14607.1
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 120.262	valid_0's l2: 14462.8
[3]	valid_0's rmse: 119.71	valid_0's l2: 14330.4
[4]	valid_0's rmse: 119.151	valid_0's l2: 14197.1
[5]	valid_0's rmse: 118.642	valid_0's l2: 14075.9
[6]	valid_0's rmse: 118.046	valid_0's l2: 13934.8
[7]	valid_0's rmse: 117.513	valid_0's l2: 13809.3
[8]	valid_0's rmse: 117.035	valid_0's l2: 13697.3
[9]	valid_0's rmse: 116.479	valid_0's l2: 13567.4
[10]	valid_0's rmse: 115.971	valid_0's l2: 13449.2
[11]	valid_0's rmse: 115.432	valid_0's l2: 13324.7
[12]	valid_0's rmse: 114.92	valid_0's l2: 13206.6
[13]	valid_0's rmse: 114.411	valid_0's l2: 13089.8
[14]	valid_0's rmse: 113.928	valid_0's l2: 12979.5
[15]	valid_0's rmse: 113.43	valid_0's l2: 12866.5
[16]	valid_0's rmse: 112.937	valid_0's l2: 12754.8
[17]	valid_0's rmse: 112.554	valid_0's l2: 12668.4
[18]	valid_0's rmse: 112.062	valid_0's l2: 12557.9
[19]	valid_0's rmse: 111.574	val

[220]	valid_0's rmse: 79.829	valid_0's l2: 6372.66
[221]	valid_0's rmse: 79.7992	valid_0's l2: 6367.91
[222]	valid_0's rmse: 79.7786	valid_0's l2: 6364.62
[223]	valid_0's rmse: 79.7522	valid_0's l2: 6360.41
[224]	valid_0's rmse: 79.7241	valid_0's l2: 6355.94
[225]	valid_0's rmse: 79.6865	valid_0's l2: 6349.94
[226]	valid_0's rmse: 79.6533	valid_0's l2: 6344.65
[227]	valid_0's rmse: 79.6233	valid_0's l2: 6339.86
[228]	valid_0's rmse: 79.5981	valid_0's l2: 6335.85
[229]	valid_0's rmse: 79.5556	valid_0's l2: 6329.1
[230]	valid_0's rmse: 79.5263	valid_0's l2: 6324.43
[231]	valid_0's rmse: 79.49	valid_0's l2: 6318.66
[232]	valid_0's rmse: 79.4699	valid_0's l2: 6315.47
[233]	valid_0's rmse: 79.438	valid_0's l2: 6310.39
[234]	valid_0's rmse: 79.4052	valid_0's l2: 6305.18
[235]	valid_0's rmse: 79.3782	valid_0's l2: 6300.9
[236]	valid_0's rmse: 79.3462	valid_0's l2: 6295.81
[237]	valid_0's rmse: 79.3261	valid_0's l2: 6292.63
[238]	valid_0's rmse: 79.2972	valid_0's l2: 6288.04
[239]	valid_0's rm

[385]	valid_0's rmse: 76.5799	valid_0's l2: 5864.48
[386]	valid_0's rmse: 76.5714	valid_0's l2: 5863.18
[387]	valid_0's rmse: 76.5763	valid_0's l2: 5863.93
[388]	valid_0's rmse: 76.5686	valid_0's l2: 5862.76
[389]	valid_0's rmse: 76.5621	valid_0's l2: 5861.75
[390]	valid_0's rmse: 76.5526	valid_0's l2: 5860.29
[391]	valid_0's rmse: 76.5433	valid_0's l2: 5858.88
[392]	valid_0's rmse: 76.54	valid_0's l2: 5858.37
[393]	valid_0's rmse: 76.5204	valid_0's l2: 5855.37
[394]	valid_0's rmse: 76.5086	valid_0's l2: 5853.56
[395]	valid_0's rmse: 76.5006	valid_0's l2: 5852.35
[396]	valid_0's rmse: 76.4923	valid_0's l2: 5851.06
[397]	valid_0's rmse: 76.4848	valid_0's l2: 5849.93
[398]	valid_0's rmse: 76.4721	valid_0's l2: 5847.99
[399]	valid_0's rmse: 76.4654	valid_0's l2: 5846.95
[400]	valid_0's rmse: 76.4461	valid_0's l2: 5844
[401]	valid_0's rmse: 76.4412	valid_0's l2: 5843.26
[402]	valid_0's rmse: 76.4301	valid_0's l2: 5841.57
[403]	valid_0's rmse: 76.4223	valid_0's l2: 5840.37
[404]	valid_0's r

[640]	valid_0's rmse: 75.1206	valid_0's l2: 5643.1
[641]	valid_0's rmse: 75.119	valid_0's l2: 5642.86
[642]	valid_0's rmse: 75.1124	valid_0's l2: 5641.87
[643]	valid_0's rmse: 75.1085	valid_0's l2: 5641.29
[644]	valid_0's rmse: 75.1075	valid_0's l2: 5641.14
[645]	valid_0's rmse: 75.1071	valid_0's l2: 5641.07
[646]	valid_0's rmse: 75.1015	valid_0's l2: 5640.23
[647]	valid_0's rmse: 75.1008	valid_0's l2: 5640.13
[648]	valid_0's rmse: 75.1	valid_0's l2: 5640.02
[649]	valid_0's rmse: 75.1015	valid_0's l2: 5640.24
[650]	valid_0's rmse: 75.0916	valid_0's l2: 5638.74
[651]	valid_0's rmse: 75.0885	valid_0's l2: 5638.28
[652]	valid_0's rmse: 75.088	valid_0's l2: 5638.2
[653]	valid_0's rmse: 75.0871	valid_0's l2: 5638.08
[654]	valid_0's rmse: 75.0817	valid_0's l2: 5637.26
[655]	valid_0's rmse: 75.0739	valid_0's l2: 5636.08
[656]	valid_0's rmse: 75.0684	valid_0's l2: 5635.26
[657]	valid_0's rmse: 75.061	valid_0's l2: 5634.16
[658]	valid_0's rmse: 75.0605	valid_0's l2: 5634.07
[659]	valid_0's rmse

[803]	valid_0's rmse: 74.6701	valid_0's l2: 5575.63
[804]	valid_0's rmse: 74.6698	valid_0's l2: 5575.58
[805]	valid_0's rmse: 74.6656	valid_0's l2: 5574.95
[806]	valid_0's rmse: 74.6588	valid_0's l2: 5573.94
[807]	valid_0's rmse: 74.655	valid_0's l2: 5573.36
[808]	valid_0's rmse: 74.6567	valid_0's l2: 5573.62
[809]	valid_0's rmse: 74.6543	valid_0's l2: 5573.26
[810]	valid_0's rmse: 74.6489	valid_0's l2: 5572.47
[811]	valid_0's rmse: 74.6504	valid_0's l2: 5572.68
[812]	valid_0's rmse: 74.6469	valid_0's l2: 5572.16
[813]	valid_0's rmse: 74.6497	valid_0's l2: 5572.57
[814]	valid_0's rmse: 74.6485	valid_0's l2: 5572.4
[815]	valid_0's rmse: 74.6513	valid_0's l2: 5572.81
[816]	valid_0's rmse: 74.6484	valid_0's l2: 5572.39
[817]	valid_0's rmse: 74.6456	valid_0's l2: 5571.97
[818]	valid_0's rmse: 74.646	valid_0's l2: 5572.03
[819]	valid_0's rmse: 74.6463	valid_0's l2: 5572.07
[820]	valid_0's rmse: 74.6433	valid_0's l2: 5571.62
[821]	valid_0's rmse: 74.6407	valid_0's l2: 5571.23
[822]	valid_0's

[1009]	valid_0's rmse: 74.4635	valid_0's l2: 5544.81
[1010]	valid_0's rmse: 74.4622	valid_0's l2: 5544.62
[1011]	valid_0's rmse: 74.4616	valid_0's l2: 5544.53
[1012]	valid_0's rmse: 74.4601	valid_0's l2: 5544.3
[1013]	valid_0's rmse: 74.4535	valid_0's l2: 5543.32
[1014]	valid_0's rmse: 74.454	valid_0's l2: 5543.4
[1015]	valid_0's rmse: 74.4509	valid_0's l2: 5542.94
[1016]	valid_0's rmse: 74.4487	valid_0's l2: 5542.61
[1017]	valid_0's rmse: 74.4482	valid_0's l2: 5542.53
[1018]	valid_0's rmse: 74.4488	valid_0's l2: 5542.63
[1019]	valid_0's rmse: 74.4497	valid_0's l2: 5542.75
[1020]	valid_0's rmse: 74.4477	valid_0's l2: 5542.46
[1021]	valid_0's rmse: 74.4456	valid_0's l2: 5542.14
[1022]	valid_0's rmse: 74.4433	valid_0's l2: 5541.8
[1023]	valid_0's rmse: 74.443	valid_0's l2: 5541.76
[1024]	valid_0's rmse: 74.4424	valid_0's l2: 5541.67
[1025]	valid_0's rmse: 74.4442	valid_0's l2: 5541.93
[1026]	valid_0's rmse: 74.4434	valid_0's l2: 5541.82
[1027]	valid_0's rmse: 74.4404	valid_0's l2: 5541.3

[1259]	valid_0's rmse: 74.288	valid_0's l2: 5518.71
[1260]	valid_0's rmse: 74.286	valid_0's l2: 5518.41
[1261]	valid_0's rmse: 74.2862	valid_0's l2: 5518.44
[1262]	valid_0's rmse: 74.2852	valid_0's l2: 5518.28
[1263]	valid_0's rmse: 74.2845	valid_0's l2: 5518.19
[1264]	valid_0's rmse: 74.2829	valid_0's l2: 5517.94
[1265]	valid_0's rmse: 74.2817	valid_0's l2: 5517.78
[1266]	valid_0's rmse: 74.2792	valid_0's l2: 5517.4
[1267]	valid_0's rmse: 74.2801	valid_0's l2: 5517.53
[1268]	valid_0's rmse: 74.2724	valid_0's l2: 5516.39
[1269]	valid_0's rmse: 74.2697	valid_0's l2: 5515.99
[1270]	valid_0's rmse: 74.2675	valid_0's l2: 5515.66
[1271]	valid_0's rmse: 74.2673	valid_0's l2: 5515.63
[1272]	valid_0's rmse: 74.2665	valid_0's l2: 5515.51
[1273]	valid_0's rmse: 74.2624	valid_0's l2: 5514.91
[1274]	valid_0's rmse: 74.262	valid_0's l2: 5514.85
[1275]	valid_0's rmse: 74.2625	valid_0's l2: 5514.92
[1276]	valid_0's rmse: 74.262	valid_0's l2: 5514.85
[1277]	valid_0's rmse: 74.2628	valid_0's l2: 5514.9

[1519]	valid_0's rmse: 74.0739	valid_0's l2: 5486.94
[1520]	valid_0's rmse: 74.0741	valid_0's l2: 5486.97
[1521]	valid_0's rmse: 74.071	valid_0's l2: 5486.51
[1522]	valid_0's rmse: 74.0702	valid_0's l2: 5486.39
[1523]	valid_0's rmse: 74.0707	valid_0's l2: 5486.46
[1524]	valid_0's rmse: 74.069	valid_0's l2: 5486.22
[1525]	valid_0's rmse: 74.068	valid_0's l2: 5486.07
[1526]	valid_0's rmse: 74.0666	valid_0's l2: 5485.87
[1527]	valid_0's rmse: 74.0666	valid_0's l2: 5485.86
[1528]	valid_0's rmse: 74.0613	valid_0's l2: 5485.08
[1529]	valid_0's rmse: 74.0615	valid_0's l2: 5485.1
[1530]	valid_0's rmse: 74.0616	valid_0's l2: 5485.13
[1531]	valid_0's rmse: 74.0604	valid_0's l2: 5484.95
[1532]	valid_0's rmse: 74.0583	valid_0's l2: 5484.63
[1533]	valid_0's rmse: 74.0587	valid_0's l2: 5484.69
[1534]	valid_0's rmse: 74.0567	valid_0's l2: 5484.39
[1535]	valid_0's rmse: 74.0566	valid_0's l2: 5484.38
[1536]	valid_0's rmse: 74.056	valid_0's l2: 5484.29
[1537]	valid_0's rmse: 74.0531	valid_0's l2: 5483.8

[1783]	valid_0's rmse: 73.9288	valid_0's l2: 5465.47
[1784]	valid_0's rmse: 73.93	valid_0's l2: 5465.65
[1785]	valid_0's rmse: 73.9289	valid_0's l2: 5465.49
[1786]	valid_0's rmse: 73.9301	valid_0's l2: 5465.66
[1787]	valid_0's rmse: 73.9292	valid_0's l2: 5465.52
[1788]	valid_0's rmse: 73.9349	valid_0's l2: 5466.36
[1789]	valid_0's rmse: 73.9338	valid_0's l2: 5466.2
[1790]	valid_0's rmse: 73.9329	valid_0's l2: 5466.07
[1791]	valid_0's rmse: 73.9332	valid_0's l2: 5466.12
[1792]	valid_0's rmse: 73.9326	valid_0's l2: 5466.03
[1793]	valid_0's rmse: 73.9307	valid_0's l2: 5465.74
[1794]	valid_0's rmse: 73.931	valid_0's l2: 5465.79
[1795]	valid_0's rmse: 73.9298	valid_0's l2: 5465.62
[1796]	valid_0's rmse: 73.9298	valid_0's l2: 5465.61
[1797]	valid_0's rmse: 73.9297	valid_0's l2: 5465.6
[1798]	valid_0's rmse: 73.9283	valid_0's l2: 5465.39
[1799]	valid_0's rmse: 73.9278	valid_0's l2: 5465.32
[1800]	valid_0's rmse: 73.9232	valid_0's l2: 5464.64
[1801]	valid_0's rmse: 73.9248	valid_0's l2: 5464.8

[2077]	valid_0's rmse: 73.8031	valid_0's l2: 5446.9
[2078]	valid_0's rmse: 73.8025	valid_0's l2: 5446.81
[2079]	valid_0's rmse: 73.8021	valid_0's l2: 5446.75
[2080]	valid_0's rmse: 73.801	valid_0's l2: 5446.59
[2081]	valid_0's rmse: 73.801	valid_0's l2: 5446.59
[2082]	valid_0's rmse: 73.8013	valid_0's l2: 5446.63
[2083]	valid_0's rmse: 73.8	valid_0's l2: 5446.44
[2084]	valid_0's rmse: 73.8002	valid_0's l2: 5446.48
[2085]	valid_0's rmse: 73.7977	valid_0's l2: 5446.09
[2086]	valid_0's rmse: 73.7957	valid_0's l2: 5445.8
[2087]	valid_0's rmse: 73.7944	valid_0's l2: 5445.61
[2088]	valid_0's rmse: 73.7951	valid_0's l2: 5445.71
[2089]	valid_0's rmse: 73.7962	valid_0's l2: 5445.88
[2090]	valid_0's rmse: 73.7964	valid_0's l2: 5445.92
[2091]	valid_0's rmse: 73.7957	valid_0's l2: 5445.8
[2092]	valid_0's rmse: 73.7902	valid_0's l2: 5445
[2093]	valid_0's rmse: 73.7901	valid_0's l2: 5444.97
[2094]	valid_0's rmse: 73.79	valid_0's l2: 5444.96
[2095]	valid_0's rmse: 73.7854	valid_0's l2: 5444.28
[2096]

[2308]	valid_0's rmse: 73.6939	valid_0's l2: 5430.78
[2309]	valid_0's rmse: 73.6928	valid_0's l2: 5430.63
[2310]	valid_0's rmse: 73.6934	valid_0's l2: 5430.72
[2311]	valid_0's rmse: 73.6935	valid_0's l2: 5430.73
[2312]	valid_0's rmse: 73.6925	valid_0's l2: 5430.59
[2313]	valid_0's rmse: 73.6982	valid_0's l2: 5431.43
[2314]	valid_0's rmse: 73.6983	valid_0's l2: 5431.44
[2315]	valid_0's rmse: 73.6984	valid_0's l2: 5431.45
[2316]	valid_0's rmse: 73.698	valid_0's l2: 5431.4
[2317]	valid_0's rmse: 73.6979	valid_0's l2: 5431.39
[2318]	valid_0's rmse: 73.6974	valid_0's l2: 5431.3
[2319]	valid_0's rmse: 73.6975	valid_0's l2: 5431.32
[2320]	valid_0's rmse: 73.6959	valid_0's l2: 5431.08
[2321]	valid_0's rmse: 73.6994	valid_0's l2: 5431.6
[2322]	valid_0's rmse: 73.7004	valid_0's l2: 5431.74
[2323]	valid_0's rmse: 73.7004	valid_0's l2: 5431.76
[2324]	valid_0's rmse: 73.6985	valid_0's l2: 5431.47
[2325]	valid_0's rmse: 73.6976	valid_0's l2: 5431.34
[2326]	valid_0's rmse: 73.6986	valid_0's l2: 5431.

[2569]	valid_0's rmse: 73.616	valid_0's l2: 5419.31
[2570]	valid_0's rmse: 73.615	valid_0's l2: 5419.17
[2571]	valid_0's rmse: 73.6157	valid_0's l2: 5419.28
[2572]	valid_0's rmse: 73.6168	valid_0's l2: 5419.43
[2573]	valid_0's rmse: 73.6171	valid_0's l2: 5419.47
[2574]	valid_0's rmse: 73.6166	valid_0's l2: 5419.41
[2575]	valid_0's rmse: 73.6173	valid_0's l2: 5419.51
[2576]	valid_0's rmse: 73.6185	valid_0's l2: 5419.68
[2577]	valid_0's rmse: 73.6194	valid_0's l2: 5419.82
[2578]	valid_0's rmse: 73.6188	valid_0's l2: 5419.73
[2579]	valid_0's rmse: 73.6205	valid_0's l2: 5419.98
[2580]	valid_0's rmse: 73.6217	valid_0's l2: 5420.16
[2581]	valid_0's rmse: 73.6209	valid_0's l2: 5420.03
[2582]	valid_0's rmse: 73.6222	valid_0's l2: 5420.23
[2583]	valid_0's rmse: 73.6215	valid_0's l2: 5420.13
[2584]	valid_0's rmse: 73.6195	valid_0's l2: 5419.83
[2585]	valid_0's rmse: 73.6193	valid_0's l2: 5419.8
[2586]	valid_0's rmse: 73.6186	valid_0's l2: 5419.7
[2587]	valid_0's rmse: 73.6184	valid_0's l2: 5419.

[2851]	valid_0's rmse: 73.5173	valid_0's l2: 5404.79
[2852]	valid_0's rmse: 73.5075	valid_0's l2: 5403.35
[2853]	valid_0's rmse: 73.5077	valid_0's l2: 5403.39
[2854]	valid_0's rmse: 73.5072	valid_0's l2: 5403.3
[2855]	valid_0's rmse: 73.5066	valid_0's l2: 5403.22
[2856]	valid_0's rmse: 73.5081	valid_0's l2: 5403.44
[2857]	valid_0's rmse: 73.508	valid_0's l2: 5403.42
[2858]	valid_0's rmse: 73.5081	valid_0's l2: 5403.44
[2859]	valid_0's rmse: 73.5073	valid_0's l2: 5403.32
[2860]	valid_0's rmse: 73.5078	valid_0's l2: 5403.4
[2861]	valid_0's rmse: 73.5079	valid_0's l2: 5403.41
[2862]	valid_0's rmse: 73.5069	valid_0's l2: 5403.26
[2863]	valid_0's rmse: 73.5068	valid_0's l2: 5403.25
[2864]	valid_0's rmse: 73.5081	valid_0's l2: 5403.44
[2865]	valid_0's rmse: 73.5099	valid_0's l2: 5403.7
[2866]	valid_0's rmse: 73.5116	valid_0's l2: 5403.95
[2867]	valid_0's rmse: 73.5117	valid_0's l2: 5403.97
[2868]	valid_0's rmse: 73.5113	valid_0's l2: 5403.91
[2869]	valid_0's rmse: 73.511	valid_0's l2: 5403.8

[3090]	valid_0's rmse: 73.4646	valid_0's l2: 5397.04
[3091]	valid_0's rmse: 73.4649	valid_0's l2: 5397.09
[3092]	valid_0's rmse: 73.4647	valid_0's l2: 5397.07
[3093]	valid_0's rmse: 73.4641	valid_0's l2: 5396.97
[3094]	valid_0's rmse: 73.4626	valid_0's l2: 5396.75
[3095]	valid_0's rmse: 73.4616	valid_0's l2: 5396.61
[3096]	valid_0's rmse: 73.4621	valid_0's l2: 5396.69
[3097]	valid_0's rmse: 73.4628	valid_0's l2: 5396.78
[3098]	valid_0's rmse: 73.4634	valid_0's l2: 5396.87
[3099]	valid_0's rmse: 73.4567	valid_0's l2: 5395.89
[3100]	valid_0's rmse: 73.4585	valid_0's l2: 5396.15
[3101]	valid_0's rmse: 73.4584	valid_0's l2: 5396.13
[3102]	valid_0's rmse: 73.4563	valid_0's l2: 5395.83
[3103]	valid_0's rmse: 73.4549	valid_0's l2: 5395.62
[3104]	valid_0's rmse: 73.4562	valid_0's l2: 5395.82
[3105]	valid_0's rmse: 73.4535	valid_0's l2: 5395.42
[3106]	valid_0's rmse: 73.4534	valid_0's l2: 5395.41
[3107]	valid_0's rmse: 73.4543	valid_0's l2: 5395.53
[3108]	valid_0's rmse: 73.4539	valid_0's l2: 5

[3375]	valid_0's rmse: 73.4029	valid_0's l2: 5387.98
[3376]	valid_0's rmse: 73.4036	valid_0's l2: 5388.08
[3377]	valid_0's rmse: 73.4039	valid_0's l2: 5388.13
[3378]	valid_0's rmse: 73.4069	valid_0's l2: 5388.57
[3379]	valid_0's rmse: 73.4063	valid_0's l2: 5388.49
[3380]	valid_0's rmse: 73.4064	valid_0's l2: 5388.5
[3381]	valid_0's rmse: 73.4084	valid_0's l2: 5388.8
[3382]	valid_0's rmse: 73.4078	valid_0's l2: 5388.71
[3383]	valid_0's rmse: 73.4074	valid_0's l2: 5388.65
[3384]	valid_0's rmse: 73.4066	valid_0's l2: 5388.52
[3385]	valid_0's rmse: 73.4075	valid_0's l2: 5388.65
[3386]	valid_0's rmse: 73.4077	valid_0's l2: 5388.69
[3387]	valid_0's rmse: 73.4077	valid_0's l2: 5388.69
[3388]	valid_0's rmse: 73.4086	valid_0's l2: 5388.83
[3389]	valid_0's rmse: 73.4078	valid_0's l2: 5388.71
[3390]	valid_0's rmse: 73.4056	valid_0's l2: 5388.38
[3391]	valid_0's rmse: 73.4057	valid_0's l2: 5388.4
[3392]	valid_0's rmse: 73.4074	valid_0's l2: 5388.64
[3393]	valid_0's rmse: 73.4067	valid_0's l2: 5388

In [98]:
# model evaluation
eval_model(randSearch_lgbm)

RMSE
Train RMSE: 68.88395264735065
Test RMSE: 73.40196036774593
Train R2: 0.7369847202229826
Test R2: 0.6345304094582379
Best Params: {'colsample_bytree': 0.5752867891211308, 'max_depth': 9, 'min_child_samples': 12, 'min_child_weight': 10.0, 'num_leaves': 16, 'reg_alpha': 5, 'reg_lambda': 0, 'subsample': 0.25204127438822366}
Best Score: -85.41377298819543


In [111]:
# average prediction 
make_submission(y_preds_final.mean(axis=1))

Unnamed: 0,Id,Predicted
0,19307997,449.282512
1,20176193,124.135834
2,19485371,59.415151
3,13079990,54.669028
4,22339757,52.376256
...,...,...
17332,22325617,66.681956
17333,8372650,298.019027
17334,3812554,94.343117
17335,18891508,56.763716


# submission

In [99]:
X_final = column_transfomer.transform(test_df[num_cols + cate_cols + date_cols])
make_submission(randSearch_lgbm.predict(X_final))

Unnamed: 0,Id,Predicted
0,19307997,444.670801
1,20176193,124.982873
2,19485371,51.713023
3,13079990,55.436246
4,22339757,52.005051
...,...,...
17332,22325617,69.390902
17333,8372650,317.897040
17334,3812554,114.830424
17335,18891508,58.494418


In [28]:
def make_submission(y_test_pred):
    from datetime import datetime
    test_df = pd.read_csv('./data/test.csv', low_memory=False)
    sub_df = pd.DataFrame({'Id': test_df['id'], 'Predicted': y_test_pred})
    sub_path = './submission/sub_'+str(datetime.now())+'.csv'
    sub_df.to_csv(sub_path, index=False)
    return sub_df