### [Regression Data Analysis]
# KaKR House Price Prediction - Boost Models Hyper Parameter Tuning

- 이전에서 확인했던 가장 성능이 좋은 **SUBMISSION #6** 에서의 feature를 이용해, 하이퍼파라미터 튜닝을 해봅니다.


- Grid Search, Coarse Search, Finer Search 등 여러가지 방법을 이용해 봅니다. 

---

#### Import Modules

In [31]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor, plot_importance
from lightgbm import LGBMRegressor, plot_importance

from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import skew

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline

#### Data Load

In [32]:
#### Data Loadrow_data = pd.read_csv("./data/train.csv", index_col=['id'])
row_test = pd.read_csv("./data/test.csv", index_col=['id'])

row_data.shape, row_test.shape

((15035, 20), (6468, 19))

# Utility Functions

- 이전에 만들었던 여러 함수를 활용합니다.

In [33]:
def feature_preprocess(row_data, zipcode_rank_dict):
    
    data = row_data.copy()
    
    # price log transformation
    if 'price' in data.columns:
        data['log_price'] = np.log1p(data.price)
    
    # Date parsing
    data['date'] = data['date'].apply(lambda i: i[:8])
    data['date'] = pd.to_datetime(data['date'])
    data['year'] = data.date.dt.year
    data['month'] = data.date.dt.month
    data['day']= data.date.dt.day
    data['year_month'] = data.date.dt.year * 100 + data.date.dt.month
    data['day_of_week'] = data.date.dt.dayofweek
    data['2015_or_not'] = data.year.apply(lambda i: 0 if i == 2014 else 1)
    data['weekday_ohe'] = data.day_of_week.apply(lambda i: 1 if i in range(5) else 0)
    data['saturday_ohe'] = data.day_of_week.apply(lambda i: 1 if i == 5 else 0)
    data['sunday_ohe'] = data.day_of_week.apply(lambda i: 1 if i == 6 else 0)
    
    # yr_renovated
    data['yr_renovated_ohe'] = data['yr_renovated'].apply(lambda i: 0 if i == 0 else 1)
    
    # zipcode : 위에서 zipcode_rank_dict를 만들어놔야 코드가 실행됨
    data['zipcode_rank'] = 71 - data['zipcode'].apply(lambda i: zipcode_rank_dict[i])
    
    # lat / long
    data['lat_scale'] = (data['lat'] - data['lat'].mean()) / data['lat'].std()
    data['long_scale'] = (data['long'] - data['long'].mean()) / data['long'].std()
    
    # areas
    data['sqft_living_scale'] = np.log1p(data['sqft_living'])
    data['sqft_lot_scale'] = np.log1p(data['sqft_lot'])
    data['sqft_living_diff'] = data['sqft_living15'] - data['sqft_living']
    data['sqft_lot_diff'] = data['sqft_lot15'] - data['sqft_lot']
    data.loc[data['sqft_living_diff'] < 0, 'sqft_living_diff_scale'] = -np.log1p(-data['sqft_living_diff'])
    data.loc[data['sqft_living_diff'] >= 0, 'sqft_living_diff_scale'] = np.log1p(data['sqft_living_diff'])
    data.loc[data['sqft_lot_diff'] < 0, 'sqft_lot_diff_scale'] = -np.log1p(-data['sqft_lot_diff'])
    data.loc[data['sqft_lot_diff'] >= 0, 'sqft_lot_diff_scale'] = np.log1p(data['sqft_lot_diff'])
    data['sqft_above_scale'] = np.log1p(data['sqft_above'])
    data['sqft_basement_ohe'] = data.sqft_basement.apply(lambda i: 0 if i == 0 else 1)
    
    return data

In [34]:
def get_ziprank(data):
    zipcode_rank = data.groupby(['zipcode']).mean().sort_values(by=['price'], \
                                                                ascending=False).reset_index()[['zipcode', 'price']]
    zipcode_rank['rank'] = range(1, len(zipcode_rank)+1)
    zipcode_rank_dict = zipcode_rank.set_index(['zipcode'])['rank'].to_dict()
    
    return zipcode_rank_dict

In [35]:
zipcode_rank_dict = get_ziprank(row_data)

In [36]:
data = feature_preprocess(row_data, zipcode_rank_dict)
test = feature_preprocess(row_test, zipcode_rank_dict)

data.shape, test.shape

((15035, 42), (6468, 40))

In [37]:
set(data.columns) - set(test.columns)

{'log_price', 'price'}

In [16]:
def rmsle_cv(model, train, label):
    rmse_list = np.sqrt(-cross_val_score(model, train, label, scoring='neg_mean_squared_error', cv=5))
    
    return np.round(np.mean(rmse_list), 4)

In [17]:
def r2_cv(model, train, label):
    r2_list = cross_val_score(model, train, label, scoring='r2', cv=5)
    
    return np.round(np.mean(r2_list), 4)

In [18]:
def rmse(model, y_test, y_pred):

    return np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))

In [19]:
def get_scores(models, train, label, rmse_=True):
    df = {}
    
    for model in models:
        model_name = model.__class__.__name__
        df[model_name] = []
        df[model_name].append(rmsle_cv(model, train, label))
        df[model_name].append(r2_cv(model, train, label))
        
        if rmse:
            X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            df[model_name].append(rmse(model, y_test, y_pred))
        
        if rmse:
            score_df = pd.DataFrame(df, index=['RMSLE', 'R2 score', 'RMSE']).T.sort_values('R2 score', \
                                                                                           ascending=False)
        else:
            score_df = pd.DataFrame(df, index=['RMSLE', 'R2 score']).T.sort_values('R2 score', \
                                                                                   ascending=False)
            
    return score_df

In [20]:
def get_coef(models, sort_model):
    coef = {}
    
    for model in models:
        model_name = model.__class__.__name__
        coef[model_name] = model.coef_

    coef_df = pd.DataFrame(coef, index=train.columns)
    
    return coef_df.sort_values(sort_model.__class__.__name__, ascending=False)

In [21]:
def draw_coefs(models):
    if len(models) == 3:
        figure, axs = plt.subplots(nrows=1, ncols=3)
        figure.set_size_inches(18, 6)

        for idx, model in enumerate(models):
            model_name = model.__class__.__name__
            sns.barplot(x=get_coef(models, model)[model_name].values, \
                        y=get_coef(models, model)[model_name].index, \
                        ax=axs[idx]).set_title(model_name)
    else:
        figure, axs = plt.subplots(nrows=2, ncols=2)
        figure.set_size_inches(18, 12)

        for idx, model in enumerate(models):
            model_name = model.__class__.__name__
            sns.barplot(x=get_coef(models, model)[model_name].values, \
                        y=get_coef(models, model)[model_name].index, \
                        ax=axs[idx//2][idx%2]).set_title(model_name)
    
    plt.show()

In [22]:
def draw_importances(models, data):
    if type(models) != list:
        model = models
        model_name = model.__class__.__name__
        importances = [i for i in zip(data.columns, model.feature_importances_)]
        importances = sorted([[i[0], float(i[1])] for i in importances], key=lambda i: i[1], reverse=True)
        plt.figure(figsize=(8, 6))
        sns.barplot(x=[i[1] for i in importances], y=[i[0] for i in importances]).set_title(model_name)
    
    elif len(models) == 3:
        figure, axs = plt.subplots(nrows=1, ncols=3)
        figure.set_size_inches(18, 6)

        for idx, model in enumerate(models):
            model_name = model.__class__.__name__
            importances = [i for i in zip(train.columns, model.feature_importances_)]
            importances = sorted([[i[0], float(i[1])] for i in importances], key=lambda i: i[1], reverse=True)
            sns.barplot(x=[i[1] for i in importances], y=[i[0] for i in importances], \
                        ax=axs[idx]).set_title(model_name)

    else:
        figure, axs = plt.subplots(nrows=2, ncols=2)
        figure.set_size_inches(18, 12)

        for idx, model in enumerate(models):
            model_name = model.__class__.__name__
            importances = [i for i in zip(train.columns, model.feature_importances_)]
            importances = sorted([[i[0], float(i[1])] for i in importances], key=lambda i: i[1], reverse=True)
            sns.barplot(x=[i[1] for i in importances], y=[i[0] for i in importances], \
                        ax=axs[idx//2][idx%2]).set_title(model_name)

    plt.show()

In [23]:
def BlendingModels(models, x, y, sub_x):
    for m in models : 
        m.fit(x.values, y)
    
    predictions = np.column_stack([
        m.predict(sub_x.values) for m in models
    ])
    return np.mean(predictions, axis=1)

In [24]:
def LetSubmit(y_pred, filename):
    submission = pd.read_csv("./data/sample_submission.csv")
    submission['price'] = y_pred
    submission.to_csv("./submission/{}.csv".format(filename), index_label=False, index=False)
    submission_test = pd.read_csv("./submission.csv")
    
    return submission_test

## Get Best Parameters

- GridSearchCV 메서드를 이용해 Best Parameter 를 출력하는 함수

In [25]:
def BestParamsGrid(model, train, label, param_grid, verbose=2, n_jobs=5):
    grid_model = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', \
                              cv=5, verbose=verbose, n_jobs=n_jobs)
    grid_model.fit(train, label)
    rmsle = np.sqrt(-1 * grid_model.best_score_)
    params_df = pd.DataFrame(grid_model.best_params_, index=[grid_model.__class__.__name__]).T

    return grid_model, grid_model.best_params_, rmsle

## CoarseSearch

- 정해져 있는 범위 내에서 랜덤으로 파라미터를 지정해 parameter searchm

In [26]:
def CoarseSearch(train, label, model_name='lgbm', num_epoch=100):
    params_list = []
    
    start_time = datetime.now()
    print("Let's start\t\t\t\t")
    
    for epoch in range(1, num_epoch+1):
        n_estimators = np.random.randint(low=100, high=1000)
        max_depth = np.random.randint(low=2, high=100)
        learning_rate = 10 ** -np.random.uniform(low=0, high=10)
        subsample = np.random.uniform(low=0.4, high=1.0)
        colsample_bytree = np.random.uniform(low=0.4, high=1.0)
        colsample_bylevel = np.random.uniform(low=0.4, high=1.0)
        num_leaves = np.random.randint(low=5, high=1000)
        min_child_samples = np.random.randint(low=50, high=2000)
        
        if model_name == 'lgbm':
            model = LGBMRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  learning_rate=learning_rate,
                                  subsample=subsample,
                                  colsample_bylevel=colsample_bylevel,
                                  colsample_bytree=colsample_bytree,
                                  num_leaves=num_leaves,
                                  min_child_samples=min_child_samples,
                                  seed=37)
            
        elif model_name == 'xgb':
            model = XGBRegressor(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 learning_rate=learning_rate,
                                 subsample=subsample,
                                 colsample_bylevel=colsample_bylevel,
                                 colsample_bytree=colsample_bytree,
                                 seed=37)

        score = np.sqrt(-cross_val_score(model, train, label, cv=8, scoring='neg_mean_squared_error').mean())

        params = {
            'epoch': epoch,
            'score': score,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'learning_rate': learning_rate,
            'subsample': subsample,
            'colsample_bylevel': colsample_bylevel,
            'colsample_bytree': colsample_bytree,
            'num_leaves': num_leaves,
            'min_child_samples': min_child_samples,
        }

        params_list.append(params)
        
        if (epoch == 1) | (epoch % (num_epoch//10) == 0):
            print("{} epoch is now calculating . . .\t".format(epoch), str(datetime.now() - start_time)[:-7])
            
    finish_time = datetime.now()
    print("All Coarse Searching is Finished")
    print("All taken:", str(finish_time - start_time)[:-7])

    params_df = pd.DataFrame.from_dict(params_list).sort_values('score')
    print(params_df.shape)
    
    return params_df

## Finer Search

- parameter range를 인자로 받아 그 안에서 랜덤으로 파라미터를 지정해 parameter search

In [77]:
def FinerSearch(train, label, param_range, model_name='lgbm', num_epoch=100):
    params_list = []
    
    start_time = datetime.now()
    print("Let's start\t\t\t\t")
    
    for epoch in range(1, num_epoch+1):
        n_estimators = np.random.randint(low=param_range['n_estimators'][0], \
                                         high=param_range['n_estimators'][1])
        max_depth = np.random.randint(low=param_range['max_depth'][0], \
                                      high=param_range['max_depth'][1])
        learning_rate = np.random.uniform(low=param_range['learning_rate'][0], \
                                          high=param_range['learning_rate'][1])
        subsample = np.random.uniform(low=param_range['subsample'][0], \
                                      high=param_range['subsample'][1])
        colsample_bytree = np.random.uniform(low=param_range['colsample_bytree'][0], \
                                             high=param_range['colsample_bytree'][1])
        colsample_bylevel = np.random.uniform(low=param_range['colsample_bylevel'][0], \
                                              high=param_range['colsample_bylevel'][1])
        num_leaves = np.random.randint(low=param_range['num_leaves'][0], \
                                       high=param_range['num_leaves'][1])
        min_child_samples = np.random.randint(low=param_range['min_child_samples'][0], \
                                              high=param_range['min_child_samples'][1])
        
        if model_name == 'lgbm':
            model = LGBMRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  learning_rate=learning_rate,
                                  subsample=subsample,
                                  colsample_bylevel=colsample_bylevel,
                                  colsample_bytree=colsample_bytree,
                                  num_leaves=num_leaves,
                                  min_child_samples=min_child_samples,
                                  seed=37)
            
            score = np.sqrt(-cross_val_score(model, train, label, cv=8, \
                                             scoring='neg_mean_squared_error', n_jobs=5).mean())

            params = {
                'epoch': epoch,
                'score': score,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'learning_rate': learning_rate,
                'subsample': subsample,
                'colsample_bylevel': colsample_bylevel,
                'colsample_bytree': colsample_bytree,
                'num_leaves': num_leaves,
                'min_child_samples': min_child_samples,
            }
            
        elif model_name == 'xgb':
            model = XGBRegressor(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 learning_rate=learning_rate,
                                 seed=37)

            score = np.sqrt(-cross_val_score(model, train, label, cv=8, \
                                             scoring='neg_mean_squared_error', n_jobs=5).mean())

            params = {
                'epoch': epoch,
                'score': score,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'learning_rate': learning_rate,
            }

        params_list.append(params)
        
        if (epoch == 1) | (epoch % (num_epoch//10) == 0):
            print("{} epoch is now calculating . . .\t".format(epoch), str(datetime.now() - start_time)[:-7])
            
    finish_time = datetime.now()
    print("All Coarse Searching is Finished")
    print("time duration:", str(finish_time - start_time)[:-7])

    params_df = pd.DataFrame.from_dict(params_list).sort_values('score')
    print(params_df.shape)
    
    return params_df

---
---

## \#SUBMISSION 6 : 3번 + `year_month`

- 이전에서 가장 결과가 좋았던 SUBMISSION #6 을 사용합니다.

In [38]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \
            'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', \
            'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'year_month']

train = data[features]
data['log_price'] = np.log1p(data.price)
label = data['log_price']

In [39]:
gboost = GradientBoostingRegressor()
xgboost = XGBRegressor()
lightgbm = LGBMRegressor()
rdforest = RandomForestRegressor()

models = [gboost, xgboost, lightgbm, rdforest]

In [392]:
%%time
row_scores = get_scores(models, train, label)

CPU times: user 43.2 s, sys: 1.15 s, total: 44.4 s
Wall time: 26.3 s


In [393]:
row_scores

Unnamed: 0,RMSLE,R2 score,RMSE
LGBMRegressor,0.1634,0.9033,121344.899765
XGBRegressor,0.1831,0.8786,131035.538895
GradientBoostingRegressor,0.1831,0.8785,125378.071093
RandomForestRegressor,0.1878,0.8739,120613.126149
LinearRegression,0.251,0.7717,177622.786772
Ridge,0.251,0.7717,182631.804389
Lasso,0.3586,0.5339,249896.050268


# Coarse / Finer Search

### LightGBM Tuning

- 먼저 랜덤으로 러프하게 CoarseSearch 진행

In [126]:
lgbm_tuning_df = CoarseSearch(train, label, 'lgbm', 500)

Let's start				 16:03:11
1 epoch is now calculating . . .	 16:03:12
50 epoch is now calculating . . .	 16:06:06
100 epoch is now calculating . . .	 16:08:16
150 epoch is now calculating . . .	 16:11:17
200 epoch is now calculating . . .	 16:13:14
250 epoch is now calculating . . .	 16:16:31
300 epoch is now calculating . . .	 16:18:55
350 epoch is now calculating . . .	 16:21:06
400 epoch is now calculating . . .	 16:24:28
450 epoch is now calculating . . .	 16:27:44
500 epoch is now calculating . . .	 16:30:25
All Coarse Searching is Finished
time duration: 0:27:13
(500, 10)


- 결과 확인

In [154]:
lgbm_tuning_df.head(10).sort_values('learning_rate')

Unnamed: 0,colsample_bylevel,colsample_bytree,epoch,learning_rate,max_depth,min_child_samples,n_estimators,num_leaves,score,subsample
180,0.654386,0.921381,181,0.043654,50,495,569,348,0.170261,0.93695
152,0.702562,0.529863,153,0.045601,51,431,572,126,0.167844,0.860284
343,0.45302,0.535704,344,0.055449,43,288,814,634,0.163962,0.943352
228,0.901923,0.925929,229,0.07678,30,747,896,730,0.171047,0.569215
313,0.971499,0.814491,314,0.098614,82,416,293,568,0.167425,0.836102
19,0.711771,0.625738,20,0.106436,86,493,772,961,0.166852,0.536372
490,0.456682,0.451429,491,0.187445,70,621,585,712,0.171147,0.410388
188,0.54385,0.426447,189,0.201176,13,580,578,576,0.171205,0.491564
250,0.647272,0.509298,251,0.237333,11,446,824,281,0.169666,0.673658
120,0.92668,0.47835,121,0.298567,62,449,575,949,0.170741,0.782413


- 상위 10개를 확인 후 대략적으로 좁혀진 파라미터 범위를 기반으로 Finer Search 진행

In [156]:
param_range = {
    'n_estimators': [500, 900],
    'max_depth': [30, 90],
    'learning_rate': [0, 2],
    'subsample': [0.5, 1],
    'colsample_bylevel': [0.45, 1],
    'colsample_bytree': [0.4, 0.9],
    'num_leaves': [300, 1000],
    'min_child_samples': [300, 800],
}

In [157]:
lgbm_finer_df = FinerSearch(train, label, param_range, 'lgbm', 300)

Let's start				 17:25:36
1 epoch is now calculating . . .	 17:25:40
50 epoch is now calculating . . .	 17:29:24
100 epoch is now calculating . . .	 17:33:18
150 epoch is now calculating . . .	 17:37:13
200 epoch is now calculating . . .	 17:41:15
250 epoch is now calculating . . .	 17:45:06
300 epoch is now calculating . . .	 17:49:02
All Coarse Searching is Finished
time duration: 0:23:25
(300, 10)


In [160]:
lgbm_finer_df.head(10)#.sort_values('max_depth')

Unnamed: 0,colsample_bylevel,colsample_bytree,epoch,learning_rate,max_depth,min_child_samples,n_estimators,num_leaves,score,subsample
38,0.68896,0.598889,39,0.050034,82,310,711,989,0.164657,0.87948
44,0.809189,0.576079,45,0.035605,40,312,887,655,0.16478,0.643502
55,0.890466,0.665797,56,0.047123,72,309,715,728,0.164784,0.569443
49,0.588493,0.524845,50,0.050481,32,340,761,472,0.165075,0.513969
180,0.798097,0.576222,181,0.084062,69,333,500,766,0.165193,0.660387
92,0.808857,0.451993,93,0.071855,52,417,867,369,0.165211,0.546055
13,0.666552,0.474292,14,0.067878,88,402,827,990,0.165275,0.846135
255,0.777473,0.767652,256,0.082195,64,318,552,986,0.165422,0.851954
72,0.791209,0.663936,73,0.056838,88,410,889,680,0.165542,0.671157
275,0.71699,0.44811,276,0.119619,42,314,714,344,0.165626,0.588893


### 제출해보기

In [161]:
best_params = {
    'n_estimators': 711,
    'max_depth': 82,
    'learning_rate': 0.050034,
    'subsample': 0.879480,
    'colsample_bylevel': 0.688960,
    'colsample_bytree': 0.598889,
    'num_leaves': 989,
    'min_child_samples': 310,
}

In [169]:
lgbm_tun = LGBMRegressor(n_estimators=711,
                         max_depth=82,
                         learning_rate=0.050034,
                         subsample=0.879480,
                         colsample_bylevel=0.688960,
                         colsample_bytree=0.598889,
                         num_leaves=989,
                         min_child_samples=310)

X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
lgbm_tun.fit(X_train, y_train)
y_pred = lgbm_tun.predict(X_test)
np.sqrt(mean_squared_error(np.expm1(y_pred), np.expm1(y_test)))

X_sub = test[features]

lgbm_tun.fit(train, label)
y_pred = lgbm_tun.predict(X_sub)
y_pred

submission = LetSubmit(np.expm1(y_pred), "six_lgbm_tuning")
submission.head()

In [174]:
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
lgbm_tun.fit(X_train, y_train)
y_pred = lgbm_tun.predict(X_test)
np.sqrt(mean_squared_error(np.expm1(y_pred), np.expm1(y_test)))

110716.45541102542

In [175]:
X_sub = test[features]

In [176]:
lgbm_tun.fit(train, label)
y_pred = lgbm_tun.predict(X_sub)
y_pred

array([13.1560804 , 13.07160478, 14.23687242, ..., 13.05612279,
       12.61816677, 13.00301265])

In [177]:
submission = LetSubmit(np.expm1(y_pred), "six_lgbm_coarse")
submission.head()

Unnamed: 0,id,price
0,15035,494290.6
1,15036,474450.8
2,15037,1256917.0
3,15038,306866.4
4,15039,286101.1


> Submission 결과 : 129199

- 음.. 좋지 않다.

# GridSearchCV - `LightGBM`

- 위에서 Coarse / Finer Search 의 결과가 좋지 않았으므로 일단 GridSearch를 진행해본다.

In [211]:
param_grid = {
    'n_estimators': [200, 500, 800, 1000],
    'max_depth': [10, 30, 50, 80, 100],
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   16.4s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  2.9min
[Parallel(n_jobs=5)]: Done 500 out of 500 | elapsed:  3.9min finished


In [251]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200} 	RMSE: 0.1618
2: {'learning_rate': 0.1, 'max_depth': 30, 'n_estimators': 200} 	RMSE: 0.162
3: {'learning_rate': 0.1, 'max_depth': 100, 'n_estimators': 200} 	RMSE: 0.162
4: {'learning_rate': 0.1, 'max_depth': 50, 'n_estimators': 200} 	RMSE: 0.162
5: {'learning_rate': 0.1, 'max_depth': 80, 'n_estimators': 200} 	RMSE: 0.162
6: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500} 	RMSE: 0.162
7: {'learning_rate': 0.1, 'max_depth': 30, 'n_estimators': 500} 	RMSE: 0.1623
8: {'learning_rate': 0.1, 'max_depth': 50, 'n_estimators': 500} 	RMSE: 0.1623
9: {'learning_rate': 0.1, 'max_depth': 100, 'n_estimators': 500} 	RMSE: 0.1623
10: {'learning_rate': 0.1, 'max_depth': 80, 'n_estimators': 500} 	RMSE: 0.1623


- learning rate 가 0.1로 고정되어있으므로 learning rate를 좀 더 variance 하게 주는 것을 중심으로 다시 Grid Serach

In [252]:
param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [10, 20, 30, 50],
    'learning_rate': [0.01, 0.05, 0.08, 0.1, 0.12],
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    9.4s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   40.1s
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 400 out of 400 | elapsed:  1.6min finished


In [255]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 500} 	RMSE: 0.161
2: {'learning_rate': 0.08, 'max_depth': 20, 'n_estimators': 400} 	RMSE: 0.1613
3: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 400} 	RMSE: 0.1614
4: {'learning_rate': 0.08, 'max_depth': 50, 'n_estimators': 400} 	RMSE: 0.1614
5: {'learning_rate': 0.08, 'max_depth': 30, 'n_estimators': 400} 	RMSE: 0.1614
6: {'learning_rate': 0.08, 'max_depth': 20, 'n_estimators': 300} 	RMSE: 0.1614
7: {'learning_rate': 0.08, 'max_depth': 50, 'n_estimators': 300} 	RMSE: 0.1614
8: {'learning_rate': 0.08, 'max_depth': 30, 'n_estimators': 300} 	RMSE: 0.1614
9: {'learning_rate': 0.05, 'max_depth': 50, 'n_estimators': 500} 	RMSE: 0.1614
10: {'learning_rate': 0.05, 'max_depth': 30, 'n_estimators': 500} 	RMSE: 0.1614


In [256]:
param_grid = {
    'n_estimators': [300, 350, 400, 450, 500],
    'max_depth': [10, 15, 20, 25, 30, 35, 40, 45, 50],
    'learning_rate': [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12],
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    8.6s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   41.7s
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  1.7min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed:  3.0min
[Parallel(n_jobs=5)]: Done 1003 tasks      | elapsed:  4.6min
[Parallel(n_jobs=5)]: Done 1448 tasks      | elapsed:  6.6min
[Parallel(n_jobs=5)]: Done 1800 out of 1800 | elapsed:  8.0min finished


In [257]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.06, 'max_depth': 15, 'n_estimators': 500} 	RMSE: 0.1608
2: {'learning_rate': 0.06, 'max_depth': 15, 'n_estimators': 450} 	RMSE: 0.1608
3: {'learning_rate': 0.06, 'max_depth': 20, 'n_estimators': 500} 	RMSE: 0.1609
4: {'learning_rate': 0.06, 'max_depth': 50, 'n_estimators': 500} 	RMSE: 0.1609
5: {'learning_rate': 0.06, 'max_depth': 35, 'n_estimators': 500} 	RMSE: 0.1609
6: {'learning_rate': 0.06, 'max_depth': 25, 'n_estimators': 500} 	RMSE: 0.1609
7: {'learning_rate': 0.06, 'max_depth': 40, 'n_estimators': 500} 	RMSE: 0.1609
8: {'learning_rate': 0.06, 'max_depth': 45, 'n_estimators': 500} 	RMSE: 0.1609
9: {'learning_rate': 0.06, 'max_depth': 30, 'n_estimators': 500} 	RMSE: 0.1609
10: {'learning_rate': 0.06, 'max_depth': 15, 'n_estimators': 400} 	RMSE: 0.161


- learning rate 는 0.06이 최적인 것 같아 보인다. 


- 나머지 파라미터 (max_depth, n_estimators) 를 변화를 주면서 Finer Search 진행

In [274]:
param_grid = {
    'n_estimators': [450, 550],
    'max_depth': [15, 40],
    'learning_rate': [0.055, 0.065],
}

lgbm_finer = FinerSearch(train, label, param_grid, 'lgbm', 100)

Let's start				 21:10:48
1 epoch is now calculating . . .	 21:10:51
10 epoch is now calculating . . .	 21:11:10
20 epoch is now calculating . . .	 21:11:32
30 epoch is now calculating . . .	 21:11:55
40 epoch is now calculating . . .	 21:12:17
50 epoch is now calculating . . .	 21:12:40
60 epoch is now calculating . . .	 21:13:02
70 epoch is now calculating . . .	 21:13:25
80 epoch is now calculating . . .	 21:13:47
90 epoch is now calculating . . .	 21:14:10
100 epoch is now calculating . . .	 21:14:32
All Coarse Searching is Finished
time duration: 0:03:44
(100, 5)


In [298]:
lgbm_finer.head(10).describe()

Unnamed: 0,epoch,learning_rate,max_depth,n_estimators,score
count,10.0,10.0,10.0,10.0,10.0
mean,54.4,0.057212,16.5,485.2,0.159846
std,30.869618,0.001455,1.269296,16.198765,7.5e-05
min,5.0,0.055727,15.0,450.0,0.159749
25%,32.25,0.056004,16.0,483.5,0.15978
50%,49.0,0.056487,16.0,491.5,0.159845
75%,83.0,0.058558,17.0,495.75,0.159917
max,97.0,0.059654,19.0,499.0,0.159927


### WOW! `0.159749`

- score가 0.159대로 떨어졌다.

- 나머지 parameter도 최적화를 진행해보자.

In [297]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    colsample_bylevel = results.loc[i]['params']['colsample_bylevel']
    colsample_bytree = results.loc[i]['params']['colsample_bytree']
    min_child_samples = results.loc[i]['params']['min_child_samples']
    num_leaves = results.loc[i]['params']['num_leaves']
    subsample = results.loc[i]['params']['subsample']
    print("[{}] col_bylevel: {}, col_bytree: {}, m_c_s: {}, num_l: {}, subsm: {}".format(i+1, \
                                                                                         colsample_bylevel, \
                                                                                         colsample_bytree, \
                                                                                         min_child_samples, \
                                                                                         num_leaves, \
                                                                                         subsample), \
          "\t\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

[1] col_bylevel: 0.7, col_bytree: 0.7, m_c_s: 50, num_l: 300, subsm: 0.5 		RMSE: 0.1618
[2] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 300, subsm: 0.5 		RMSE: 0.1618
[3] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 300, subsm: 0.8 		RMSE: 0.1618
[4] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 300, subsm: 1 		RMSE: 0.1618
[5] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 500, subsm: 0.5 		RMSE: 0.1618
[6] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 500, subsm: 0.8 		RMSE: 0.1618
[7] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 500, subsm: 1 		RMSE: 0.1618
[8] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 800, subsm: 0.5 		RMSE: 0.1618
[9] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 800, subsm: 0.8 		RMSE: 0.1618
[10] col_bylevel: 1, col_bytree: 0.7, m_c_s: 50, num_l: 800, subsm: 1 		RMSE: 0.1618


In [306]:
param_grid = {
    'n_estimators': [450, 499],
    'max_depth': [15, 19],
    'learning_rate': [0.0557, 0.0596],
    'subsample': [0.5, 1],
    'colsample_bylevel': [0.7, 1],
    'colsample_bytree': [0.6, 0.8],
    'num_leaves': [300, 500],
    'min_child_samples': [10, 100],
}

lgbm = LGBMRegressor()
lgbm_finer_2 = FinerSearch(train, label, param_grid, 'lgbm', 300)

Let's start				 21:52:27
1 epoch is now calculating . . .	 0:00:04
30 epoch is now calculating . . .	 0:02:07
60 epoch is now calculating . . .	 0:04:09
90 epoch is now calculating . . .	 0:06:30
120 epoch is now calculating . . .	 0:08:39
150 epoch is now calculating . . .	 0:10:53
180 epoch is now calculating . . .	 0:13:05
210 epoch is now calculating . . .	 0:15:20
240 epoch is now calculating . . .	 0:17:27
270 epoch is now calculating . . .	 0:19:48
300 epoch is now calculating . . .	 0:22:18
All Coarse Searching is Finished
time duration: 0:22:18
(300, 10)


In [310]:
lgbm_finer_2.head(10).describe()

Unnamed: 0,colsample_bylevel,colsample_bytree,epoch,learning_rate,max_depth,min_child_samples,n_estimators,num_leaves,score,subsample
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.883418,0.675169,156.9,0.057366,15.6,81.8,465.8,423.1,0.159904,0.756813
std,0.051347,0.058778,72.130206,0.001321,1.074968,4.709329,11.516172,43.11857,0.000101,0.150676
min,0.810936,0.605996,50.0,0.055803,15.0,73.0,453.0,362.0,0.15966,0.511139
25%,0.832726,0.623305,103.75,0.056313,15.0,78.5,457.5,393.25,0.159877,0.653925
50%,0.895476,0.665913,165.5,0.057125,15.0,82.0,464.0,412.0,0.159931,0.780676
75%,0.9217,0.723228,187.25,0.058512,15.75,85.5,468.25,447.25,0.159962,0.86233
max,0.939318,0.768216,293.0,0.059222,18.0,88.0,491.0,491.0,0.160005,0.970658


In [312]:
param_grid = {
    'n_estimators': [450, 490],
    'max_depth': [15, 17],
    'learning_rate': [0.0558, 0.0592],
    'subsample': [0.5, 1],
    'colsample_bylevel': [0.8109, 0.9393],
    'colsample_bytree': [0.6060, 0.7682],
    'num_leaves': [350, 490],
    'min_child_samples': [70, 90],
}

lgbm = LGBMRegressor()
lgbm_finer_3 = FinerSearch(train, label, param_grid, 'lgbm', 100)

Let's start				 22:18:46
1 epoch is now calculating . . .	 0:00:03
10 epoch is now calculating . . .	 0:00:31
20 epoch is now calculating . . .	 0:01:06
30 epoch is now calculating . . .	 0:01:38
40 epoch is now calculating . . .	 0:02:10
50 epoch is now calculating . . .	 0:02:41
60 epoch is now calculating . . .	 0:03:12
70 epoch is now calculating . . .	 0:03:44
80 epoch is now calculating . . .	 0:04:16
90 epoch is now calculating . . .	 0:04:49
100 epoch is now calculating . . .	 0:05:21
All Coarse Searching is Finished
time duration: 0:05:21
(100, 10)


In [314]:
lgbm_finer_3.head(10).describe()

Unnamed: 0,colsample_bylevel,colsample_bytree,epoch,learning_rate,max_depth,min_child_samples,n_estimators,num_leaves,score,subsample
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.886023,0.641096,36.7,0.057053,15.3,83.1,465.9,406.1,0.159846,0.786443
std,0.032775,0.047574,22.430386,0.000719,0.483046,5.858517,12.591443,38.999858,3.4e-05,0.151209
min,0.838117,0.606442,10.0,0.056098,15.0,74.0,450.0,359.0,0.159776,0.543293
25%,0.857084,0.614128,18.75,0.056284,15.0,78.0,455.0,381.5,0.159836,0.657393
50%,0.888125,0.62229,29.0,0.057348,15.0,85.0,463.5,399.0,0.159843,0.846149
75%,0.918066,0.628536,59.0,0.057599,15.75,87.5,477.0,413.0,0.159864,0.870474
max,0.925448,0.737596,68.0,0.057922,16.0,89.0,484.0,476.0,0.159892,0.969138


In [315]:
param_grid = {
    'n_estimators': [450, 480],
    'max_depth': [15, 16],
    'learning_rate': [0.056, 0.0579],
    'subsample': [0.54, 0.97],
    'colsample_bylevel': [0.838, 0.925],
    'colsample_bytree': [0.6064, 0.737],
    'num_leaves': [350, 480],
    'min_child_samples': [74, 89],
}

lgbm = LGBMRegressor()
lgbm_finer_4 = FinerSearch(train, label, param_grid, 'lgbm', 100)

Let's start				 22:27:50
1 epoch is now calculating . . .	 0:00:02
10 epoch is now calculating . . .	 0:00:28
20 epoch is now calculating . . .	 0:00:57
30 epoch is now calculating . . .	 0:01:26
40 epoch is now calculating . . .	 0:01:56
50 epoch is now calculating . . .	 0:02:26
60 epoch is now calculating . . .	 0:02:56
70 epoch is now calculating . . .	 0:03:26
80 epoch is now calculating . . .	 0:03:57
90 epoch is now calculating . . .	 0:04:27
100 epoch is now calculating . . .	 0:04:57
All Coarse Searching is Finished
time duration: 0:04:57
(100, 10)


In [321]:
lgbm_finer_4.head(10)#.describe()

Unnamed: 0,colsample_bylevel,colsample_bytree,epoch,learning_rate,max_depth,min_child_samples,n_estimators,num_leaves,score,subsample
38,0.918582,0.626395,39,0.057606,15,83,465,465,0.159645,0.960575
98,0.885215,0.611295,99,0.056405,15,78,461,458,0.159659,0.753503
91,0.889167,0.616803,92,0.057896,15,76,468,389,0.159661,0.86015
80,0.864706,0.667957,81,0.057171,15,74,468,462,0.159723,0.911559
67,0.864528,0.630756,68,0.056741,15,83,450,464,0.159756,0.817356
50,0.894323,0.615414,51,0.056543,15,77,463,441,0.159766,0.929069
95,0.902519,0.633855,96,0.056623,15,87,476,455,0.159796,0.923251
64,0.920799,0.615419,65,0.057857,15,88,469,366,0.15981,0.699336
45,0.908884,0.723677,46,0.05688,15,86,472,381,0.15982,0.726196
59,0.904931,0.680647,60,0.057292,15,83,450,374,0.159831,0.71783


In [318]:
param_grid = {
    'n_estimators': [450, 476],
    'max_depth': [15, 19],
    'learning_rate': [0.056, 0.0579],
    'subsample': [0.7, 0.96],
    'colsample_bylevel': [0.86, 0.92],
    'colsample_bytree': [0.61, 0.72],
    'num_leaves': [366, 465],
    'min_child_samples': [74, 88],
}

lgbm = LGBMRegressor()
lgbm_finer_5 = FinerSearch(train, label, param_grid, 'lgbm', 100)

Let's start				 22:37:18
1 epoch is now calculating . . .	 0:00:03
10 epoch is now calculating . . .	 0:00:29
20 epoch is now calculating . . .	 0:01:00
30 epoch is now calculating . . .	 0:01:31
40 epoch is now calculating . . .	 0:02:03
50 epoch is now calculating . . .	 0:02:35
60 epoch is now calculating . . .	 0:03:06
70 epoch is now calculating . . .	 0:03:38
80 epoch is now calculating . . .	 0:04:10
90 epoch is now calculating . . .	 0:04:42
100 epoch is now calculating . . .	 0:05:13
All Coarse Searching is Finished
time duration: 0:05:13
(100, 10)


In [320]:
lgbm_finer_5.head(10)#.describe()

Unnamed: 0,colsample_bylevel,colsample_bytree,epoch,learning_rate,max_depth,min_child_samples,n_estimators,num_leaves,score,subsample
77,0.883408,0.680897,78,0.056331,15,74,452,425,0.159735,0.820913
3,0.861548,0.610949,4,0.056137,15,83,463,404,0.15974,0.713915
72,0.901237,0.697109,73,0.056988,15,74,466,464,0.159811,0.871708
35,0.884681,0.650134,36,0.056007,15,82,454,417,0.15982,0.771452
96,0.869893,0.670911,97,0.056878,16,86,462,403,0.159855,0.935893
60,0.874411,0.629861,61,0.05743,17,87,454,428,0.159859,0.925825
19,0.917199,0.620533,20,0.057452,16,84,467,453,0.15989,0.747736
88,0.860472,0.684594,89,0.057091,15,81,451,408,0.159899,0.950582
40,0.911315,0.698385,41,0.056582,15,82,458,376,0.159928,0.807325
4,0.907082,0.615837,5,0.056589,16,80,453,384,0.15993,0.824228


### 제출해보기

In [11]:
lgbm_tun = LGBMRegressor(n_estimators=452,
                         max_depth=15,
                         learning_rate=0.056331,
                         subsample=0.820913,
                         colsample_bylevel=0.883408,
                         colsample_bytree=0.680897,
                         num_leaves=425,
                         min_child_samples=74)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
lgbm_tun.fit(X_train, y_train)
y_pred = lgbm_tun.predict(X_test)
np.sqrt(mean_squared_error(np.expm1(y_pred), np.expm1(y_test)))

108567.78137669293

In [13]:
X_sub = test[features]

In [14]:
lgbm_tun.fit(train, label)
y_pred = lgbm_tun.predict(X_sub)
y_pred

array([13.13987797, 13.07027698, 14.07649284, ..., 13.0887623 ,
       12.66086141, 13.02491798])

In [28]:
submission = LetSubmit(np.expm1(y_pred), "six_lgbm_finetuning_2")
submission.head()

Unnamed: 0,id,price
0,15035,494290.6
1,15036,474450.8
2,15037,1256917.0
3,15038,306866.4
4,15039,286101.1


> Submission : 113879

- 딱히 괄목할 만한 성능 향상은 보이지 않는다..

---

### 다시 !

- 위에서 처음부터 n_estimators를 500까지로 제한해서 시작했으므로 좀 더 높여서 시작해보자

In [78]:
param_grid = {
    'n_estimators': [400, 600, 800, 1000],
    'max_depth': [10, 15, 20, 30, 40, 50],
    'learning_rate': [0.055, 0.0575, 0.06, 0.0625, 0.065]
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   15.4s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  1.2min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  2.7min
[Parallel(n_jobs=5)]: Done 600 out of 600 | elapsed:  4.5min finished


In [79]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.055, 'max_depth': 10, 'n_estimators': 600} 	RMSE: 0.1608
2: {'learning_rate': 0.06, 'max_depth': 20, 'n_estimators': 600} 	RMSE: 0.1608
3: {'learning_rate': 0.06, 'max_depth': 50, 'n_estimators': 600} 	RMSE: 0.1608
4: {'learning_rate': 0.06, 'max_depth': 30, 'n_estimators': 600} 	RMSE: 0.1608
5: {'learning_rate': 0.06, 'max_depth': 40, 'n_estimators': 600} 	RMSE: 0.1608
6: {'learning_rate': 0.055, 'max_depth': 30, 'n_estimators': 600} 	RMSE: 0.1609
7: {'learning_rate': 0.055, 'max_depth': 50, 'n_estimators': 600} 	RMSE: 0.1609
8: {'learning_rate': 0.055, 'max_depth': 40, 'n_estimators': 600} 	RMSE: 0.1609
9: {'learning_rate': 0.055, 'max_depth': 20, 'n_estimators': 600} 	RMSE: 0.1609
10: {'learning_rate': 0.0575, 'max_depth': 15, 'n_estimators': 600} 	RMSE: 0.1609


In [97]:
param_grid = {
    'n_estimators': [500, 550, 575, 600, 625, 650, 700],
    'max_depth': [10, 15, 20, 30, 40, 50],
    'learning_rate': [0.055, 0.0575, 0.06, 0.0625, 0.065]
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   13.4s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   57.4s
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  2.2min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done 1003 tasks      | elapsed:  6.2min
[Parallel(n_jobs=5)]: Done 1050 out of 1050 | elapsed:  6.5min finished


In [98]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.055, 'max_depth': 10, 'n_estimators': 600} 	RMSE: 0.1608
2: {'learning_rate': 0.055, 'max_depth': 10, 'n_estimators': 500} 	RMSE: 0.1608
3: {'learning_rate': 0.055, 'max_depth': 10, 'n_estimators': 625} 	RMSE: 0.1608
4: {'learning_rate': 0.055, 'max_depth': 10, 'n_estimators': 575} 	RMSE: 0.1608
5: {'learning_rate': 0.055, 'max_depth': 10, 'n_estimators': 650} 	RMSE: 0.1608
6: {'learning_rate': 0.055, 'max_depth': 10, 'n_estimators': 550} 	RMSE: 0.1608
7: {'learning_rate': 0.06, 'max_depth': 50, 'n_estimators': 650} 	RMSE: 0.1608
8: {'learning_rate': 0.06, 'max_depth': 40, 'n_estimators': 650} 	RMSE: 0.1608
9: {'learning_rate': 0.06, 'max_depth': 20, 'n_estimators': 650} 	RMSE: 0.1608
10: {'learning_rate': 0.06, 'max_depth': 30, 'n_estimators': 650} 	RMSE: 0.1608


In [99]:
param_grid = {
    'n_estimators': [575, 590, 605, 620, 635, 650],
    'max_depth': [5, 7, 9, 11, 13, 15, 20, 30],
    'learning_rate': [0.055, 0.0575, 0.06, 0.0625]
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   10.3s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   55.4s
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  2.2min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done 960 out of 960 | elapsed:  6.0min finished


In [108]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.055, 'max_depth': 11, 'n_estimators': 620} 	RMSE: 0.1607
2: {'learning_rate': 0.055, 'max_depth': 11, 'n_estimators': 635} 	RMSE: 0.1608
3: {'learning_rate': 0.055, 'max_depth': 11, 'n_estimators': 575} 	RMSE: 0.1608
4: {'learning_rate': 0.055, 'max_depth': 11, 'n_estimators': 650} 	RMSE: 0.1608
5: {'learning_rate': 0.055, 'max_depth': 11, 'n_estimators': 605} 	RMSE: 0.1608
6: {'learning_rate': 0.055, 'max_depth': 11, 'n_estimators': 590} 	RMSE: 0.1608
7: {'learning_rate': 0.0575, 'max_depth': 15, 'n_estimators': 635} 	RMSE: 0.1608
8: {'learning_rate': 0.06, 'max_depth': 20, 'n_estimators': 650} 	RMSE: 0.1608
9: {'learning_rate': 0.06, 'max_depth': 30, 'n_estimators': 650} 	RMSE: 0.1608
10: {'learning_rate': 0.06, 'max_depth': 20, 'n_estimators': 620} 	RMSE: 0.1608


- 좁혀진 범위 중심으로 Finer Search 진행

In [80]:
param_grid = {
    'n_estimators': [400, 800],
    'max_depth': [10, 40],
    'learning_rate': [0.05, 0.06],
}

lgbm_finer = FinerSearch(train, label, param_grid, 'lgbm', 500)

Let's start				
1 epoch is now calculating . . .	 0:00:03
50 epoch is now calculating . . .	 0:02:13
100 epoch is now calculating . . .	 0:04:23
150 epoch is now calculating . . .	 0:06:39
200 epoch is now calculating . . .	 0:08:48
250 epoch is now calculating . . .	 0:10:58
300 epoch is now calculating . . .	 0:13:13
350 epoch is now calculating . . .	 0:15:33
400 epoch is now calculating . . .	 0:17:46
450 epoch is now calculating . . .	 0:19:54
500 epoch is now calculating . . .	 0:22:05
All Coarse Searching is Finished
time duration: 0:22:05
(500, 5)


In [81]:
lgbm_finer.head(10)#.describe()

Unnamed: 0,epoch,learning_rate,max_depth,n_estimators,score
445,446,0.058623,28,744,0.159368
189,190,0.053025,16,746,0.159479
313,314,0.058599,30,578,0.15952
77,78,0.053853,21,646,0.159523
260,261,0.054217,29,791,0.159544
48,49,0.055531,11,660,0.159562
425,426,0.050321,28,659,0.159569
322,323,0.053776,28,626,0.159595
288,289,0.055727,32,536,0.159613
40,41,0.052281,10,584,0.159625


In [93]:
lgbm_finer.head(10).describe()

Unnamed: 0,epoch,learning_rate,max_depth,n_estimators,score
count,10.0,10.0,10.0,10.0,10.0
mean,241.7,0.054595,23.3,657.0,0.15954
std,147.962495,0.002622,8.206366,82.159736,7.5e-05
min,41.0,0.050321,10.0,536.0,0.159368
25%,106.0,0.053213,17.25,594.5,0.159521
50%,275.0,0.054035,28.0,652.5,0.159553
75%,320.75,0.055678,28.75,723.0,0.159589
max,446.0,0.058623,32.0,791.0,0.159625


In [82]:
lgbm_tun = LGBMRegressor(n_estimators=744,
                         max_depth=28,
                         learning_rate=0.058623)

In [83]:
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
lgbm_tun.fit(X_train, y_train)
y_pred = lgbm_tun.predict(X_test)
np.sqrt(mean_squared_error(np.expm1(y_pred), np.expm1(y_test)))

109039.72709429951

In [84]:
X_sub = test[features]

In [85]:
lgbm_tun.fit(train, label)
y_pred = lgbm_tun.predict(X_sub)
y_pred

array([13.15509942, 13.08171598, 14.06585446, ..., 13.0966762 ,
       12.71298993, 13.00234827])

In [86]:
submission = LetSubmit(np.expm1(y_pred), "six_lgbm_finetuning_3")
submission.head()

Unnamed: 0,id,price
0,15035,494290.6
1,15036,474450.8
2,15037,1256917.0
3,15038,306866.4
4,15039,286101.1


> Submission : 111952

In [102]:
lgbm_tun = LGBMRegressor(n_estimators=746,
                         max_depth=16,
                         learning_rate=0.053)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
lgbm_tun.fit(X_train, y_train)
y_pred = lgbm_tun.predict(X_test)
np.sqrt(mean_squared_error(np.expm1(y_pred), np.expm1(y_test)))

131105.5425702168

In [104]:
X_sub = test[features]

In [105]:
lgbm_tun.fit(train, label)
y_pred = lgbm_tun.predict(X_sub)
y_pred

array([13.12295718, 13.08707634, 14.08961966, ..., 13.08480778,
       12.71580981, 12.98786158])

In [106]:
submission = LetSubmit(np.expm1(y_pred), "six_lgbm_finetuning_4")
submission.head()

Unnamed: 0,id,price
0,15035,494290.6
1,15036,474450.8
2,15037,1256917.0
3,15038,306866.4
4,15039,286101.1


> Submission : 111247

## 다시 2

- 안되겠다. Grid Search를 죄다 몽땅 줘버려서 찾아보자

In [110]:
param_grid = {
    'n_estimators': [300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000],
    'max_depth': [5, 7, 9, 11, 13, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80],
    'learning_rate': [0.03, 0.04, 0.05, 0.055, 0.06, 0.065, 0.07, 0.08]
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 1920 candidates, totalling 9600 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    8.6s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  1.0min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  2.6min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed:  4.8min
[Parallel(n_jobs=5)]: Done 1003 tasks      | elapsed:  7.6min
[Parallel(n_jobs=5)]: Done 1448 tasks      | elapsed: 10.8min
[Parallel(n_jobs=5)]: Done 1975 tasks      | elapsed: 14.7min
[Parallel(n_jobs=5)]: Done 2582 tasks      | elapsed: 19.2min
[Parallel(n_jobs=5)]: Done 3271 tasks      | elapsed: 24.1min
[Parallel(n_jobs=5)]: Done 4040 tasks      | elapsed: 29.5min
[Parallel(n_jobs=5)]: Done 4891 tasks      | elapsed: 35.4min
[Parallel(n_jobs=5)]: Done 5822 tasks      | elapsed: 41.8min
[Parallel(n_jobs=5)]: Done 6835 tasks      | elapsed: 48.7min
[Parallel(n_jobs=5)]: Done 7928 tasks      | elapsed: 57.5min
[Parallel(n_jobs=5)]: Done 9103 tasks      | elapsed: 67.2min

In [113]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(20):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.04, 'max_depth': 15, 'n_estimators': 900} 	RMSE: 0.1605
2: {'learning_rate': 0.04, 'max_depth': 15, 'n_estimators': 850} 	RMSE: 0.1605
3: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 800} 	RMSE: 0.1605
4: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 850} 	RMSE: 0.1605
5: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 900} 	RMSE: 0.1606
6: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 950} 	RMSE: 0.1606
7: {'learning_rate': 0.04, 'max_depth': 15, 'n_estimators': 800} 	RMSE: 0.1606
8: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 1000} 	RMSE: 0.1606
9: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 750} 	RMSE: 0.1606
10: {'learning_rate': 0.04, 'max_depth': 15, 'n_estimators': 950} 	RMSE: 0.1606
11: {'learning_rate': 0.04, 'max_depth': 15, 'n_estimators': 750} 	RMSE: 0.1606
12: {'learning_rate': 0.03, 'max_depth': 9, 'n_estimators': 1000} 	RMSE: 0.1607
13: {'learning_rate': 0.04, 'max_depth': 13, 'n_estima

In [115]:
param_grid = {
    'n_estimators': [800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200],
    'max_depth': [7, 9, 11, 13, 15, 17, 19, 21],
    'learning_rate': [0.03, 0.035, 0.04, 0.045, 0.05]
}

lgbm = LGBMRegressor()
grid_lgbm, params_df, rmsle = BestParamsGrid(lgbm, train, label, param_grid)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   26.2s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  2.0min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed:  4.4min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed:  7.6min
[Parallel(n_jobs=5)]: Done 1003 tasks      | elapsed: 11.9min
[Parallel(n_jobs=5)]: Done 1448 tasks      | elapsed: 16.9min
[Parallel(n_jobs=5)]: Done 1800 out of 1800 | elapsed: 21.0min finished


In [116]:
results = pd.DataFrame(grid_lgbm.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.035, 'max_depth': 7, 'n_estimators': 900} 	RMSE: 0.1605
2: {'learning_rate': 0.035, 'max_depth': 7, 'n_estimators': 950} 	RMSE: 0.1605
3: {'learning_rate': 0.04, 'max_depth': 15, 'n_estimators': 900} 	RMSE: 0.1605
4: {'learning_rate': 0.04, 'max_depth': 15, 'n_estimators': 850} 	RMSE: 0.1605
5: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 800} 	RMSE: 0.1605
6: {'learning_rate': 0.035, 'max_depth': 11, 'n_estimators': 950} 	RMSE: 0.1605
7: {'learning_rate': 0.035, 'max_depth': 11, 'n_estimators': 900} 	RMSE: 0.1605
8: {'learning_rate': 0.04, 'max_depth': 9, 'n_estimators': 850} 	RMSE: 0.1605
9: {'learning_rate': 0.035, 'max_depth': 11, 'n_estimators': 1000} 	RMSE: 0.1605
10: {'learning_rate': 0.035, 'max_depth': 7, 'n_estimators': 1000} 	RMSE: 0.1606


In [120]:
lgbm_tun = LGBMRegressor(n_estimators=900,
                         max_depth=7,
                         learning_rate=0.035)

In [121]:
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
lgbm_tun.fit(X_train, y_train)
y_pred = lgbm_tun.predict(X_test)
np.sqrt(mean_squared_error(np.expm1(y_pred), np.expm1(y_test)))

110750.3796780717

In [122]:
X_sub = test[features]

In [123]:
lgbm_tun.fit(train, label)
y_pred = lgbm_tun.predict(X_sub)
y_pred

array([13.13494496, 13.10383562, 14.08974416, ..., 13.08982018,
       12.6919973 , 12.97130319])

In [124]:
submission = LetSubmit(np.expm1(y_pred), "lgbm_finetuning_5")
submission.head()

Unnamed: 0,id,price
0,15035,494290.6
1,15036,474450.8
2,15037,1256917.0
3,15038,306866.4
4,15039,286101.1


> Submission : 110482

- 드디어 성능 향상이....ㅠ.ㅠ

- LGBM 최고 점수 : `110482`로 마무리

---

# GridSearchCV - `XGBoost`

- XGBoost도 파라미터 튜닝을 해보자

In [42]:
param_grid = {
    'n_estimators': [200, 500, 800, 1000],
    'max_depth': [10, 30, 50, 80, 100],
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
}

xgb = XGBRegressor()
grid_xgb, params_df, rmsle = BestParamsGrid(xgb, train, label, param_grid)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   46.8s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  9.5min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed: 27.6min
[Parallel(n_jobs=5)]: Done 500 out of 500 | elapsed: 30.5min finished


In [43]:
results = pd.DataFrame(grid_xgb.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000} 	RMSE: 0.1623
2: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 800} 	RMSE: 0.1634
3: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200} 	RMSE: 0.1638
4: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500} 	RMSE: 0.1641
5: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 800} 	RMSE: 0.1642
6: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 1000} 	RMSE: 0.1642
7: {'learning_rate': 0.01, 'max_depth': 50, 'n_estimators': 1000} 	RMSE: 0.1656
8: {'learning_rate': 0.01, 'max_depth': 80, 'n_estimators': 1000} 	RMSE: 0.1657
9: {'learning_rate': 0.01, 'max_depth': 100, 'n_estimators': 1000} 	RMSE: 0.1657
10: {'learning_rate': 0.01, 'max_depth': 30, 'n_estimators': 1000} 	RMSE: 0.1659


- 많이 줄었네...

In [44]:
param_grid = {
    'n_estimators': [500, 600, 700, 800, 900, 1000],
    'max_depth': [10, 20, 30, 40, 50, 60],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
}

xgb = XGBRegressor()
grid_xgb, params_df, rmsle = BestParamsGrid(xgb, train, label, param_grid)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  2.9min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 29.0min
[Parallel(n_jobs=5)]: Done 355 tasks      | elapsed: 64.0min
[Parallel(n_jobs=5)]: Done 638 tasks      | elapsed: 92.2min
[Parallel(n_jobs=5)]: Done 900 out of 900 | elapsed: 110.9min finished


In [45]:
results = pd.DataFrame(grid_xgb.cv_results_)
results['RMSE'] = np.sqrt(-1 * results.mean_test_score)
results = results.sort_values('RMSE').reset_index(inplace=False)
for i in range(10):
    print("{}:".format(i+1), results.loc[i]['params'], \
          "\tRMSE:", np.round(results.loc[i]['RMSE'], 4))

1: {'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 800} 	RMSE: 0.1618
2: {'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 600} 	RMSE: 0.1619
3: {'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 700} 	RMSE: 0.1619
4: {'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 900} 	RMSE: 0.1619
5: {'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 1000} 	RMSE: 0.1619
6: {'learning_rate': 0.03, 'max_depth': 10, 'n_estimators': 500} 	RMSE: 0.162
7: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 500} 	RMSE: 0.1622
8: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000} 	RMSE: 0.1623
9: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 600} 	RMSE: 0.1623
10: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 700} 	RMSE: 0.1624


In [49]:
param_grid = {
    'n_estimators': [700, 900],
    'max_depth': [7, 13],
    'learning_rate': [0.02, 0.04],
}

xgb_finer_1 = FinerSearch(train, label, param_grid, 'xgb', 300)

Let's start				
1 epoch is now calculating . . .	 0:00:59
30 epoch is now calculating . . .	 0:26:26
60 epoch is now calculating . . .	 0:53:49
90 epoch is now calculating . . .	 1:19:52
120 epoch is now calculating . . .	 1:46:15
150 epoch is now calculating . . .	 2:12:40
180 epoch is now calculating . . .	 2:38:49
210 epoch is now calculating . . .	 3:04:50
240 epoch is now calculating . . .	 3:29:12
270 epoch is now calculating . . .	 3:54:20
300 epoch is now calculating . . .	 4:19:53
All Coarse Searching is Finished
time duration: 4:19:53
(300, 5)


In [50]:
xgb_finer_1.head()

Unnamed: 0,epoch,learning_rate,max_depth,n_estimators,score
108,109,0.03939,7,793,0.15934
240,241,0.033561,7,813,0.159378
281,282,0.028396,8,778,0.159389
84,85,0.033804,8,872,0.159414
95,96,0.039124,7,765,0.159415


In [60]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', \
            'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', \
            'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'year_month']

train = data[features]
data['log_price'] = np.log1p(data.price)
label = data['log_price']

In [61]:
xgb_tun = XGBRegressor(n_estimators=793,
                       max_depth=7,
                       learning_rate=0.03939)

In [62]:
X_sub = test[features]

In [63]:
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2)
xgb_tun.fit(X_train, y_train)
y_pred = xgb_tun.predict(X_test)
np.sqrt(mean_squared_error(np.expm1(y_pred), np.expm1(y_test)))

123667.24174846783

In [65]:
xgb_tun.fit(train, label)
y_pred = xgb_tun.predict(X_sub)
y_pred

array([13.168792, 13.01605 , 14.110845, ..., 13.082218, 12.68552 ,
       13.031048], dtype=float32)

In [66]:
submission = LetSubmit(np.expm1(y_pred), "xgb_finetuning")
submission.head()

Unnamed: 0,id,price
0,15035,494290.6
1,15036,474450.8
2,15037,1256917.0
3,15038,306866.4
4,15039,286101.1


> Submission : 107728

- 헉 LGBM보다 더 좋은 성능을 보인다. 
- XGB 최고 점수 : `107728` 로 마무리