In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## 피쳐 전처리

In [14]:
from sklearn.preprocessing import StandardScaler

# Shared module-level scaler: preprocess_data() fits it when called with
# is_train=True and reuses the fitted instance to transform every call.
scaler = StandardScaler()

def remove_outlier(data_df, max_minimum_nights=400, max_price=5000):
    """Return a copy of ``data_df`` without the rows flagged as outliers.

    A row is removed when its ``minimum_nights`` is greater than
    ``max_minimum_nights`` or its ``price`` is greater than ``max_price``.
    Rows with a missing value in either column are kept, because a comparison
    against NaN is never true.

    Rows are selected with a boolean mask instead of being dropped by index
    label, so only the offending rows disappear even when the index contains
    duplicate labels.

    Args:
        data_df: DataFrame with ``minimum_nights`` and ``price`` columns.
        max_minimum_nights: Largest ``minimum_nights`` value that is kept.
        max_price: Largest ``price`` value that is kept.

    Returns:
        A new DataFrame holding the surviving rows with their original index
        labels. The input frame is not modified.
    """
    too_many_nights = data_df["minimum_nights"] > max_minimum_nights
    too_expensive = data_df["price"] > max_price
    return data_df.loc[~(too_many_nights | too_expensive)].copy()

def preprocess_data(X, is_train):
    """Convert raw listing features into the model-ready feature matrix.

    Steps, in order:
      1. drop the columns that are not used as features,
      2. one-hot encode ``neighbourhood_group`` and ``room_type`` (the first
         level of each is dropped),
      3. apply ``log1p`` to ``minimum_nights``,
      4. drop ``reviews_per_month`` and apply ``log1p`` to ``number_of_reviews``,
      5. standardize the numeric columns with the module-level ``scaler``.

    The category levels and the scaler statistics are learned only when
    ``is_train`` is true. Later calls reuse them, so every split gets the same
    dummy columns in the same order even if a level is absent from that split.
    A value that was not seen during the training call is encoded as all
    zeros for that feature.

    Args:
        X: DataFrame of raw features (without the target). It is not modified.
        is_train: When true, learn the category levels and fit ``scaler`` on
            ``X``. When false, reuse what the latest training call learned.
            If no training call has happened yet, the levels are inferred
            from ``X`` itself.

    Returns:
        A new DataFrame with the transformed features.
    """
    X = X.copy()

    # Remove features that are not used by the model.
    X = X.drop(columns=[
        "id",
        "name",
        "host_id",
        "host_name",
        "calculated_host_listings_count",
        "neighbourhood",
        "availability_365",
        "last_review",
    ])

    # neighbourhood_group, room_type: one-hot encoding.
    # The levels are remembered on the function object during the training
    # call so that other splits are encoded against the same level list.
    categorical_features = ["neighbourhood_group", "room_type"]
    learned_levels = getattr(preprocess_data, "category_levels_", {})
    if is_train:
        learned_levels = {
            col: pd.Categorical(X[col]).categories for col in categorical_features
        }
        preprocess_data.category_levels_ = learned_levels
    for col in categorical_features:
        if col in learned_levels:
            # A categorical column makes get_dummies emit one column per known
            # level (observed or not), so drop_first always drops the same level.
            X[col] = pd.Categorical(X[col], categories=learned_levels[col])
        X = pd.get_dummies(X, columns=[col], drop_first=True)

    # minimum_nights: log transform.
    X["minimum_nights"] = np.log1p(X["minimum_nights"])

    # number_of_reviews, reviews_per_month: keep only number_of_reviews
    # (dimensionality reduction) and log-transform it.
    X = X.drop(columns=["reviews_per_month"])
    X["number_of_reviews"] = np.log1p(X["number_of_reviews"])

    # Feature scaling: fit on the training data only, then transform.
    numeric_features = X.select_dtypes("number").columns.tolist()
    if is_train:
        scaler.fit(X[numeric_features])
    X[numeric_features] = scaler.transform(X[numeric_features])

    return X

In [15]:
# Read the three CSV files: training rows (features + price), test features,
# and the test targets.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
y_test_df = pd.read_csv("./data/y_test.csv")

# Train split: outliers are removed first, so the target is taken from the
# filtered frame and stays aligned with the features.
train_df = remove_outlier(train_df)
y_train = train_df["price"]
X_train = preprocess_data(train_df.drop(columns=["price"]), is_train=True)

# Test split: no outlier removal; transformed with is_train=False so the
# scaler fitted on the training data is reused.
X_test = preprocess_data(test_df.copy(), is_train=False)
y_test = y_test_df.copy()["price"]

# Cell output: shapes of the four arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((39092, 10), (39092,), (9779, 10), (9779,))

## baseline model과 비교

In [16]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Baseline: XGBoost regressor with a fixed seed and otherwise default
# settings, trained on the preprocessed training split.
model = XGBRegressor(random_state=2022).fit(X_train, y_train)

# Cell output: mean squared error on the test split.
pred = model.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred)

48387.558380601826

## Model 최적화

In [17]:
from pycaret.regression import *

# Initialise the pycaret experiment on the preprocessed training features,
# with an 80% train split, 5 folds and a fixed session id.
reg = setup(
    data=X_train,
    target=y_train,
    train_size=0.8,
    fold=5,
    n_jobs=-1,
    session_id=42,
)

# Rank the candidate regressors by MSE and keep the three best.
best_models = compare_models(n_select=3, sort="mse")
best_models

Unnamed: 0,Description,Value
0,Session id,42
1,Target,price
2,Target type,Regression
3,Original data shape,"(39092, 11)"
4,Transformed data shape,"(39092, 11)"
5,Transformed train set shape,"(31273, 11)"
6,Transformed test set shape,"(7819, 11)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,63.8119,25624.3979,159.0593,0.185,0.4998,0.4489,2.518
gbr,Gradient Boosting Regressor,64.3448,25830.9496,159.6538,0.1793,0.503,0.4602,1.258
ridge,Ridge Regression,68.1503,26462.6249,161.5956,0.1592,0.5686,0.5134,0.02
lar,Least Angle Regression,68.1557,26462.6279,161.5957,0.1592,0.5688,0.5134,0.024
lr,Linear Regression,68.1557,26462.6284,161.5957,0.1592,0.5688,0.5134,0.026
br,Bayesian Ridge,68.1326,26462.8053,161.5956,0.1592,0.5674,0.5134,0.026
lasso,Lasso Regression,68.2246,26626.4265,162.0854,0.1542,0.5627,0.5307,0.032
llar,Lasso Least Angle Regression,68.2247,26626.4464,162.0855,0.1542,0.5627,0.5307,0.022
rf,Random Forest Regressor,66.5809,27161.5859,164.0465,0.1295,0.5111,0.465,2.352
huber,Huber Regressor,60.617,27692.2049,165.3004,0.1203,0.5066,0.3709,0.084


[LGBMRegressor(n_jobs=-1, random_state=42),
 GradientBoostingRegressor(random_state=42),
 Ridge(random_state=42)]

In [18]:
# Tune each selected candidate with tune_model's default settings. As the
# log in this cell's output notes, the original estimator is returned when
# tuning does not improve on it.
best_tuned_models = list(map(tune_model, best_models))

# Blend the tuned candidates into one estimator and finalize it.
blended_model = blend_models(best_tuned_models)
model = finalize_model(blended_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,63.2083,25772.797,160.5391,0.1769,0.5038,0.4431
1,62.7859,23920.914,154.6639,0.2017,0.4972,0.4523
2,66.2574,31032.0666,176.1592,0.1874,0.4959,0.4498
3,62.2274,15649.822,125.0992,0.24,0.4977,0.4702
4,64.1561,29658.6225,172.2168,0.1874,0.4883,0.4428
Mean,63.727,25206.8444,157.7356,0.1987,0.4966,0.4516
Std,1.4136,5423.3226,18.0641,0.0221,0.0049,0.01


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,63.8278,26033.6397,161.3494,0.1686,0.5075,0.4496
1,63.0549,24248.1523,155.7182,0.1908,0.4985,0.4547
2,66.5684,31613.5608,177.802,0.1722,0.4979,0.452
3,62.0691,16113.9815,126.9409,0.2174,0.4971,0.4663
4,64.3011,30302.336,174.0757,0.1698,0.4911,0.4424
Mean,63.9643,25662.3341,159.1772,0.1838,0.4984,0.453
Std,1.5055,5481.8889,18.0262,0.0187,0.0052,0.0078


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,68.4399,26489.4991,162.7559,0.1541,0.5791,0.519
1,67.0566,25141.4936,158.5607,0.161,0.5724,0.5121
2,70.5035,32905.098,181.3976,0.1383,0.5642,0.505
3,66.1774,16459.0377,128.2928,0.2007,0.571,0.528
4,68.4986,31318.7719,176.9711,0.1419,0.5513,0.5027
Mean,68.1352,26462.7801,161.5956,0.1592,0.5676,0.5134
Std,1.4717,5777.6377,18.6985,0.0223,0.0094,0.0093


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,63.6614,25664.6016,160.2018,0.1804,0.5068,0.4561
1,62.6479,23990.4541,154.8885,0.1994,0.498,0.4573
2,66.3405,31428.638,177.2812,0.177,0.4987,0.4552
3,61.9203,15602.8724,124.9115,0.2423,0.4984,0.4723
4,64.2029,30017.7654,173.2564,0.1776,0.4908,0.4488
Mean,63.7546,25340.8663,158.1079,0.1953,0.4986,0.4579
Std,1.5157,5580.1665,18.514,0.0249,0.0051,0.0078


In [19]:
# Cell output: show the finalized blended model built in the previous cell.
model

In [20]:
from sklearn.metrics import mean_squared_error

# Score the finalized model on both splits with mean squared error.
# The train split is scored first, so y_pred ends up holding the test
# predictions, exactly as before.
for score_label, eval_features, eval_target in (
    ("Train data score:", X_train, y_train),
    ("Test data score:", X_test, y_test),
):
    # predict_model returns a frame; predictions are read from its
    # "prediction_label" column.
    y_pred = predict_model(model, data=eval_features)["prediction_label"]
    print(score_label, mean_squared_error(eval_target, y_pred))

Train data score: 25328.76009025393


Test data score: 47520.259942760786


---

- Baseline: 54244.32571482477 (참고: 위 `In [16]` 셀의 XGBoost baseline 출력값은 48387.558380601826으로 이 값과 다르므로, 어느 실험의 결과인지 확인 필요)
- Final: 47520.259942760786

과제를 진행하면서 다양한 방법을 적용해 보았는데, 특히 pycaret을 처음 사용하여 모델 최적화를 시도해 보았다.  
다만 피쳐 엔지니어링 방법이 적절한지에 대해서는 아직 의문이 든다.  
여러 코드를 살펴보면서 방법을 비교해 봐야 할 것 같다.