# 아파트 실거래가 예측 Competition

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import warnings
warnings.filter

# Data Import

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('submission.csv')

In [3]:
train.head()

Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,transaction_date,floor,transaction_real_price
0,0,7622,서울특별시,신교동,6-13,신현(101동),신교동 6-13 신현(101동),84.82,2002,200801,21~31,2,37500
1,1,5399,서울특별시,필운동,142,사직파크맨션,필운동 142 사직파크맨션,99.17,1973,200801,1~10,6,20000
2,2,3578,서울특별시,필운동,174-1,두레엘리시안,필운동 174-1 두레엘리시안,84.74,2007,200801,1~10,6,38500
3,3,10957,서울특별시,내수동,95,파크팰리스,내수동 95 파크팰리스,146.39,2003,200801,11~20,15,118000
4,4,10639,서울특별시,내수동,110-15,킹스매너,내수동 110-15 킹스매너,194.43,2004,200801,21~31,3,120000


In [4]:
test.head()

Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,transaction_date,floor
0,1145756,10453,서울특별시,목동,938,청학,목동 938 청학,35.55,2002,201711,11~20,2
1,1198704,989,부산광역시,초량동,1143-8,고관맨션,초량동 1143-8 고관맨션,68.72,1977,201708,21~31,2
2,1222384,8597,부산광역시,괴정동,447-13,우림그린,괴정동 447-13 우림그린,72.54,1989,201710,11~20,2
3,1179897,11086,서울특별시,대치동,1007-2,풍림아이원4차(1007-2),대치동 1007-2 풍림아이원4차(1007-2),111.54,2004,201707,1~10,10
4,1223091,2121,부산광역시,다대동,1670,다대롯데캐슬블루,다대동 1670 다대롯데캐슬블루,119.6398,2014,201712,11~20,21


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1216553 entries, 0 to 1216552
Data columns (total 13 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   transaction_id          1216553 non-null  int64  
 1   apartment_id            1216553 non-null  int64  
 2   city                    1216553 non-null  object 
 3   dong                    1216553 non-null  object 
 4   jibun                   1216553 non-null  object 
 5   apt                     1216553 non-null  object 
 6   addr_kr                 1216553 non-null  object 
 7   exclusive_use_area      1216553 non-null  float64
 8   year_of_completion      1216553 non-null  int64  
 9   transaction_year_month  1216553 non-null  int64  
 10  transaction_date        1216553 non-null  object 
 11  floor                   1216553 non-null  int64  
 12  transaction_real_price  1216553 non-null  int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 120.7+ M

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5463 entries, 0 to 5462
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   transaction_id          5463 non-null   int64  
 1   apartment_id            5463 non-null   int64  
 2   city                    5463 non-null   object 
 3   dong                    5463 non-null   object 
 4   jibun                   5463 non-null   object 
 5   apt                     5463 non-null   object 
 6   addr_kr                 5463 non-null   object 
 7   exclusive_use_area      5463 non-null   float64
 8   year_of_completion      5463 non-null   int64  
 9   transaction_year_month  5463 non-null   int64  
 10  transaction_date        5463 non-null   object 
 11  floor                   5463 non-null   int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 512.3+ KB


***
# Making Features

In [7]:
train['city_dong'] = train.city + train.dong
test['city_dong'] = test.city + test.dong

In [8]:
tr_n_apt = train.groupby('city_dong').apt.nunique().reset_index(name = 'n_apt')

In [9]:
train = pd.merge(train, tr_n_apt, how = 'left', on = 'city_dong')
test = pd.merge(test, tr_n_apt, how = 'left', on = 'city_dong')

In [11]:
tr_skew_cnt = train.groupby('city_dong').transaction_year_month.skew().reset_index(name = 'skew_cnt')
te_skew_cnt = test.groupby('city_dong').transaction_year_month.skew().reset_index(name = 'skew_cnt')

In [12]:
train = pd.merge(train, tr_skew_cnt, how = 'left', on = 'city_dong')
test = pd.merge(test, tr_skew_cnt, how = 'left', on = 'city_dong')

In [14]:
y = np.log1p(train.transaction_real_price)

In [15]:
kf = KFold(n_splits = 20, random_state = 523, shuffle = True)

In [16]:
def get_date(x) :
    if x == '1-10' :
        return '월초'
    elif x == '11-20' :
        return '중순'
    else :
        return '월말'

In [17]:
train['transaction_date'] = train['transaction_date'].apply(get_date)
test['transaction_date'] = test['transaction_date'].apply(get_date)

# Modeling

In [19]:
X = train[['n_apt', 'city', 'exclusive_use_area', 'year_of_completion', 'transaction_year_month', 'transaction_date', 'floor']]

In [20]:
X = pd.get_dummies(columns = ['city', 'transaction_date'], data = X)

In [21]:
target = test[['n_apt', 'city', 'exclusive_use_area', 'year_of_completion', 'transaction_year_month', 'transaction_date', 'floor']]

In [22]:
target = pd.get_dummies(columns = ['city', 'transaction_date'], data = target)

## 1) LGBMRegressor

In [23]:
lgbm = LGBMRegressor(random_state = 524, max_depth = 5, n_estimators = 10000, learning_rate = 0.08, objective = 'rmse')

In [24]:
rmse_list = []
lgbm_pred = np.zeros((target.shape[0]))
i = 0
for tr_idx, val_idx in kf.split(X, y) :
    i += 1
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 2000, early_stopping_rounds = 500)
    pred = [0 if x <0 else x for x in lgbm.predict(val_x)]
    sub_pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(target)]) / 20
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), np.expm1(pred)))
    print(f'{i}FOLD Training....val_RMSE : {rmse}\n')
    rmse_list.append(rmse)
    lgbm_pred += sub_pred
print(f'\n{lgbm.__class__.__name__}의 20FOLD 평균 RMSE는 {np.mean(rmse_list)}')

Training until validation scores don't improve for 500 rounds
[2000]	training's rmse: 0.142531	valid_1's rmse: 0.142743
[4000]	training's rmse: 0.125747	valid_1's rmse: 0.126497
[6000]	training's rmse: 0.116494	valid_1's rmse: 0.117551
[8000]	training's rmse: 0.110018	valid_1's rmse: 0.111678
[10000]	training's rmse: 0.105502	valid_1's rmse: 0.10774
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.105502	valid_1's rmse: 0.10774
1FOLD Training....val_RMSE : 5379.210391096527

Training until validation scores don't improve for 500 rounds
[2000]	training's rmse: 0.142208	valid_1's rmse: 0.144335
[4000]	training's rmse: 0.125385	valid_1's rmse: 0.128741
[6000]	training's rmse: 0.116218	valid_1's rmse: 0.120413
[8000]	training's rmse: 0.109841	valid_1's rmse: 0.11487
[10000]	training's rmse: 0.105391	valid_1's rmse: 0.111134
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.105391	valid_1's rmse: 0.111134
2FOLD Training....val_RMSE : 5590.0

KeyboardInterrupt: 

***
## 2) XGBRegressor

In [73]:
xgb = XGBRegressor(random_state = 523, max_depth = 4, n_estimators = 5000, n_jobs = -1)

In [74]:
rmse_list = []
xgb_pred = np.zeros((target.shape[0]))
i = 0
for tr_idx, val_idx in kf.split(X, y) :
    i += 1
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 500, early_stopping_rounds = 500)
    pred = [0 if x <0 else x for x in xgb.predict(val_x)]
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(target)]) / 20
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), np.expm1(pred)))
    print(f'{i}FOLD Training....val_RMSE : {rmse}\n')
    rmse_list.append(rmse)
    xgb_pred += sub_pred
print(f'\n{xgb.__class__.__name__}의 20FOLD 평균 RMSE는 {np.mean(rmse_list)}')

[0]	validation_0-rmse:6.88625	validation_1-rmse:6.88791
[499]	validation_0-rmse:0.16158	validation_1-rmse:0.16167
1FOLD Training....val_RMSE : 8329.068659735867

[0]	validation_0-rmse:6.88644	validation_1-rmse:6.88361
[499]	validation_0-rmse:0.16251	validation_1-rmse:0.16428
2FOLD Training....val_RMSE : 8168.7591342072965

[0]	validation_0-rmse:6.88637	validation_1-rmse:6.88600
[499]	validation_0-rmse:0.16271	validation_1-rmse:0.16251
3FOLD Training....val_RMSE : 8216.473956070318

[0]	validation_0-rmse:6.88636	validation_1-rmse:6.88619
[499]	validation_0-rmse:0.16323	validation_1-rmse:0.16469
4FOLD Training....val_RMSE : 8258.794813674074

[0]	validation_0-rmse:6.88632	validation_1-rmse:6.88600
[499]	validation_0-rmse:0.16227	validation_1-rmse:0.16497
5FOLD Training....val_RMSE : 8074.480956576757

[0]	validation_0-rmse:6.88634	validation_1-rmse:6.88580
[499]	validation_0-rmse:0.16298	validation_1-rmse:0.16250
6FOLD Training....val_RMSE : 8354.857356644674

[0]	validation_0-rmse:6.886

***
## 3) CatboostRegressor

In [77]:
cb = CatBoostRegressor(random_state = 521, depth = 5, silent = True)

In [78]:
rmse_list = []
cb_pred = np.zeros((target.shape[0]))
i = 0
for tr_idx, val_idx in kf.split(X, y) :
    i += 1
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], silent = True)
    pred = [0 if x < 0 else x for x in cb.predict(val_x)]
    sub_pred = np.expm1([0 if x < 0 else x for x in cb.predict(target)]) / 20
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), np.expm1(pred)))
    print(f'{i}FOLD Training....val_RMSE : {rmse}')
    rmse_list.append(rmse)
    cb_pred += sub_pred

1FOLD Training....val_RMSE : 9088.919768530259
2FOLD Training....val_RMSE : 8974.30998309961
3FOLD Training....val_RMSE : 8964.010301587477
4FOLD Training....val_RMSE : 8847.794171641402
5FOLD Training....val_RMSE : 8512.11998616486
6FOLD Training....val_RMSE : 8993.779218473599
7FOLD Training....val_RMSE : 8812.846501582027
8FOLD Training....val_RMSE : 8712.505799811921
9FOLD Training....val_RMSE : 8997.157629391497
10FOLD Training....val_RMSE : 8812.748758898304
11FOLD Training....val_RMSE : 8929.810385992148
12FOLD Training....val_RMSE : 8968.783194827876
13FOLD Training....val_RMSE : 8737.814645453765
14FOLD Training....val_RMSE : 8953.545493717427
15FOLD Training....val_RMSE : 9014.819633608759
16FOLD Training....val_RMSE : 8792.936816476884
17FOLD Training....val_RMSE : 9228.731363571842
18FOLD Training....val_RMSE : 8833.910935562773
19FOLD Training....val_RMSE : 9142.293384685674
20FOLD Training....val_RMSE : 8705.097317365044


# Submission

In [45]:
submission['transaction_real_price'] = lgbm_pred# * .5 + xgb_pred  * .25 + cb_pred * .25

In [46]:
submission.to_csv('0523.csv', index = False)

In [47]:
lgbm_pred

array([26406.23523887, 15760.69432933, 11796.70560867, ...,
       60536.97592734, 42734.10626386, 17822.53048972])

lgbm = 5,442.02675