# 아파트 실거래가 예측 Competition

In [104]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

# Data Import

In [105]:
# Load the training data, the test data and the submission template
# from CSV files in the current working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'submission.csv')
)

In [106]:
# Preview the first rows of the training frame (it includes the target column transaction_real_price).
train.head()

Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,transaction_date,floor,transaction_real_price
0,0,7622,서울특별시,신교동,6-13,신현(101동),신교동 6-13 신현(101동),84.82,2002,200801,21~31,2,37500
1,1,5399,서울특별시,필운동,142,사직파크맨션,필운동 142 사직파크맨션,99.17,1973,200801,1~10,6,20000
2,2,3578,서울특별시,필운동,174-1,두레엘리시안,필운동 174-1 두레엘리시안,84.74,2007,200801,1~10,6,38500
3,3,10957,서울특별시,내수동,95,파크팰리스,내수동 95 파크팰리스,146.39,2003,200801,11~20,15,118000
4,4,10639,서울특별시,내수동,110-15,킹스매너,내수동 110-15 킹스매너,194.43,2004,200801,21~31,3,120000


In [107]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,transaction_date,floor
0,1145756,10453,서울특별시,목동,938,청학,목동 938 청학,35.55,2002,201711,11~20,2
1,1198704,989,부산광역시,초량동,1143-8,고관맨션,초량동 1143-8 고관맨션,68.72,1977,201708,21~31,2
2,1222384,8597,부산광역시,괴정동,447-13,우림그린,괴정동 447-13 우림그린,72.54,1989,201710,11~20,2
3,1179897,11086,서울특별시,대치동,1007-2,풍림아이원4차(1007-2),대치동 1007-2 풍림아이원4차(1007-2),111.54,2004,201707,1~10,10
4,1223091,2121,부산광역시,다대동,1670,다대롯데캐슬블루,다대동 1670 다대롯데캐슬블루,119.6398,2014,201712,11~20,21


In [108]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1216553 entries, 0 to 1216552
Data columns (total 13 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   transaction_id          1216553 non-null  int64  
 1   apartment_id            1216553 non-null  int64  
 2   city                    1216553 non-null  object 
 3   dong                    1216553 non-null  object 
 4   jibun                   1216553 non-null  object 
 5   apt                     1216553 non-null  object 
 6   addr_kr                 1216553 non-null  object 
 7   exclusive_use_area      1216553 non-null  float64
 8   year_of_completion      1216553 non-null  int64  
 9   transaction_year_month  1216553 non-null  int64  
 10  transaction_date        1216553 non-null  object 
 11  floor                   1216553 non-null  int64  
 12  transaction_real_price  1216553 non-null  int64  
dtypes: float64(1), int64(6), object(6)
memory usage: 120.7+ M

In [109]:
# Column dtypes, non-null counts and memory footprint of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5463 entries, 0 to 5462
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   transaction_id          5463 non-null   int64  
 1   apartment_id            5463 non-null   int64  
 2   city                    5463 non-null   object 
 3   dong                    5463 non-null   object 
 4   jibun                   5463 non-null   object 
 5   apt                     5463 non-null   object 
 6   addr_kr                 5463 non-null   object 
 7   exclusive_use_area      5463 non-null   float64
 8   year_of_completion      5463 non-null   int64  
 9   transaction_year_month  5463 non-null   int64  
 10  transaction_date        5463 non-null   object 
 11  floor                   5463 non-null   int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 512.3+ KB


***
# Making Features

In [112]:
# Shift every floor number up by 4 in both frames.
# NOTE(review): presumably this keeps basement (negative) floors valid for the
# log1p transform applied to 'floor' further down — confirm the minimum floor.
# This cell is not idempotent: running it twice shifts the floors by 8.
train['floor'] = train['floor'] + 4
test['floor'] = test['floor'] + 4

In [113]:
# Concatenate city and dong into a single grouping key for the
# neighbourhood-level aggregates computed below.
train['city_dong'] = train['city'] + train['dong']
test['city_dong'] = test['city'] + test['dong']

In [114]:
# Number of distinct apartment names per city_dong, computed on the training data only.
tr_n_apt = (
    train.groupby('city_dong')['apt']
    .nunique()
    .reset_index(name='n_apt')
)

In [115]:
# Attach n_apt to both frames. The left join keeps every row; a city_dong
# that is absent from the training aggregate gets NaN here.
train = train.merge(tr_n_apt, on='city_dong', how='left')
test = test.merge(tr_n_apt, on='city_dong', how='left')

In [116]:
# Skewness of transaction_year_month within each city_dong (training data only).
# Despite the 'cnt' in its name, this is the skew of the YYYYMM values, not of a count.
tr_skew_cnt = (
    train.groupby('city_dong')['transaction_year_month']
    .skew()
    .reset_index(name='skew_cnt')
)

In [117]:
# Attach skew_cnt to both frames via a left join on city_dong.
train = train.merge(tr_skew_cnt, on='city_dong', how='left')
test = test.merge(tr_skew_cnt, on='city_dong', how='left')

In [118]:
# Skewness of year_of_completion within each city_dong (training data only).
tr_skew_year = (
    train.groupby('city_dong')['year_of_completion']
    .skew()
    .reset_index(name='skew_year')
)

In [119]:
# Attach skew_year to both frames via a left join on city_dong.
train = train.merge(tr_skew_year, on='city_dong', how='left')
test = test.merge(tr_skew_year, on='city_dong', how='left')

In [120]:
# Skewness of the sale price within each address (addr_kr); it is merged in
# below as 'skew_price' and used as a model feature.
# NOTE(review): this feature is derived from the target using ALL training rows,
# so during K-fold validation each held-out row's own price contributes to its
# feature value. The reported validation RMSE is therefore optimistic; computing
# the statistic out-of-fold would remove the leak.
tr_price_skew = train.groupby('addr_kr').transaction_real_price.skew().reset_index(name = 'skew_price')

In [121]:
# Attach skew_price by address, then replace every remaining NaN with 0.
# fillna(0) is applied to the whole merged frame, so it also zero-fills any NaN
# left in n_apt / skew_cnt / skew_year by the earlier left joins.
train = train.merge(tr_price_skew, on='addr_kr', how='left').fillna(0)
test = test.merge(tr_price_skew, on='addr_kr', how='left').fillna(0)

In [122]:
# Train on log1p(price); the CV loop maps predictions back with expm1.
y = np.log1p(train['transaction_real_price'])

In [123]:
# 20-fold shuffled split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=20, shuffle=True, random_state=524)

In [124]:
def get_date(x):
    """Map a day-range label to a part-of-month category.

    Parameters
    ----------
    x : the raw ``transaction_date`` value, e.g. ``'1~10'`` or ``'11~20'``.

    Returns
    -------
    str
        ``'월초'`` for ``'1~10'``, ``'중순'`` for ``'11~20'`` and ``'월말'``
        for every other value.
    """
    if x == '1~10':
        return '월초'
    if x == '11~20':
        return '중순'
    # Anything else is treated as the end of the month.
    return '월말'

In [125]:
# Replace the raw day-range labels with the three part-of-month categories.
# Not idempotent: once converted, a second run maps every row to '월말'.
for frame in (train, test):
    frame['transaction_date'] = frame['transaction_date'].apply(get_date)

# Modeling

In [126]:
# Columns fed to the model (the same list is used for the test frame further down).
feature_cols = [
    'skew_price',
    'skew_year',
    'skew_cnt',
    'n_apt',
    'city',
    'exclusive_use_area',
    'year_of_completion',
    'transaction_year_month',
    'transaction_date',
    'floor',
]
X = train[feature_cols]

In [127]:
# One-hot encode the two categorical features of the training matrix.
X = pd.get_dummies(X, columns=['city', 'transaction_date'])

In [128]:
# Skewness of each raw feature column, as a baseline before any log transform.
X.skew()

skew_price                0.964702
skew_year                 0.303468
skew_cnt                 -0.701124
n_apt                     2.646840
exclusive_use_area        1.227509
year_of_completion       -0.470423
transaction_year_month   -0.267548
floor                     1.324710
city_부산광역시                0.451716
city_서울특별시               -0.451716
transaction_date_월말       0.664626
transaction_date_월초       0.761978
transaction_date_중순       0.695888
dtype: float64

In [129]:
# Skewness after log1p, for comparison with the raw values above. In the recorded
# output the magnitude shrinks for floor, n_apt and exclusive_use_area (the columns
# transformed later), grows for the skew_* columns, and is NaN for the dummy columns.
np.log1p(X).skew()

skew_price               -2.416750
skew_year                -1.211180
skew_cnt                 -1.827815
n_apt                    -0.410704
exclusive_use_area       -0.438156
year_of_completion       -0.481260
transaction_year_month   -0.268997
floor                     0.089636
city_부산광역시                     NaN
city_서울특별시                     NaN
transaction_date_월말            NaN
transaction_date_월초            NaN
transaction_date_중순            NaN
dtype: float64

In [130]:
# Select the same feature columns from the test frame as were used for X.
test_feature_cols = [
    'skew_price',
    'skew_year',
    'skew_cnt',
    'n_apt',
    'city',
    'exclusive_use_area',
    'year_of_completion',
    'transaction_year_month',
    'transaction_date',
    'floor',
]
target = test[test_feature_cols]

In [131]:
# One-hot encode the test features, then force the result onto exactly the
# columns (and column order) of the training matrix X. The test frame is encoded
# independently of train, so a category missing from test would otherwise drop a
# dummy column and one unseen in train would add one; either case misaligns the
# matrix passed to predict(). Missing dummies are filled with 0, extras are dropped.
target = pd.get_dummies(columns = ['city', 'transaction_date'], data = target)
target = target.reindex(columns = X.columns, fill_value = 0)

In [132]:
# Log-transform the three right-skewed features in both matrices.
# Not idempotent: re-running this cell applies log1p a second time.
log_cols = ['floor', 'n_apt', 'exclusive_use_area']
X[log_cols] = np.log1p(X[log_cols])
target[log_cols] = np.log1p(target[log_cols])

## LGBMRegressor

In [134]:
# LightGBM regressor trained on the log1p target with an RMSE objective.
# In the recorded training log early stopping never triggers within the
# 10000-round budget, so every displayed fold uses all 10000 trees.
lgbm = LGBMRegressor(
    objective='rmse',
    learning_rate=0.08,
    n_estimators=10000,
    max_depth=5,
    random_state=524,
)

In [135]:
# K-fold cross-validation of the LightGBM model.
# For each fold: fit on the training part, score the held-out part on the
# original price scale, and add this fold's share of the test prediction to
# lgbm_pred so that lgbm_pred ends up as the mean over all folds.
n_folds = kf.get_n_splits()  # take the fold count from the splitter instead of hard-coding 20
rmse_list = []
lgbm_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y), start = 1) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments are only
    # accepted by LightGBM < 4.0; newer releases expect
    # callbacks=[early_stopping(...), log_evaluation(...)] — confirm the installed version.
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 5000, early_stopping_rounds = 1000)

    # Predictions are in log1p space; clamp negatives to 0 so expm1 never yields a negative price.
    pred = np.maximum(lgbm.predict(val_x), 0)
    sub_pred = np.expm1(np.maximum(lgbm.predict(target), 0)) / n_folds

    # RMSE on the original price scale (both sides mapped back with expm1).
    rmse = np.sqrt(mean_squared_error(np.expm1(val_y), np.expm1(pred)))
    print(f'{i}FOLD Training....val_RMSE : {rmse}\n')
    rmse_list.append(rmse)
    lgbm_pred += sub_pred
print(f'\n{lgbm.__class__.__name__}의 {n_folds}FOLD 평균 RMSE는 {np.mean(rmse_list)}')

Training until validation scores don't improve for 1000 rounds
[5000]	training's rmse: 0.0885286	valid_1's rmse: 0.0908975
[10000]	training's rmse: 0.0775639	valid_1's rmse: 0.0818374
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.0775639	valid_1's rmse: 0.0818374
1FOLD Training....val_RMSE : 4057.5416454212536

Training until validation scores don't improve for 1000 rounds
[5000]	training's rmse: 0.0884213	valid_1's rmse: 0.0905584
[10000]	training's rmse: 0.0776747	valid_1's rmse: 0.0819007
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.0776747	valid_1's rmse: 0.0819007
2FOLD Training....val_RMSE : 4313.261626656358

Training until validation scores don't improve for 1000 rounds
[5000]	training's rmse: 0.0884487	valid_1's rmse: 0.0927962
[10000]	training's rmse: 0.0774757	valid_1's rmse: 0.083558
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.0774757	valid_1's rmse: 0.083558
3FOLD Training....val_RMSE

[5000]	training's rmse: 0.0883511	valid_1's rmse: 0.0930867
[10000]	training's rmse: 0.0775891	valid_1's rmse: 0.0843981
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.0775891	valid_1's rmse: 0.0843981
18FOLD Training....val_RMSE : 3973.9439074506195

Training until validation scores don't improve for 1000 rounds
[5000]	training's rmse: 0.0882221	valid_1's rmse: 0.0939217
[10000]	training's rmse: 0.0773363	valid_1's rmse: 0.0853932
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.0773363	valid_1's rmse: 0.0853932
19FOLD Training....val_RMSE : 4234.7529217290075

Training until validation scores don't improve for 1000 rounds
[5000]	training's rmse: 0.0884314	valid_1's rmse: 0.0905443
[10000]	training's rmse: 0.0776499	valid_1's rmse: 0.0815147
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 0.0776499	valid_1's rmse: 0.0815147
20FOLD Training....val_RMSE : 3794.215729715343


LGBMRegressor의 20FOLD 평균 RMSE는 408

# Submission

In [136]:
# Submit the fold-averaged LightGBM prediction on its own. A weighted blend
# (0.5 * lgbm_pred + 0.25 * xgb_pred + 0.25 * cb_pred) was left disabled here;
# xgb_pred and cb_pred are not produced anywhere in the visible cells.
submission['transaction_real_price'] = lgbm_pred

In [137]:
# Write the submission file without the DataFrame index column.
submission.to_csv('0523.csv', index = False)

In [138]:
# Display the submission frame as a final visual check of the predicted prices.
submission

Unnamed: 0,transaction_id,transaction_real_price
0,1145756,25986.319179
1,1198704,12642.689013
2,1222384,12485.969265
3,1179897,108915.909523
4,1223091,46261.284146
...,...,...
5458,1174640,63169.975196
5459,1175575,194443.318010
5460,1157024,63835.813024
5461,1136863,32391.060030


lgbm = 3741.9269949247