# Import

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Data Load

In [20]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Data Pre-processing

In [21]:
# A single LabelEncoder instance that is re-fitted for each categorical column
# in the cells below, so it only remembers the most recently fitted column.
le = LabelEncoder()

### propertyType LabelEncoding

In [22]:
# Fit the integer codes on the training values of propertyType.
train['propertyType'] = le.fit_transform(train['propertyType'])

# Values that occur only in the test table are appended after the fitted
# classes so that transform() does not reject them as unknown labels.
unseen_types = [value for value in np.unique(test['propertyType'])
                if value not in le.classes_]
if unseen_types:
    le.classes_ = np.append(le.classes_, unseen_types)
test['propertyType'] = le.transform(test['propertyType'])

### suburbName LabelEncoding

In [23]:
def convert_SN(x):
    """Return the canonical spelling of a suburb name.

    'West Delhi' is rewritten to 'Delhi West' and 'North Delhi' to
    'Delhi North'; any other value is returned unchanged.
    """
    canonical_names = {
        'West Delhi': 'Delhi West',
        'North Delhi': 'Delhi North',
    }
    return canonical_names.get(x, x)

In [24]:
# Normalise the suburb spellings in both tables with the same rule.
for frame in (train, test):
    frame['suburbName'] = frame['suburbName'].apply(convert_SN)

In [25]:
# Re-fit the shared encoder on the training values of suburbName.
train['suburbName'] = le.fit_transform(train['suburbName'])

# Values that occur only in the test table are appended after the fitted
# classes so that transform() does not reject them as unknown labels.
unseen_suburbs = [value for value in np.unique(test['suburbName'])
                  if value not in le.classes_]
if unseen_suburbs:
    le.classes_ = np.append(le.classes_, unseen_suburbs)
test['suburbName'] = le.transform(test['suburbName'])

### dist_ranking

- distance 중 최소 거리의 index : Metro는 제외
- 결과는 0, 1, 2 중 하나의 값

In [26]:
# Boolean mask over the column labels, then keep the labels that mention
# 'distance'.
has_distance = train.columns.str.contains('distance')
dist_cols = train.columns[has_distance]

In [27]:
# Display the selected distance columns for inspection.
list(dist_cols)

['distanceMetro(km)',
 'distanceAirport(km)',
 'distanceHospital(km)',
 'distanceRailway(km)']

In [28]:
# Position of the smallest distance in each row. dist_cols[1:] skips the first
# distance column (the Metro distance), so the value indexes the remaining
# columns.
train['dist_ranking'] = train[dist_cols[1:]].values.argmin(axis = 1)

In [29]:
# Position of the smallest distance in each row. dist_cols[1:] skips the first
# distance column (the Metro distance), so the value indexes the remaining
# columns.
test['dist_ranking'] = test[dist_cols[1:]].values.argmin(axis = 1)

### dist_skew
- distance 값들의 왜도

In [30]:
# Row-wise skewness of the distance values.
train['dist_skew'] = train.loc[:, dist_cols].skew(axis = 'columns')

In [31]:
# Row-wise skewness of the distance values.
test['dist_skew'] = test.loc[:, dist_cols].skew(axis = 'columns')

### dist_cls
- 각 시설까지의 거리 값들로 k-means clustering

In [32]:
# K-means over the distance columns. Seeded so that the cluster ids stored in
# the dist_cls feature are reproducible between runs; 42 matches the seed used
# by the other estimators in this notebook.
km = KMeans(n_clusters = 8, max_iter = 500, n_init = 15, random_state = 42)

In [33]:
# Learn the clusters from the training distances, then label both tables
# with the same fitted centroids.
km.fit(train[dist_cols])
train['dist_cls'] = km.labels_
test['dist_cls'] = km.predict(test[dist_cols])

### infra_cls
- 부동산과 관련된 지표로 k-means clustering

In [34]:
# Property descriptors used for the second clustering. propertyType holds the
# label-encoded integers at this point, not the original strings.
infra_cols = ['propertyType', 'bedrooms', 'area(square_meters)']

In [35]:
# Rebind `km` to a fresh, seeded 6-cluster K-means for the property features.
km = KMeans(n_clusters = 6, random_state = 42)

In [36]:
# Learn the clusters from the training rows, then label both tables with the
# same fitted centroids.
km.fit(train[infra_cols])
train['infra_cls'] = km.labels_
test['infra_cls'] = km.predict(test[infra_cols])

***
# Modeling

- 10-fold로 앙상블
- 타겟 값을 로그 변환하여 예측 후 지수 변환
- CatBoost LGBM 6:4로 앙상블

In [37]:
# Shuffled 10-fold splitter used by both model loops below; the fixed seed
# makes every call to kf.split() produce the same folds.
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [38]:
# Features: every column except the row identifier and the target.
X = train.drop(columns = ['ID', 'monthlyRent(us_dollar)'])
# Target: log(1 + rent); predictions are mapped back with expm1 further down.
y = np.log1p(train['monthlyRent(us_dollar)'])

In [39]:
# Test features restricted to, and ordered like, the training feature columns.
target = test[X.columns]

### CatBoost

In [41]:
# Cross-validated CatBoost: score each fold and average the test predictions.
# cb_pred accumulates the mean of the per-fold test predictions; they stay in
# log1p space here and are only passed through expm1 when the submission is
# built.
cb_pred = np.zeros((target.shape[0]))
cb_mae = 0
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :

    # KFold.split yields positional row numbers, so select with .iloc.
    # .loc would treat them as index labels, which is only equivalent while
    # the frame still has its default 0..n-1 index.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    cb = CatBoostRegressor(random_state = 42, max_depth = 6, learning_rate = 0.03, iterations = 10000, eval_metric = 'MAE')
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1000, verbose = 0)

    # Fold score on the original rent scale; the int cast mirrors the cast
    # applied to the submission values.
    val_pred = np.expm1(cb.predict(val_x)).astype(int)
    val_mae = mean_absolute_error(np.expm1(val_y), val_pred)
    cb_mae += val_mae / kf.n_splits
    print(f"{i + 1} Fold MAE : {val_mae}")

    # Each fold contributes 1 / n_splits of the final test prediction.
    fold_pred = cb.predict(target) / kf.n_splits
    cb_pred += fold_pred

print(f"\n{cb.__class__.__name__} AVG of MAE : {cb_mae}")

1 Fold MAE : 60.622988505747124
2 Fold MAE : 53.420689655172424
3 Fold MAE : 54.448791714614494
4 Fold MAE : 47.51783659378596
5 Fold MAE : 53.85155350978136
6 Fold MAE : 55.918296892980436
7 Fold MAE : 56.780207134637514
8 Fold MAE : 54.570771001150746
9 Fold MAE : 56.364787111622555
10 Fold MAE : 56.15074798619102

CatBoostRegressor AVG of MAE : 54.96466701056837


### LGBM

In [43]:
# Cross-validated LightGBM: score each fold and average the test predictions.
# lgbm_pred accumulates the mean of the per-fold test predictions; they stay
# in log1p space here and are only passed through expm1 when the submission
# is built.
lgbm_pred = np.zeros((target.shape[0]))
lgbm_mae = 0
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :

    # KFold.split yields positional row numbers, so select with .iloc.
    # .loc would treat them as index labels, which is only equivalent while
    # the frame still has its default 0..n-1 index.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    lgbm = LGBMRegressor(random_state = 42, max_depth = 7, learning_rate = 0.03, n_estimators = 10000, objective = 'l1')

    # NOTE(review): early_stopping_rounds / verbose as fit() keywords appear
    # to have been removed in LightGBM 4.0 in favour of callbacks - confirm
    # against the installed version before upgrading.
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1000, verbose = 0, eval_metric = 'l1')

    # Fold score on the original rent scale; the int cast mirrors the cast
    # applied to the submission values.
    val_pred = np.expm1(lgbm.predict(val_x)).astype(int)
    val_mae = mean_absolute_error(np.expm1(val_y), val_pred)
    lgbm_mae += val_mae / kf.n_splits
    print(f"{i + 1} Fold MAE : {val_mae}")

    # Each fold contributes 1 / n_splits of the final test prediction.
    fold_pred = lgbm.predict(target) / kf.n_splits
    lgbm_pred += fold_pred
print(f"\n{lgbm.__class__.__name__} AVG of MAE : {lgbm_mae}")

1 Fold MAE : 60.65172413793104
2 Fold MAE : 54.272413793103446
3 Fold MAE : 55.555811277330264
4 Fold MAE : 48.61910241657077
5 Fold MAE : 54.013808975834294
6 Fold MAE : 57.25431530494822
7 Fold MAE : 57.379746835443036
8 Fold MAE : 56.74798619102417
9 Fold MAE : 56.72382048331416
10 Fold MAE : 56.06559263521288

LGBMRegressor AVG of MAE : 55.72843220507123


***
# Submission

In [44]:
# Load the submission template; its target column is overwritten below.
submission = pd.read_csv('sample_submission.csv')

In [45]:
# Blend the two models' log-space predictions 60 % CatBoost / 40 % LightGBM,
# then undo the log1p transform to get rents on the original scale.
submission['monthlyRent(us_dollar)'] = np.expm1(0.6 * cb_pred + 0.4 * lgbm_pred)

In [46]:
# Convert the blended predictions to integers (the cast drops the fractional
# part rather than rounding).
submission['monthlyRent(us_dollar)'] = submission['monthlyRent(us_dollar)'].astype(int)

In [47]:
# Write the final submission file without the DataFrame index column.
submission.to_csv('catboost_lgbm.csv', index = False)