In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test tables from the working directory.
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Encoder instance used below to turn the 'propertyType' column into integer codes.
le = LabelEncoder()

In [5]:
# Learn the category -> integer mapping from the training column only,
# then apply that same mapping to both frames.
le.fit(train['propertyType'])
train['propertyType'] = le.transform(train['propertyType'])
test['propertyType'] = le.transform(test['propertyType'])

In [6]:
from sklearn.cluster import KMeans

In [7]:
# KMeans model with six clusters. random_state pins the centroid initialisation so that
# the cluster ids produced by this model are identical on every Restart & Run All;
# without it the ids (and anything trained on them) change from run to run.
km = KMeans(n_clusters = 6, max_iter = 500, n_init = 15, random_state = 42)

In [8]:
# Coordinate columns that are passed to the KMeans model.
gps = ['latitude', 'longitude']

In [9]:
# Fit KMeans on the training coordinates and store each row's cluster id as a new column.
train['gps_cls'] = km.fit_predict(train[gps])

In [10]:
# Assign test rows to the clusters already fitted on the training coordinates (no refit).
test['gps_cls'] = km.predict(test[gps])

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [12]:
from catboost import CatBoostRegressor

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [14]:
# 8-fold splitter; rows are shuffled once with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=8, shuffle=True, random_state=42)

In [15]:
# Target: monthly rent on a log1p scale (inverted with expm1 when predictions are reported).
y = np.log1p(train['monthlyRent(us_dollar)'])
# Features: every remaining column except the identifier, the suburb name and the target itself.
X = train.drop(columns=['ID', 'suburbName', 'monthlyRent(us_dollar)'])

In [16]:
# Test-set feature matrix restricted to, and ordered like, the training feature columns.
target = test[X.columns]

In [19]:
# K-fold cross-validation of a CatBoost regressor trained on the log1p-transformed rent.
# For every fold:
#   * the validation MAE is measured on the original scale (predictions and targets go
#     through expm1) and its 1/n_splits share is added to cb_mae;
#   * the fold's prediction for `target` (still on the log scale) is divided by n_splits
#     and added to cb_pred, so cb_pred ends up as the mean log-scale prediction.
cb_pred = np.zeros(target.shape[0])
cb_mae = 0
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    # kf.split yields positional row numbers, so rows must be taken with .iloc.
    # Label-based .loc only selects the right rows while the index is exactly 0..n-1
    # and silently picks wrong rows (or raises) once rows have been filtered or reordered.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    cb = CatBoostRegressor(
        random_state = 42,
        max_depth = 6,
        learning_rate = 0.03,
        iterations = 10000,
        eval_metric = 'MAE',
    )

    # NOTE(review): both the training and the validation split are passed as eval sets;
    # confirm against the CatBoost docs which of them drives early stopping.
    cb.fit(
        tr_x, tr_y,
        eval_set = [(tr_x, tr_y), (val_x, val_y)],
        early_stopping_rounds = 1000,
        verbose = 0,
    )

    # Back to the original scale; astype(int) truncates the fractional part.
    val_pred = np.expm1(cb.predict(val_x)).astype(int)
    val_mae = mean_absolute_error(np.expm1(val_y), val_pred)
    cb_mae += val_mae / kf.n_splits
    print(f"{fold + 1} Fold MAE : {val_mae}")

    # Accumulate this fold's share of the averaged (log-scale) test prediction.
    fold_pred = cb.predict(target) / kf.n_splits
    cb_pred += fold_pred
print(f"\n{cb.__class__.__name__} AVG of MAE : {cb_mae}")

1 Fold MAE : 58.74241030358786
2 Fold MAE : 53.873045078196874
3 Fold MAE : 50.29990800367985
4 Fold MAE : 53.48942042318306
5 Fold MAE : 55.82688766114182
6 Fold MAE : 56.5902394106814
7 Fold MAE : 56.52209944751381
8 Fold MAE : 56.61786372007366

CatBoostRegressor AVG of MAE : 55.2452342560073


In [20]:
# Load the submission template.
# NOTE(review): presumably one row per test ID in the same order as test.csv — verify,
# because predictions are written into it positionally.
submission = pd.read_csv('sample_submission.csv')

In [23]:
# cb_pred holds the fold-averaged prediction on the log1p scale; expm1 converts it back
# to the original scale and astype(int) truncates the fractional part.
submission['monthlyRent(us_dollar)'] = np.expm1(cb_pred).astype(int)

In [24]:
# Display the filled-in submission frame for a visual check before saving.
submission

Unnamed: 0,ID,monthlyRent(us_dollar)
0,Test_0000,176
1,Test_0001,248
2,Test_0002,324
3,Test_0003,173
4,Test_0004,79
...,...,...
8688,Test_8688,361
8689,Test_8689,357
8690,Test_8690,213
8691,Test_8691,154


In [25]:
# Write the submission file; index = False keeps the row index out of the CSV.
submission.to_csv('catboost_2.csv', index = False)