In [1]:
import pandas as pd
import numpy as np

import warnings
# Suppress every warning raised after this point.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn / the
# boosting libraries — consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Encoder instance used by the categorical-encoding cells that follow.
le = LabelEncoder()

In [5]:
# Learn the propertyType classes from the training frame only, then map both frames
# through the same fitted encoder so they share one set of integer codes.
le.fit(train['propertyType'])
for frame in (train, test) :
    frame['propertyType'] = le.transform(frame['propertyType'])

In [6]:
def convert_SN(x) :
    """Rewrite two suburb spellings to the 'Delhi <direction>' form.

    'West Delhi' becomes 'Delhi West' and 'North Delhi' becomes 'Delhi North';
    any other value is returned unchanged.
    """
    renames = (('West Delhi', 'Delhi West'), ('North Delhi', 'Delhi North'))
    for old_name, new_name in renames :
        if x == old_name :
            return new_name
    return x

In [7]:
# Run convert_SN over the suburbName column of both frames.
for frame in (train, test) :
    frame['suburbName'] = frame['suburbName'].apply(convert_SN)

In [8]:
# Use a dedicated encoder for suburbName instead of re-fitting the shared `le`:
# re-fitting would overwrite the propertyType classes learned earlier, so that column
# could no longer be decoded with `le.inverse_transform`.
# The encoder is fitted on the training frame only; `transform` raises ValueError if
# the test frame contains a suburb that never appears in training.
suburb_le = LabelEncoder()
train['suburbName'] = suburb_le.fit_transform(train['suburbName'])
test['suburbName'] = suburb_le.transform(test['suburbName'])

In [9]:
# Boolean mask over the column labels: True where the label contains 'distance'.
is_distance_col = train.columns.str.contains('distance')
dist_cols = train.columns[is_distance_col]

In [10]:
# Row-wise skewness across the distance columns, stored as a new feature on both frames.
for frame in (train, test) :
    frame['dist_skew'] = frame[dist_cols].skew(axis = 1)

In [11]:
from sklearn.cluster import KMeans

In [12]:
# Seed the clustering: KMeans initialisation is random, so without random_state the
# cluster labels — and every model trained on them — differ from run to run.
# 42 matches the seed used by the other estimators in this notebook.
km = KMeans(n_clusters = 8, max_iter = 500, n_init = 15, random_state = 42)

In [13]:
# Fit the clustering on the training distance columns only, then label both frames
# with the fitted model.
train_dist = train[dist_cols]
test_dist = test[dist_cols]
train['dist_cls'] = km.fit_predict(train_dist)
test['dist_cls'] = km.predict(test_dist)

In [14]:
# Rebinds `km` to a new, seeded 6-cluster model; the 8-cluster model created earlier
# is no longer reachable through this name once this cell has run.
km = KMeans(n_clusters = 6, random_state = 42)

In [15]:
# Columns fed to the second clustering model.
infra_cols = [
    'propertyType',
    'bedrooms',
    'area(square_meters)',
]

In [16]:
# Fit the second clustering on the training frame only, then label both frames.
# NOTE(review): the columns are clustered as-is, with no scaling step visible here;
# presumably the area column dominates the distances — confirm that this is intended.
train_infra = train[infra_cols]
test_infra = test[infra_cols]
train['infra_cls'] = km.fit_predict(train_infra)
test['infra_cls'] = km.predict(test_infra)

In [17]:
from catboost import CatBoostRegressor

In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [19]:
# 10-fold splitter, shuffled with a fixed integer seed; it is passed to every model's
# training loop below.
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

In [20]:
# Feature matrix: every training column except the row identifier and the rent target.
# The target is modelled on the log1p scale; predictions are mapped back with expm1 later.
rent_col = 'monthlyRent(us_dollar)'
X = train.drop(columns = ['ID', rent_col])
y = np.log1p(train[rent_col])

In [23]:
# Despite its name, `target` holds the test-set FEATURES: the test frame restricted to
# the training feature columns, in the same column order as X.
target = test[X.columns]

In [24]:
# K-fold CatBoost training. Each fold's model is scored on its held-out rows and adds
# 1/n_splits of its test prediction to `cb_pred`, which therefore ends up as the
# fold-averaged prediction. `cb_pred` stays on the log1p scale; expm1 is applied only
# when the models are blended.
cb_pred = np.zeros((target.shape[0]))
cb_mae = 0
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :

    # KFold.split yields positional indices, so rows must be taken with .iloc;
    # label-based .loc only selects the same rows while the index is exactly 0..n-1.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    cb = CatBoostRegressor(random_state = 42, max_depth = 6, learning_rate = 0.03, iterations = 10000, eval_metric = 'MAE')

    # NOTE(review): two eval sets are passed (train first, validation second) — confirm
    # which one CatBoost uses for early stopping / best-iteration selection.
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1000, verbose = 0)

    # Score in dollars: undo the log1p transform, then truncate to int exactly as the
    # final submission does.
    val_pred = np.expm1(cb.predict(val_x)).astype(int)
    val_mae = mean_absolute_error(np.expm1(val_y), val_pred)
    cb_mae += val_mae / kf.n_splits
    print(f"{i + 1} Fold MAE : {val_mae}")

    fold_pred = cb.predict(target) / kf.n_splits
    cb_pred += fold_pred
print(f"\n{cb.__class__.__name__} AVG of MAE : {cb_mae}")

1 Fold MAE : 61.08965517241379
2 Fold MAE : 53.11609195402299
3 Fold MAE : 54.24050632911393
4 Fold MAE : 47.31645569620253
5 Fold MAE : 54.18757192174914
6 Fold MAE : 56.026467203682394
7 Fold MAE : 56.77905638665133
8 Fold MAE : 55.20253164556962
9 Fold MAE : 56.0897583429229
10 Fold MAE : 56.478711162255465

CatBoostRegressor AVG of MAE : 55.052680581458404


In [26]:
from lightgbm import LGBMRegressor

In [27]:
# K-fold LightGBM training. Each fold's model is scored on its held-out rows and adds
# 1/n_splits of its test prediction to `lgbm_pred`, which therefore ends up as the
# fold-averaged prediction. `lgbm_pred` stays on the log1p scale; expm1 is applied only
# when the models are blended.
lgbm_pred = np.zeros((target.shape[0]))
lgbm_mae = 0
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :

    # KFold.split yields positional indices, so rows must be taken with .iloc;
    # label-based .loc only selects the same rows while the index is exactly 0..n-1.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    lgbm = LGBMRegressor(random_state = 42, max_depth = 5, learning_rate = 0.03, n_estimators = 10000, objective = 'l1')

    # NOTE(review): newer LightGBM releases (4.x) no longer accept
    # early_stopping_rounds / verbose in fit() and expect callbacks instead —
    # confirm the installed version before re-running.
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1000, verbose = 0, eval_metric = 'l1')

    # Score in dollars: undo the log1p transform, then truncate to int exactly as the
    # final submission does.
    val_pred = np.expm1(lgbm.predict(val_x)).astype(int)
    val_mae = mean_absolute_error(np.expm1(val_y), val_pred)
    lgbm_mae += val_mae / kf.n_splits
    print(f"{i + 1} Fold MAE : {val_mae}")

    fold_pred = lgbm.predict(target) / kf.n_splits
    lgbm_pred += fold_pred
print(f"\n{lgbm.__class__.__name__} AVG of MAE : {lgbm_mae}")

1 Fold MAE : 61.00689655172415
2 Fold MAE : 54.63333333333333
3 Fold MAE : 55.22554660529344
4 Fold MAE : 49.67088607594937
5 Fold MAE : 53.92865362485615
6 Fold MAE : 57.647871116225566
7 Fold MAE : 57.177215189873415
8 Fold MAE : 56.936708860759495
9 Fold MAE : 57.05293440736479
10 Fold MAE : 57.01611047180668

LGBMRegressor AVG of MAE : 56.029615623718634


In [28]:
from xgboost import XGBRegressor

In [29]:
# K-fold XGBoost training. Each fold's model is scored on its held-out rows and adds
# 1/n_splits of its test prediction to `xgb_pred`, which therefore ends up as the
# fold-averaged prediction. `xgb_pred` stays on the log1p scale; expm1 is applied only
# when the models are blended.
xgb_pred = np.zeros((target.shape[0]))
xgb_mae = 0
for i, (tr_idx, val_idx) in enumerate(kf.split(X, y)) :

    # KFold.split yields positional indices, so rows must be taken with .iloc;
    # label-based .loc only selects the same rows while the index is exactly 0..n-1.
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    xgb = XGBRegressor(random_state = 42, max_depth = 7, learning_rate = 0.03, n_estimators = 10000)

    # NOTE(review): newer XGBoost releases expect early_stopping_rounds / eval_metric
    # on the estimator constructor rather than in fit() — confirm the installed
    # version before re-running.
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], early_stopping_rounds = 1000, verbose = 0, eval_metric = 'mae')

    # Score in dollars: undo the log1p transform, then truncate to int exactly as the
    # final submission does.
    val_pred = np.expm1(xgb.predict(val_x)).astype(int)
    val_mae = mean_absolute_error(np.expm1(val_y), val_pred)
    xgb_mae += val_mae / kf.n_splits
    print(f"{i + 1} Fold MAE : {val_mae}")

    fold_pred = xgb.predict(target) / kf.n_splits
    xgb_pred += fold_pred
print(f"\n{xgb.__class__.__name__} AVG of MAE : {xgb_mae}")

1 Fold MAE : 60.644827586206894
2 Fold MAE : 53.99885057471264
3 Fold MAE : 56.307249712312995
4 Fold MAE : 48.767548906789415
5 Fold MAE : 54.65592635212889
6 Fold MAE : 57.29459148446491
7 Fold MAE : 58.27617951668584
8 Fold MAE : 56.029919447640964
9 Fold MAE : 57.64556962025316
10 Fold MAE : 57.65247410817031

XGBRegressor AVG of MAE : 56.127313730936606


In [30]:
# Load the submission template; its rent column is overwritten in the next cell.
submission = pd.read_csv('sample_submission.csv')

In [31]:
# Weighted blend of the three fold-averaged predictions on the log1p scale
# (CatBoost 0.5, LightGBM 0.15, XGBoost 0.35), mapped back to dollars with expm1.
blend_log = cb_pred * 0.5 + lgbm_pred * 0.15 + xgb_pred * 0.35
submission['monthlyRent(us_dollar)'] = np.expm1(blend_log)

In [32]:
# Cast the blended rents to integers. astype(int) drops the fractional part rather
# than rounding; the per-fold validation scores above were computed with the same cast.
submission['monthlyRent(us_dollar)'] = submission['monthlyRent(us_dollar)'].astype(int)

In [33]:
# Write the final blend to 'ens1.csv' without the DataFrame index column.
submission.to_csv('ens1.csv', index = False)