In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb
from ggplot import *



## Load Data

In [3]:
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')
macro = pd.read_csv('macro_c.csv')

## Add Known Additional Features

In [4]:
# Add month-year
train['timestamp'] = pd.to_datetime(train['timestamp'])
month_year = (train.timestamp.dt.month + train.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)
test['timestamp'] = pd.to_datetime(test['timestamp'])
month_year = (test.timestamp.dt.month + test.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)
# Add week-year count
week_year = (train.timestamp.dt.weekofyear + train.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)
week_year = (test.timestamp.dt.weekofyear + test.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)
# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek
test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek
# Other feature engineering
train['rel_floor'] = train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)
test['rel_floor'] = test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)
train.apartment_name=train.sub_area.astype(str) + train['metro_km_avto'].astype(str)
test.apartment_name=test.sub_area.astype(str) + train['metro_km_avto'].astype(str)
train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

In [5]:
macro_cols = ['timestamp',"balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]
macro = macro[macro_cols]
train = train.merge(macro,how='left',on='timestamp')
test = test.merge(macro,how='left',on='timestamp')

In [6]:
def get_ratio_preschool(df):
    df['ratio_preschool'] = df['children_preschool'] / (df['children_school'] + 1)
    return df

train = get_ratio_preschool(train)
test = get_ratio_preschool(test)

In [7]:
def get_extra_area(df):
    df['extra_area'] = df['full_sq'] - df['life_sq']
    return df

train = get_extra_area(train)
test = get_extra_area(test)

In [8]:
def get_floor_ratio(df):
    df['floor_ratio'] = df['max_floor'] - df['floor']
    return df

train = get_floor_ratio(train)
test = get_floor_ratio(test)

In [9]:
def get_room_avg_size(df):
    df['room_avg_size'] = (df['life_sq'] - df['kitch_sq']) / (df['num_room'] + 1)
    return df

train = get_room_avg_size(train)
test = get_room_avg_size(test)

**Re-tune parameters**

In [10]:
col = list(test.columns)[2:]
label = np.log(train['price_doc']+1)
dtrain = xgb.DMatrix(train[col],label)

In [14]:
params = {
   'eta': 0.05, ## Try 0.01,3,5
   'max_depth': 4,## Try 4,5,6
   'subsample': 0.7,
   'colsample_bytree': 0.7,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
     verbose_eval=50, show_stdv=True,seed=42)

[0]	train-rmse:14.427+0.00330067	test-rmse:14.427+0.00742354
[50]	train-rmse:1.16517+0.00133374	test-rmse:1.16672+0.00635762
[100]	train-rmse:0.342834+0.00436135	test-rmse:0.350557+0.00720451
[150]	train-rmse:0.323627+0.00411237	test-rmse:0.335286+0.00962788
[200]	train-rmse:0.317533+0.0040073	test-rmse:0.332732+0.00961695
[250]	train-rmse:0.312219+0.00364157	test-rmse:0.331064+0.0097874
[300]	train-rmse:0.308103+0.00380056	test-rmse:0.330058+0.00945767
[350]	train-rmse:0.304139+0.0038777	test-rmse:0.329251+0.00941276
[400]	train-rmse:0.300696+0.00381726	test-rmse:0.328612+0.00929437
[450]	train-rmse:0.297461+0.00376035	test-rmse:0.328256+0.00900611
[500]	train-rmse:0.294611+0.00383063	test-rmse:0.328073+0.00890457


## Add PCA Features

In [15]:
from sklearn.decomposition import PCA

In [18]:
n_comp = 20
pca = PCA(n_components=n_comp,random_state=42)

In [19]:
train_feature = train[col].fillna(-999)
test_feature = test[col].fillna(-999)

In [21]:
pca_train =pca.fit_transform(train_feature)
pca_test = pca.transform(test_feature)

In [22]:
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca_train[:, i - 1]
    test['pca_' + str(i)] = pca_test[:, i - 1]

**Test Model Fitting **

In [25]:
col = list(test.columns)[2:]
dtrain = xgb.DMatrix(train[col],label)

In [26]:
params = {
   'eta': 0.05, ## Try 0.01,3,5
   'max_depth': 4,## Try 4,5,6
   'subsample': 0.7,
   'colsample_bytree': 0.7,
   'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
xgb_cvalid = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
     verbose_eval=50, show_stdv=True,seed=42)

[0]	train-rmse:14.427+0.00330067	test-rmse:14.427+0.00742354
[50]	train-rmse:1.16336+0.000908352	test-rmse:1.16589+0.00562555
[100]	train-rmse:0.33118+0.00374468	test-rmse:0.347472+0.00754897
[150]	train-rmse:0.308303+0.00325189	test-rmse:0.332439+0.00986852
[200]	train-rmse:0.299501+0.00316027	test-rmse:0.330196+0.00959261
[250]	train-rmse:0.292343+0.00296309	test-rmse:0.328978+0.00935728
[300]	train-rmse:0.285756+0.00307791	test-rmse:0.328434+0.00914762
[350]	train-rmse:0.280013+0.00291006	test-rmse:0.327984+0.00895104
[400]	train-rmse:0.274641+0.00291855	test-rmse:0.327808+0.00877471
[450]	train-rmse:0.269769+0.00319053	test-rmse:0.327806+0.00867916


**Not a good idea**

## Transfer Price into US Dollar

In [29]:
price_table = pd.DataFrame({'index':range(len(train)),'timestamp':train['timestamp'],'price':train['price_doc'],'full_sq':train['full_sq']})

In [30]:
price_table

Unnamed: 0,full_sq,index,price,timestamp
0,43.0,0,5850000,2011-08-20
1,34.0,1,6000000,2011-08-23
2,43.0,2,5700000,2011-08-27
3,89.0,3,13100000,2011-09-01
4,77.0,4,16331452,2011-09-05
5,67.0,5,9100000,2011-09-06
6,25.0,6,5500000,2011-09-08
7,44.0,7,2000000,2011-09-09
8,42.0,8,5300000,2011-09-10
9,36.0,9,4650000,2011-09-16


In [31]:
test.tail()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,pca_11,pca_12,pca_13,pca_14,pca_15,pca_16,pca_17,pca_18,pca_19,pca_20
7657,38131,2016-05-26,52.2,31.8,10,12.0,5,1973.0,2.0,9.1,...,41284.820176,-66724.427493,-26791.515132,-17955.300631,-19848.628165,-9909.873821,5998.993651,1092.310003,1604.266167,2090.00579
7658,38132,2016-05-28,54.09,,14,,1,,2.0,,...,1604.424345,22445.312472,-4018.914918,3992.30024,-871.195293,-1078.643581,77.594483,147.12525,2497.157396,1708.728813
7659,38133,2016-05-30,41.08,,12,,1,,1.0,,...,83652.98311,-4164.963753,3961.321361,47820.101613,14163.213441,963.984293,-963.751634,708.743025,-266.660717,-263.706744
7660,38134,2016-05-30,34.8,19.8,8,9.0,5,1977.0,1.0,6.4,...,47317.444876,-16370.135393,-44070.456639,-20524.932262,-6706.518596,-5087.763339,16683.594107,-10532.875763,-4128.679298,-753.443767
7661,38135,2016-05-30,63.0,43.8,5,5.0,1,1973.0,3.0,7.1,...,-115875.70462,4757.623209,-40539.381716,47120.642171,-89597.4249,-3305.452317,2872.117761,-2403.021334,-412.657307,410.509275


In [32]:
train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,pca_11,pca_12,pca_13,pca_14,pca_15,pca_16,pca_17,pca_18,pca_19,pca_20
0,1,2011-08-20,43.0,27.0,4.0,,,,,,...,174695.032119,22309.804653,-16387.32743,-11852.21196,13014.740035,4904.251773,-781.065939,2463.868197,-797.369668,-616.871677
1,2,2011-08-23,34.0,19.0,3.0,,,,,,...,-12748.067451,-30662.884595,15482.727408,-38007.747346,19797.655885,-1546.601357,-4246.742194,-1120.524633,1546.075233,-41.266596
2,3,2011-08-27,43.0,29.0,2.0,,,,,,...,39843.452313,-5319.770193,12207.239699,4022.30696,5431.396683,-2131.801647,-1717.006692,-1914.721319,2512.447237,-899.369684
3,4,2011-09-01,89.0,50.0,9.0,,,,,,...,-15337.454337,-130654.858959,2194.105901,-28573.423786,-3196.047088,14504.010367,3139.187364,-911.59453,2470.311698,-231.882604
4,5,2011-09-05,77.0,77.0,4.0,,,,,,...,-133075.345463,-7649.688238,-37007.951339,33492.258625,-197348.99801,-2686.832533,1924.027432,-1383.707559,-113.07451,696.923298
