## Normalize the Target (Log Transform) Before Fitting the Model

In [34]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [35]:
# Read the training and test tables from CSV files in the working directory.
# `train` supplies the 'price_doc' target used below; `test` supplies the 'id'
# column written to the submission file.
train = pd.read_csv('train_c.csv')
test = pd.read_csv('test_c.csv')

In [36]:
# Feature columns: every column of `test` after the first two.
# NOTE(review): presumably the two skipped columns are identifiers rather than
# features - confirm against the CSV layout.
col = test.columns[2:].tolist()

In [37]:
# Train on the natural log of the price; predictions are mapped back with
# np.exp before they are displayed or written out.
label = np.log(train['price_doc'])

In [38]:
# Wrap the selected feature columns in XGBoost DMatrix objects. Only the
# training matrix carries a label (the log-transformed price).
dtrain = xgb.DMatrix(data=train[col], label=label)
dtest = xgb.DMatrix(data=test[col])

In [39]:
# Booster settings shared by the cross-validation run and the final fit.
# NOTE(review): newer XGBoost releases deprecate 'reg:linear' (renamed
# 'reg:squarederror') and 'silent' (replaced by 'verbosity') - confirm against
# the installed version before changing them.
params = dict(
    eta=0.05,              # author's note: try 0.01,3,5 (presumably 0.01/0.03/0.05)
    max_depth=5,           # author's note: try 4,5,6
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

In [40]:
# Cross-validate with up to 1000 boosting rounds and early stopping (patience
# of 20 rounds), logging every 50 rounds. The length of the returned table is
# used below as the number of rounds for the final fit.
xgb_cvalid = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    seed=42,
    show_stdv=False,
    verbose_eval=50,
)
# Plot the mean train and test RMSE curves, then report the round count.
xgb_cvalid[['train-rmse-mean', 'test-rmse-mean']].plot()
print('Performance does not improve from ' + str(len(xgb_cvalid)) + ' rounds')
xgb_cvalid[['train-rmse-mean', 'test-rmse-mean']].plot()
print('Performance does not improve from '+str(len(xgb_cvalid))+' rounds')

[0]	train-rmse:14.3665	test-rmse:14.3665
[50]	train-rmse:1.20143	test-rmse:1.20766
[100]	train-rmse:0.443987	test-rmse:0.475624
[150]	train-rmse:0.419087	test-rmse:0.464555
[200]	train-rmse:0.406446	test-rmse:0.4635
[250]	train-rmse:0.395101	test-rmse:0.463046
Performance does not improve from 256 rounds


In [41]:
# Fit the final booster on the full training matrix, using as many rounds as
# the cross-validation table has rows.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=len(xgb_cvalid),
)

In [42]:
# Predict for the test matrix. The model was fit on log(price_doc), so these
# values are on the log scale.
pred = model.predict(dtest)

In [48]:
def get_feature_importance(model):
    """Tabulate the feature scores reported by ``model.get_fscore()``.

    Parameters
    ----------
    model : object exposing ``get_fscore()``
        The method must return a mapping of feature name to score (the
        notebook passes the booster returned by ``xgb.train``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Score``, ordered from the highest score to
        the lowest. The index is not reset, so each row keeps the position
        its feature had in the mapping.
    """
    pairs = list(model.get_fscore().items())
    names = [name for name, _ in pairs]
    scores = [score for _, score in pairs]
    table = pd.DataFrame({'Feature': names, 'Score': scores})
    return table.sort_values(by=['Score'], ascending=False)

In [49]:
# Display the fitted model's features ranked by score, highest first.
get_feature_importance(model)

Unnamed: 0,Feature,Score
39,full_sq,528
43,floor,205
149,life_sq,198
177,build_year,181
151,max_floor,146
224,green_zone_km,91
84,kindergarten_km,79
117,metro_min_avto,79
130,state,74
210,kitch_sq,73


In [43]:
# Undo the log transform applied to the training label so the predictions are
# shown on the original price_doc scale.
np.exp(pred)

array([ 5348592. ,  7727480. ,  4939120.5, ...,  4586150.5,  5204963. ,
        8177748. ], dtype=float32)

In [46]:
# Build the submission table: the test ids paired with the predictions mapped
# back from the log scale, then write it to CSV without the index column.
sub = test[['id']].assign(price_doc=np.exp(pred))
sub.to_csv('logtarget.csv', index=False)