In [1]:
import pandas as pd
import numpy as np

In [23]:
# Read the train and test CSV files (paths are relative to the working directory).
train, test = map(pd.read_csv, ('train_c.csv', 'test_c.csv'))

In [24]:
# Candidate feature columns: every column of `test` after the first two.
col = test.columns[2:].tolist()
# Of those, the columns whose name contains the substring 'ID'.
ID_related = [name for name in col if 'ID' in name]

In [25]:
# Show which column names were picked up by the 'ID' substring filter.
ID_related

['ID_metro',
 'ID_railroad_station_walk',
 'ID_railroad_station_avto',
 'ID_big_road1',
 'ID_big_road2',
 'ID_railroad_terminal',
 'ID_bus_terminal']

In [26]:
for each in ID_related:
    # Encode train and test against ONE shared vocabulary. Factorizing each frame
    # on its own assigns codes by rank within that frame, so the same raw ID can
    # end up with different integers in train and test whenever their sets of
    # unique values differ -- the model would then see mismatched encodings.
    combined = pd.concat([train[each], test[each]], ignore_index=True)
    # sort=True keeps the codes ordered like the raw values; missing values get -1.
    codes = pd.factorize(combined, sort=True)[0]
    n_train = len(train)
    # The first n_train codes belong to train rows, the rest to test rows.
    train[each] = codes[:n_train]
    test[each] = codes[n_train:]

## Refit XGBoost on all features first to get feature importances

In [35]:
def get_feature_importance(model):
    """Tabulate the feature scores reported by ``model.get_fscore()``.

    Parameters
    ----------
    model
        Any object whose ``get_fscore()`` returns a mapping of
        feature name -> score (here, the booster returned by ``xgb.train``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Score``, one row per entry of the mapping,
        ordered from the highest score to the lowest. Row labels keep the
        position each entry had in the mapping.
    """
    fscores = model.get_fscore()
    table = pd.DataFrame({
        'Feature': list(fscores.keys()),
        'Score': list(fscores.values()),
    })
    return table.sort_values(by='Score', ascending=False)

In [30]:
import xgboost as xgb



In [28]:
# Training target: price_doc scaled by 0.95, then shifted by +10.
# NOTE(review): the notebook does not say why these constants were chosen -- confirm.
label = train['price_doc'].mul(0.95).add(10)

In [31]:
# Wrap the candidate feature columns in xgboost DMatrix containers;
# only the training matrix carries the target.
dtrain = xgb.DMatrix(data=train[col], label=label)
dtest = xgb.DMatrix(data=test[col])

In [32]:
# Booster settings passed to every xgb.cv / xgb.train call below.
params = dict(
    # Tuning note carried over from the original: "Try 0.01,3,5"
    # (presumably 0.01 / 0.03 / 0.05 -- confirm).
    eta=0.05,
    # Tuning note carried over from the original: try 4, 5, 6.
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

In [33]:
# Cross-validate with early stopping to choose the number of boosting rounds.
# nfold is not passed, so xgb.cv falls back to its library default.
xgb_cvalid = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
    seed=42,
)
# Plot train vs. held-out RMSE per boosting round.
xgb_cvalid.loc[:, ['train-rmse-mean', 'test-rmse-mean']].plot()
# The number of rows kept in the CV result is reported as the round count.
print('Performance does not improve from ' + str(len(xgb_cvalid)) + ' rounds')

[0]	train-rmse:7.7958e+06	test-rmse:7.80037e+06
[50]	train-rmse:2.39673e+06	test-rmse:2.76554e+06
[100]	train-rmse:2.08028e+06	test-rmse:2.60207e+06
[150]	train-rmse:1.96621e+06	test-rmse:2.56766e+06
[200]	train-rmse:1.88547e+06	test-rmse:2.54829e+06
[250]	train-rmse:1.81905e+06	test-rmse:2.53698e+06
[300]	train-rmse:1.76224e+06	test-rmse:2.53218e+06
[350]	train-rmse:1.71035e+06	test-rmse:2.52561e+06
[400]	train-rmse:1.66281e+06	test-rmse:2.52316e+06
[450]	train-rmse:1.61891e+06	test-rmse:2.52188e+06
Performance does not improve from 450 rounds


In [34]:
# Train on the full training matrix for as many rounds as the CV result has rows.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=len(xgb_cvalid))

In [36]:
# Table of the features the trained model reports a score for, highest score first.
feature_importance = get_feature_importance(model)

In [39]:
# Names of the scored features, in descending-score order.
features = feature_importance['Feature'].tolist()

## Fit model on scored features only

In [40]:
# Rebuild the DMatrix containers using only the scored features.
# Note: this rebinds dtrain/dtest, replacing the all-feature matrices built earlier.
dtrain = xgb.DMatrix(data=train[features], label=label)
dtest = xgb.DMatrix(data=test[features])

In [41]:
# Repeat the early-stopped cross-validation on the reduced feature set
# (same settings as the all-feature run, so the two curves are comparable).
xgb_cvalid = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
    seed=42,
)
# Plot train vs. held-out RMSE per boosting round.
xgb_cvalid.loc[:, ['train-rmse-mean', 'test-rmse-mean']].plot()
# The number of rows kept in the CV result is reported as the round count.
print('Performance does not improve from ' + str(len(xgb_cvalid)) + ' rounds')

[0]	train-rmse:7.79522e+06	test-rmse:7.79928e+06
[50]	train-rmse:2.39988e+06	test-rmse:2.76922e+06
[100]	train-rmse:2.07451e+06	test-rmse:2.60385e+06
[150]	train-rmse:1.96488e+06	test-rmse:2.5683e+06
[200]	train-rmse:1.87885e+06	test-rmse:2.55077e+06
[250]	train-rmse:1.81153e+06	test-rmse:2.54036e+06
[300]	train-rmse:1.75277e+06	test-rmse:2.53154e+06
[350]	train-rmse:1.70301e+06	test-rmse:2.53002e+06
[400]	train-rmse:1.65549e+06	test-rmse:2.52552e+06
[450]	train-rmse:1.61311e+06	test-rmse:2.52289e+06
[500]	train-rmse:1.57576e+06	test-rmse:2.5228e+06
Performance does not improve from 489 rounds
