In [18]:
%matplotlib inline
import xgboost as xgb
from sklearn.decomposition import FastICA
import matplotlib.pyplot as plt
from ggplot import *
import pandas as pd
import numpy as np

## Load data

In [33]:
# Input CSVs are read relative to the notebook's working directory.
TRAIN_CSV = 'train_all_features.csv'
TEST_CSV = 'test_all_features.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# Regression target. It is captured here because `train` is later reduced
# to a subset of feature columns.
label = train['price_doc']

In [34]:
# Hand-picked "strong" feature names. The order is kept exactly as it was
# written originally (how it was chosen is not visible in this notebook).
# The two t-SNE entries are stripped out again in the next cell.
strong_features = [
    'full_sq', 'life_sq', 'extra_area', 'floor', 'build_year',
    'max_floor', 'rel_kitch_sq', 'micex_cbi_tr', 'eurrub', 'kindergarten_km',
    'kitch_sq', 'micex_rgbi_tr', 'tsne_1', 'floor_ratio', 'state',
    'room_avg_size', 'hospice_morgue_km', 'num_room', 'railroad_km', 'room_size',
    'balance_trade', 'school_km', 'mosque_km', 'public_healthcare_km', 'tsne_2',
    'radiation_km', 'green_zone_km', 'public_transport_station_km', 'metro_min_avto', 'cemetery_km',
]

In [35]:
# Candidate feature pool: every column of `test` after the first two
# (presumably identifier-like columns -- TODO confirm which two are skipped).
# The t-SNE embedding columns are excluded from both feature lists.
#
# Filtering is used instead of list.remove() so the cell can be re-run
# safely: list.remove() raises ValueError once the names are already gone,
# and it mutated the `strong_features` list from an earlier cell in place.
# Unlike list.remove(), filtering does not fail when a t-SNE column is absent.
tsne_features = ('tsne_1', 'tsne_2')
features_all = [col for col in list(test.columns)[2:] if col not in tsne_features]
strong_features = [col for col in strong_features if col not in tsne_features]

In [36]:
# Everything in the candidate pool that is not a "strong" feature, in the
# same order as `features_all`. A set is used only for the membership test.
strong_lookup = set(strong_features)
other_feature = [name for name in features_all if name not in strong_lookup]

In [37]:
# Restrict both frames to the non-strong feature columns.
# .copy() makes each result an independent DataFrame. Without it, the
# column assignments done in a later cell (train['ica_…'] = …) are made on
# the result of a selection, which triggers pandas' SettingWithCopyWarning.
# NOTE: this rebinds `train`/`test`; the target was already saved in `label`.
train = train[other_feature].copy()
test = test[other_feature].copy()

## Run ICA on the other (non-strong) features

In [38]:
# Inputs for the ICA step: the non-strong columns with every missing value
# replaced by a sentinel (presumably because FastICA does not accept NaN --
# TODO confirm). fillna returns new frames, so `train`/`test` keep their NaNs.
MISSING_SENTINEL = -999
ICA_train = train[other_feature].fillna(MISSING_SENTINEL)
ICA_test = test[other_feature].fillna(MISSING_SENTINEL)

In [39]:
# Reduce the sentinel-filled non-strong features to `n_comp` ICA components.
# The decomposition is fitted on the training frame only; the already-fitted
# model is then applied to the test frame so both share the same unmixing.
# random_state is fixed so repeated runs give the same components.
n_comp = 10
ica = FastICA(n_components=n_comp, random_state=42)
ica_results_train = ica.fit_transform(ICA_train)
ica_results_test = ica.transform(ICA_test)

In [40]:
# Append each ICA component as a column named ica_1 … ica_<n_comp> to both
# frames (column k of the result array becomes 'ica_<k+1>').
for comp_idx in range(n_comp):
    column_name = 'ica_' + str(comp_idx + 1)
    train[column_name] = ica_results_train[:, comp_idx]
    test[column_name] = ica_results_test[:, comp_idx]

## Train XGBoost on the new features

In [41]:
# Affine rescaling of the target before training.
# NOTE(review): the origin of these two constants is not shown in this
# notebook -- confirm and document. This cell rebinds `label` from itself,
# so running it more than once compounds the rescaling.
LABEL_SCALE = 0.969
LABEL_SHIFT = 10
label = LABEL_SCALE * label + LABEL_SHIFT

In [43]:
# Wrap the feature frames for xgboost; only the training matrix has a target.
dtrain = xgb.DMatrix(train, label=label)
dtest = xgb.DMatrix(test)

In [70]:
# Booster hyper-parameters shared by the CV run below and the final training.
# NOTE(review): newer xgboost releases renamed 'reg:linear' to
# 'reg:squarederror' and replaced 'silent' with 'verbosity' -- confirm against
# the installed version before changing anything.
params = {
    'eta': 0.05,               # learning rate (original note: "Try 0.01,3,5")
    'max_depth': 3,            # tree depth (original note: "Try 4,5,6")
    'subsample': 0.5,          # row sampling ratio
    'colsample_bytree': 0.5,   # column sampling ratio
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
    'lambda': 10,              # L2 regularisation weight
    'min_child_weight': 7,
}

# Cross-validate for up to 1000 rounds, stopping early after 50 rounds
# without improvement and logging every 50th round.
cv_options = {
    'num_boost_round': 1000,
    'early_stopping_rounds': 50,
    'verbose_eval': 50,
    'show_stdv': False,
    'seed': 42,
}
xgb_cvalid = xgb.cv(params, dtrain, **cv_options)

# Plot the train and test RMSE curves from the CV history.
rmse_curves = xgb_cvalid[['train-rmse-mean', 'test-rmse-mean']]
rmse_curves.plot()

# The reported round count is the number of rows in the CV history.
n_rounds_kept = len(xgb_cvalid)
print('Performance does not improve from ' + str(n_rounds_kept) + ' rounds')

[0]	train-rmse:7.9605e+06	test-rmse:7.96182e+06
[50]	train-rmse:2.84723e+06	test-rmse:2.96418e+06
[100]	train-rmse:2.49772e+06	test-rmse:2.68116e+06
[150]	train-rmse:2.3738e+06	test-rmse:2.59795e+06
[200]	train-rmse:2.29243e+06	test-rmse:2.55567e+06
[250]	train-rmse:2.23165e+06	test-rmse:2.52721e+06
[300]	train-rmse:2.18204e+06	test-rmse:2.5037e+06
[350]	train-rmse:2.13918e+06	test-rmse:2.48484e+06
[400]	train-rmse:2.10472e+06	test-rmse:2.47269e+06
[450]	train-rmse:2.07212e+06	test-rmse:2.46274e+06
[500]	train-rmse:2.04301e+06	test-rmse:2.45545e+06
[550]	train-rmse:2.01395e+06	test-rmse:2.44571e+06
[600]	train-rmse:1.98775e+06	test-rmse:2.43912e+06
[650]	train-rmse:1.96392e+06	test-rmse:2.43234e+06
[700]	train-rmse:1.94099e+06	test-rmse:2.42764e+06
[750]	train-rmse:1.91888e+06	test-rmse:2.42313e+06


KeyboardInterrupt: 

In [57]:
# Train the final booster on the full training matrix for a fixed number of
# rounds.
# NOTE(review): 450 is hard-coded; the CV log recorded in this notebook still
# shows test RMSE falling beyond round 450 -- confirm this value is intended.
NUM_BOOST_ROUND = 450
model_ica = xgb.train(params, dtrain, num_boost_round=NUM_BOOST_ROUND)

In [59]:
# Test-set predictions from the trained booster.
# NOTE(review): this is the same call as the `pred = ...` cell; the two
# results differ only if `model_ica` was retrained between the two runs
# (the execution counts suggest the cells were run out of order).
pred2 = model_ica.predict(dtest)

In [46]:
# Test-set predictions from the trained booster.
# NOTE(review): duplicates the `pred2 = ...` cell -- keep one of the two or
# make explicit which model state each is meant to capture.
pred = model_ica.predict(dtest)

In [52]:
# Reference predictions: the 'price_doc' column of a previously saved
# submission file, used below as the baseline for comparison.
best_submission = pd.read_csv('sub.csv')
best = best_submission['price_doc']

In [53]:
def mean_error(line1,line2):
    """Return the mean signed difference ``line1 - line2``.

    Elements are paired positionally with ``zip``, so if the inputs differ
    in length the surplus elements of the longer one are ignored. The
    differences keep their sign, so positive and negative errors cancel;
    the result measures average bias, not average magnitude.

    Parameters
    ----------
    line1, line2 : iterables of numbers
        Sequences to compare element by element.

    Returns
    -------
    The value of ``np.mean`` over the pairwise differences.
    """
    differences = []
    for left, right in zip(line1, line2):
        differences.append(left - right)
    return np.mean(differences)

In [67]:
# Mean signed difference between the reference submission and `AVeraged`.
# NOTE(review): `AVeraged` is not defined in any cell visible in this
# notebook, so this fails on a fresh kernel -- restore the cell that builds it.
mean_error(best,AVeraged)

-1936710.9958317019

In [68]:
# Display the averaged predictions.
# NOTE(review): `AVeraged` is not defined in any cell visible in this
# notebook; its construction must be restored for Restart & Run All to work.
AVeraged

array([  4947072.5,   9062619. ,   4766576. , ...,   7028671.5,
         4369874. ,  11795748. ], dtype=float32)