In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn import preprocessing
from sklearn.cross_validation import KFold

from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [2]:
# Load 'train.csv' (resolved relative to the current working directory) into
# the `train` DataFrame that every later cell reads from.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Every object-dtype column of `train` is treated as a categorical feature.
cat_feats = train.select_dtypes(include=["object"]).columns

# For each categorical column, add an integer-coded companion column named
# "<column>_id". A fresh LabelEncoder is fitted per column, so the codes of
# one column are unrelated to the codes of any other.
for feat in cat_feats:
    encoder = preprocessing.LabelEncoder()
    train[feat + '_id'] = encoder.fit_transform(train[feat].values)

In [4]:
# Pick the model inputs by column-name convention: a name containing 'cont'
# goes to the numeric features, a name containing '_id' goes to the
# integer-coded categorical features. The two checks are independent, so a
# column whose name contains both substrings lands in both lists.
num_feats, id_feats = [], []
for column in train.columns:
    if 'cont' in column:
        num_feats.append(column)
    if '_id' in column:
        id_feats.append(column)

# Feature matrix (numeric columns first, then encoded categoricals) and the
# regression target, both as plain NumPy arrays.
feature_columns = num_feats + id_feats
X = train[feature_columns].values
y = train['loss'].values

In [8]:
# Hyper-parameters of the boosted-tree regressor, gathered in one mapping so
# they can be inspected or adjusted before the estimator is built.
xgb_params = dict(
    max_depth=12,
    learning_rate=0.2,
    n_estimators=3,
    silent=0,
    objective='reg:linear',
    nthread=-1,
    subsample=0.6,
    colsample_bytree=0.6,
    seed=2017,
)
# Optional settings kept for reference but not applied:
#   gamma = 5290.
#   min_child_weight = 4.2922

model = xgb.XGBRegressor(**xgb_params)

In [17]:
# Shuffled K-fold cross-validation of `model` on (X, y), scored with MAE.
nfolds = 3
folds = KFold(len(y), n_folds=nfolds, shuffle = True, random_state = 2017)

# MAE of every fold, kept so an overall estimate can be reported at the end
# instead of only the individual per-fold lines.
scores = []

for num_iter, (train_index, test_index) in enumerate(folds):
    # Slice each fold exactly once; the same arrays are reused for fitting,
    # for the evaluation sets and for scoring, which avoids copying the data
    # a second time through repeated fancy indexing.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test   = X[test_index], y[test_index]

    # eval_set only drives the per-round MAE log (train fold first, held-out
    # fold second); no early stopping is requested here.
    model.fit(X_train, y_train,
       eval_metric='mae',
       eval_set=[(X_train, y_train), (X_test, y_test)],
       verbose=True)

    y_pred = model.predict(X_test)
    # Predictions below zero are floored at zero before scoring.
    y_pred[y_pred<0] = 0

    score = mean_absolute_error(y_test, y_pred)
    scores.append(score)
    print("Fold{0}, score={1}".format(num_iter+1, score))

# Aggregate view of the run: mean and spread of the per-fold MAE values.
print("Mean score={0}, std={1}".format(np.mean(scores), np.std(scores)))

[0]	validation_0-mae:2445.53	validation_1-mae:2457.49
[1]	validation_0-mae:1998.29	validation_1-mae:2018.68
[2]	validation_0-mae:1684.17	validation_1-mae:1718.06
[3]	validation_0-mae:1472.59	validation_1-mae:1523.3
[4]	validation_0-mae:1340.96	validation_1-mae:1407.47
[5]	validation_0-mae:1250.06	validation_1-mae:1333.96
[6]	validation_0-mae:1194.43	validation_1-mae:1292.69
[7]	validation_0-mae:1153.1	validation_1-mae:1262.62
[8]	validation_0-mae:1124.19	validation_1-mae:1246.91
[9]	validation_0-mae:1101.67	validation_1-mae:1237.68
[10]	validation_0-mae:1083.74	validation_1-mae:1232.83
[11]	validation_0-mae:1069.49	validation_1-mae:1230.73
[12]	validation_0-mae:1057.9	validation_1-mae:1229.59
[13]	validation_0-mae:1047.65	validation_1-mae:1229.64
[14]	validation_0-mae:1039.6	validation_1-mae:1229.57
[15]	validation_0-mae:1030.16	validation_1-mae:1229.64
[16]	validation_0-mae:1021.05	validation_1-mae:1230.18
[17]	validation_0-mae:1014.16	validation_1-mae:1230
[18]	validation_0-mae:1008.

## Task

The cell above trains a model that uses `y` as the target variable.
Modify the code so that the model is trained on the log-transformed target variable instead, and convert its predictions back to the original scale before scoring.


Some tips:
1. y_log_train = np.log(y_train)
2. model.fit(X_train, y_log_train, ...
3. y_log_pred = model.predict(X_test)
4. y_pred = np.exp(y_log_pred)
