# packages

In [3]:
#import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.options.display.max_columns = 200
pd.options.display.max_rows = 1000

#model
import xgboost as xgb

# Data

In [19]:
trainX = pd.read_csv("../Data/newXTrain.csv",header=0)
testX = pd.read_csv("../Data/newXTest.csv",header=0)
y_train_log = pd.read_csv("../Data/y_train_log.csv",header=None).as_matrix()
y_test_log = pd.read_csv("../Data/y_test_log.csv",header=None).as_matrix()

# Evaluation

In [13]:
def eva(test_pred, test_real):
    return ((test_real-test_pred)**2).mean()

## xgboost with tree booster

In [27]:
def testTrans(X_transTrain, X_transTest, y_train_log = y_train_log, y_test = y_test_log):
    
    """
    This function is for test accuracy of gradient boosting tree after Feature engineering
    """
    # add weights to the predictions
#     define loss function
#     def mse(preds, dtrain):
#         labels = dtrain.get_label()
#         if labels > 10:
#             grad = (preds - labels)*labels
#             hess = labels
#             return grad, hess
#         else: 
#             grad = preds - labels
#             hess = np.array([1]*len(grad))
#             return grad, hess

    RANDOM_STATE = 42
    params = {
        'min_child_weight': 1,
        'eta': 0.01,
        'colsample_bytree': 1,
        'max_depth': 12,
        'subsample': 0.2,
        'reg_alpha': 1,
        'gamma': 0.04,
        'silent':True,
        "verbose_eval":10,
        "eval_metric":"rmse",
        'seed': RANDOM_STATE
    }

    
    # data preparation and model training 
    xgtrain = xgb.DMatrix(X_transTrain, label=y_train_log)
    xgval = xgb.DMatrix(X_transTest, label=y_test_log)
    xgtest = xgb.DMatrix(X_transTest)
    gb_model = xgb.train(params, 
                         dtrain=xgtrain, 
                         evals=[(xgval,"validation")], 
                         early_stopping_rounds = 30,
                         num_boost_round = 2000)
    
    # model prediction and evaluation
    gb_pred = gb_model.predict(xgb.DMatrix(X_transTest))
    return eva(gb_pred, y_test_log)

In [28]:
testTrans(X_transTrain=trainX, X_transTest=testX)

[0]	validation-rmse:1.04707
Will train until validation-rmse hasn't improved in 30 rounds.
[1]	validation-rmse:1.04095
[2]	validation-rmse:1.03498
[3]	validation-rmse:1.02912
[4]	validation-rmse:1.02339
[5]	validation-rmse:1.01774
[6]	validation-rmse:1.01205
[7]	validation-rmse:1.00655
[8]	validation-rmse:1.0011
[9]	validation-rmse:0.995706
[10]	validation-rmse:0.990372
[11]	validation-rmse:0.985117
[12]	validation-rmse:0.980045
[13]	validation-rmse:0.975065
[14]	validation-rmse:0.969824
[15]	validation-rmse:0.965157
[16]	validation-rmse:0.960279
[17]	validation-rmse:0.955166
[18]	validation-rmse:0.950226
[19]	validation-rmse:0.945672
[20]	validation-rmse:0.940963
[21]	validation-rmse:0.936418
[22]	validation-rmse:0.931846
[23]	validation-rmse:0.927024
[24]	validation-rmse:0.922655
[25]	validation-rmse:0.918448
[26]	validation-rmse:0.914216
[27]	validation-rmse:0.910002
[28]	validation-rmse:0.905697
[29]	validation-rmse:0.901403
[30]	validation-rmse:0.897316
[31]	validation-rmse:0.8934

1.7841574467661661

## xgboost with linear booster

In [None]:
def testLin(X_transTrain, X_transTest, y_train_log = y_train_log, y_test = y_test_log):
    
    """
    This function is for test accuracy of gradient boosting tree after Feature engineering
    """
    # add weights to the predictions
#     define loss function
#     def mse(preds, dtrain):
#         labels = dtrain.get_label()
#         if labels > 10:
#             grad = (preds - labels)*labels
#             hess = labels
#             return grad, hess
#         else: 
#             grad = preds - labels
#             hess = np.array([1]*len(grad))
#             return grad, hess

    RANDOM_STATE = 42
    params = {

        'colsample_bytree': 1,
        'max_depth': 12,
        'subsample': 0.2,
        'reg_alpha': 1,
        'gamma': 0.04,
        'silent':True,
        "verbose_eval":10,
        "eval_metric":"rmse",
        'seed': RANDOM_STATE
    }

    
    # data preparation and model training 
    xgtrain = xgb.DMatrix(X_transTrain, label=y_train_log)
    xgval = xgb.DMatrix(X_transTest, label=y_test_log)
    xgtest = xgb.DMatrix(X_transTest)
    gb_model = xgb.train(params, 
                         dtrain=xgtrain, 
                         evals=[(xgval,"validation")], 
                         early_stopping_rounds = 30,
                         num_boost_round = 2000)
    
    # model prediction and evaluation
    gb_pred = gb_model.predict(xgb.DMatrix(X_transTest))
    return eva(gb_pred, y_test_log)

# Ensemble with boosting

In [None]:
from sklearn.cross_validation import KFold
## cv-folds
nfolds = 10
folds = KFold(len(y_train_log), n_folds = nfolds, shuffle = True, random_state = 111)

nepochs = 55
model_group = []
pred = []

early_stopping = EarlyStopping(monitor='val_loss', patience=20)

for (Tr, Te) in folds:
    train_x = X_train[Tr] 
    train_y = y_train[Tr]
    test_x = X_train[Te]
    test_y = y_train[Te]
    train_y_log = np.log(train_y+200)
    
    model = NN()
    # model will stop if overfit on the test data
    model.fit(train_x, train_y_log, validation_data = (test_x,test_y), nb_epoch=nepochs, batch_size=300,
             callbacks=[early_stopping])
    y_test_pred_log = model.predict(X_test)
    y_test_pred_log = y_test_pred_log.reshape([y_test_pred_log.shape[0],])
    y_test_pred = np.exp(y_test_pred_log)-200
    pred.append(y_test_pred)
    model_group.append(model)