In [51]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [46]:
def data(train_path='../data/train.csv', test_path='../data/test.csv', seed=None):
    """Load the train/test CSVs, shuffle the training rows and integer-encode text columns.

    Parameters
    ----------
    train_path : str
        CSV with the training rows; must contain the target column ``'y'``.
    test_path : str
        CSV with the test rows (same feature columns, no ``'y'``).
    seed : int or None
        If given, the shuffle uses its own ``np.random.RandomState(seed)`` so
        the row order is reproducible.  With ``None`` (the default) the global
        NumPy random state is used, exactly as the function did before.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)``: shuffled training features, the target
        Series in the same shuffled order, and the test features.  Every
        object-dtype column of the training frame is replaced, in both frames,
        by ``LabelEncoder`` codes fitted on the union of train and test values.
    """
    print('generating data......')
    # read datasets
    X_train = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)

    # Shuffle data.  The permutation is sized from the frame itself rather
    # than a hard-coded row count, so a file of any length is shuffled whole
    # (a fixed range would raise or silently drop rows).
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(len(X_train))
    X_train = X_train.iloc[order]

    y_train = X_train['y']
    X_train = X_train.drop('y', axis=1)

    # process type: encode each text column with one encoder fitted on
    # train + test together, so both frames share the same code table and
    # values that occur only in the test set are still known to the encoder.
    for c in X_train.columns:
        if X_train[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(X_train[c].values) + list(X_test[c].values))
            X_train[c] = lbl.transform(list(X_train[c].values))
            X_test[c] = lbl.transform(list(X_test[c].values))

    # shape
    print('Shape X_train:', X_train.shape)
    print('Shape X_test:', X_test.shape)
    return X_train, y_train, X_test


def turn():
    """Run a randomized hyper-parameter search for an ``XGBRegressor``.

    Fits ``RandomizedSearchCV`` (3-fold CV, 8 parallel jobs) on the
    module-level ``X_train`` / ``y_train``, prints the best cross-validated
    score followed by the winning parameters (sorted by name), and pickles
    those parameters to ``bestpara.p`` in the working directory.

    Returns
    -------
    dict
        The best parameter combination found (``reg.best_params_``).
    """
    boost = xgb.XGBRegressor()
    print('trunning model.....')
    parameters = {'learning_rate': [0.005],
                  'gamma': [0, 0.5],
                  'max_depth': [4, 9],
                  'min_child_weight': [1, 5],
                  "subsample": [0.6, 1],
                  'colsample_bytree': [0.6, 1],
                 }
    reg = RandomizedSearchCV(boost, parameters, n_jobs=8, cv=3, verbose=1)
    reg.fit(X_train, y_train)

    # Read the winner from best_score_ / best_params_ instead of scanning
    # grid_scores_: that attribute is deprecated on model_selection searches
    # and absent from later scikit-learn releases.
    best_parameters = reg.best_params_
    print(reg.best_score_)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    # Context manager guarantees the pickle file is flushed and closed.
    with open("bestpara.p", "wb") as handle:
        pickle.dump(best_parameters, handle)
    return best_parameters

In [3]:
# Build the shuffled, label-encoded training frame, its target and the test frame.
X_train, y_train, X_test = data()

generating data......
Shape X_train: (4209, 377)
Shape X_test: (4209, 377)


In [47]:
# --- Column names and run settings (each defined exactly once) -------------
ID = 'ID'
TARGET = 'y'
NROWS = None
SEED = 0      # shared seed for the fold split and every model
NFOLDS = 4    # number of folds used for the out-of-fold predictions

# --- File locations ---------------------------------------------------------
DATA_DIR = "../data"
TRAIN_FILE = "{0}/train.csv".format(DATA_DIR)
TEST_FILE = "{0}/test.csv".format(DATA_DIR)
SUBMISSION_FILE = "{0}/sample_submission.csv".format(DATA_DIR)

# --- Plain NumPy copies of the frames returned by data() --------------------
# The lower-case names are what get_oof() indexes with the fold indices.
x_train = np.array(X_train)
x_test = np.array(X_test)
y_train = np.array(y_train)

# Row counts (and training feature count D) taken from the 2-D frame shapes.
ntrain, D = X_train.shape
ntest, _ = X_test.shape

# Shuffled K-fold splitter over the training rows (legacy
# sklearn.cross_validation signature: sample count first, iterable directly).
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

In [48]:

class SklearnWrapper(object):
    """Uniform ``train`` / ``predict`` facade around a scikit-learn style estimator."""

    def __init__(self, clf, seed=0, params=None):
        """Instantiate ``clf`` with ``params`` plus ``random_state=seed``.

        Parameters
        ----------
        clf : callable
            Estimator class (or factory) accepting keyword arguments.
        seed : int
            Value passed to the estimator as ``random_state``.
        params : dict or None
            Keyword arguments for ``clf``.  The dict is copied, so the
            caller's object is left untouched; ``None`` means "no extra
            arguments".
        """
        # Copy first: writing 'random_state' into the caller's dict would leak
        # state between cells, and the declared default params=None would
        # otherwise raise on item assignment.
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)


class XgbWrapper(object):
    """Uniform ``train`` / ``predict`` facade around the native ``xgb.train`` API."""

    def __init__(self, seed=0, params=None):
        """Store booster parameters and the number of boosting rounds.

        Parameters
        ----------
        seed : int
            Stored in the booster parameters under ``'seed'``.
        params : dict or None
            Booster parameters.  An optional ``'nrounds'`` entry (default 250)
            is taken out and kept as ``self.nrounds``.  The dict is copied, so
            the caller's object keeps its ``'nrounds'`` key and constructing a
            second wrapper from the same dict yields the same round count.
        """
        # Copy first: popping 'nrounds' from the caller's dict would make a
        # second construction silently fall back to the 250-round default,
        # and the declared default params=None would raise on assignment.
        self.param = dict(params) if params else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on ``x_train`` / ``y_train``."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))
    

def get_oof(clf):
    """Produce out-of-fold predictions for one wrapped model.

    Relies on the module-level ``kf``, ``x_train``, ``y_train``, ``x_test``,
    ``ntrain``, ``ntest`` and ``NFOLDS``.  For every fold yielded by ``kf``
    the model is re-trained on the fold's training rows, then asked to
    predict the held-out rows and the complete test set.

    Parameters
    ----------
    clf : object
        Wrapper exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    tuple of ndarray
        ``(oof_train, oof_test)`` as column vectors: held-out predictions for
        each training row, and the per-fold test predictions averaged.
    """
    per_fold_test = np.empty((NFOLDS, ntest))
    oof_train = np.zeros((ntrain,))

    for fold, (fit_rows, held_out_rows) in enumerate(kf):
        clf.train(x_train[fit_rows], y_train[fit_rows])

        # Each training row is predicted only by the model that did not see it.
        oof_train[held_out_rows] = clf.predict(x_train[held_out_rows])
        per_fold_test[fold, :] = clf.predict(x_test)

    # Average the fold models' test-set predictions.
    oof_test = per_fold_test.mean(axis=0)
    print("get_oof, done!")
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


# Keyword arguments handed to ExtraTreesRegressor through SklearnWrapper.
et_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)

# Keyword arguments handed to RandomForestRegressor through SklearnWrapper.
rf_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.2,
    max_depth=8,
    min_samples_leaf=2,
)

# Booster parameters for XgbWrapper; 'nrounds' is not a booster option, it is
# the number of boosting rounds the wrapper extracts for xgb.train.
xgb_params = dict(
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.075,
    objective='reg:linear',
    max_depth=7,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='mae',
    nrounds=300,
)



In [49]:
# Instantiate the three base learners with the shared seed.
xg = XgbWrapper(params=xgb_params, seed=SEED)
et = SklearnWrapper(ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(RandomForestRegressor, seed=SEED, params=rf_params)

# Out-of-fold predictions per learner; the printed figure is the mean squared
# error of the held-out predictions against the training target.
xg_oof_train, xg_oof_test = get_oof(xg)
print("XG-CV: {}".format(mean_squared_error(y_true=y_train, y_pred=xg_oof_train)))

et_oof_train, et_oof_test = get_oof(et)
print("ET-CV: {}".format(mean_squared_error(y_true=y_train, y_pred=et_oof_train)))

rf_oof_train, rf_oof_test = get_oof(rf)
print("RF-CV: {}".format(mean_squared_error(y_true=y_train, y_pred=rf_oof_train)))

get_oof, done!
get_oof, done!
get_oof, done!


In [30]:
# Second-level features: one column per base learner (xgboost, extra trees,
# random forest), placed side by side.
# NOTE: this rebinds x_train / x_test, which get_oof() reads as globals, so
# the base-learner cell must not be re-run after this one without first
# re-running the configuration cell.
x_train = np.hstack([xg_oof_train, et_oof_train, rf_oof_train])
x_test = np.hstack([xg_oof_test, et_oof_test, rf_oof_test])
print("{},{}".format(x_train.shape, x_test.shape))

(4209, 3),(4209, 3)


In [40]:
# Second-level (stacker) matrices, built from the current x_train / x_test
# (the stacked out-of-fold columns when the preceding cell has been run).
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Stacker booster parameters.
# NOTE: this rebinds xgb_params, replacing the base-learner settings of the
# same name; re-run the parameter cell before rebuilding the base learners.
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

res = xgb.cv(xgb_params, dtrain, num_boost_round=700, nfold=4, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

# The history holds one row per boosting round and, with early stopping, is
# cut off after the best round, so the row count itself is the number of
# rounds to train (subtracting one would drop the best round's tree).
best_nrounds = res.shape[0]

# Select the held-out columns by name: the column order of the history frame
# differs between xgboost versions, so positional indexing could report the
# training score instead of the validation score.
metric = xgb_params['eval_metric']
cv_mean = res['test-{0}-mean'.format(metric)].iloc[-1]
cv_std = res['test-{0}-std'.format(metric)].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

[0]	train-rmse:99.9674+0.104366	test-rmse:99.9671+0.31353
[10]	train-rmse:90.4999+0.0943104	test-rmse:90.5017+0.305257
[20]	train-rmse:81.9389+0.0860536	test-rmse:81.9427+0.299534
[30]	train-rmse:74.2075+0.080076	test-rmse:74.2131+0.295443
[40]	train-rmse:67.2219+0.0710553	test-rmse:67.2297+0.295415
[50]	train-rmse:60.912+0.0650814	test-rmse:60.9213+0.297923
[60]	train-rmse:55.2149+0.0578099	test-rmse:55.2262+0.302622
[70]	train-rmse:50.0726+0.0560183	test-rmse:50.085+0.30253
[80]	train-rmse:45.4337+0.0569913	test-rmse:45.4458+0.306753
[90]	train-rmse:41.2551+0.0538111	test-rmse:41.2684+0.311885
[100]	train-rmse:37.49+0.0529563	test-rmse:37.5038+0.315789
[110]	train-rmse:34.1014+0.0564855	test-rmse:34.1167+0.316658
[120]	train-rmse:31.0496+0.0564578	test-rmse:31.0684+0.323856
[130]	train-rmse:28.3088+0.0574787	test-rmse:28.3321+0.331707
[140]	train-rmse:25.8519+0.0600282	test-rmse:25.88+0.337639
[150]	train-rmse:23.6505+0.0613297	test-rmse:23.6862+0.339524
[160]	train-rmse:21.6793+0.06

In [33]:
# Fit the final stacker on all second-level training rows, using the round
# count chosen by the cross-validation cell.
gbdt = xgb.train(xgb_params, dtrain, num_boost_round=best_nrounds)

In [37]:
# Start from the sample submission, overwrite its second column with the
# stacker's test-set predictions and save the result without the row index.
submission = pd.read_csv(SUBMISSION_FILE)
stacker_test_pred = gbdt.predict(dtest)
submission.iloc[:, 1] = stacker_test_pred
submission.to_csv('xgstacker_starter.sub.csv', index=None)