In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor



In [2]:
def data(train_path='../data/train.csv', test_path='../data/test.csv', seed=None):
    """Load the train/test CSVs, shuffle the training rows and label-encode text columns.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the CSV files. The defaults are the paths this function
        previously hard-coded, so ``data()`` behaves as before.
    seed : int or None
        Seed for the row shuffle. ``None`` (default) draws from NumPy's global
        random state; an integer makes the shuffle reproducible without
        touching the global state.

    Returns
    -------
    X_train : pandas.DataFrame
        Shuffled training features with the ``'y'`` column removed.
    y_train : pandas.Series
        The ``'y'`` column, in the same (shuffled) row order as ``X_train``.
    X_test : pandas.DataFrame
        Test features, encoded with the same label mapping as ``X_train``.
    """
    print('generating data......')
    # read datasets
    X_train = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)

    # Shuffle data. The permutation is sized from the frame itself instead of a
    # hard-coded row count, so files of any length are handled.
    order = list(range(len(X_train)))
    if seed is None:
        np.random.shuffle(order)
    else:
        np.random.RandomState(seed).shuffle(order)
    X_train = X_train.iloc[order]

    y_train = X_train['y']
    X_train = X_train.drop('y', axis=1)

    # process type: replace every text column by integer codes. The encoder is
    # fitted on train + test together so both frames share one mapping and a
    # category that only occurs in the test set does not break transform().
    for c in X_train.columns:
        column = X_train[c]
        # Also catches pandas' dedicated string dtype, not just 'object'.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            lbl = LabelEncoder()
            lbl.fit(list(X_train[c].values) + list(X_test[c].values))
            X_train[c] = lbl.transform(list(X_train[c].values))
            X_test[c] = lbl.transform(list(X_test[c].values))

    # shape
    print('Shape X_train:', X_train.shape)
    print('Shape X_test:', X_test.shape)
    return X_train, y_train, X_test


def turn():
    """Run a randomized hyper-parameter search for an XGBoost regressor.

    Fits ``RandomizedSearchCV`` (3-fold CV, 8 parallel jobs) on the
    module-level ``X_train`` / ``y_train``, prints the best mean CV score and
    the winning parameters, stores them in ``bestpara.p`` and returns them.

    Returns
    -------
    dict
        The best parameter combination found (``reg.best_params_``).
    """
    boost = xgb.XGBRegressor()
    print('trunning model.....')
    parameters = {'learning_rate': [0.005],
                  'gamma': [0,0.5],
                  'max_depth': [4, 9],
                  'min_child_weight': [1,5],
                  "subsample": [0.6,1],
                  'colsample_bytree': [0.6,1],
                 }
    reg = RandomizedSearchCV(boost, parameters, n_jobs=8, cv=3, verbose = 1)
    reg.fit(X_train, y_train)

    # best_score_ / best_params_ replace the lookup through grid_scores_,
    # which sklearn.model_selection searches no longer provide.
    best_parameters = reg.best_params_
    print(reg.best_score_)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    # Context manager so the file is flushed and closed deterministically.
    with open("bestpara.p", "wb") as fh:
        pickle.dump(best_parameters, fh)
    return best_parameters

In [3]:
# Build the encoded frames once; the following cells read these globals.
X_train, y_train, X_test = data()

generating data......
Shape X_train: (4209, 377)
Shape X_test: (4209, 377)


In [4]:
# Imported under an alias so the global name `KFold` keeps pointing at
# whatever the import cell bound it to.
from sklearn.model_selection import KFold as _SplitKFold

# --- run configuration (each constant defined exactly once) ---
ID = 'ID'
TARGET = 'y'
NFOLDS = 4   # number of cross-validation folds
SEED = 0     # seed for the fold split; also passed to the models
NROWS = None
DATA_DIR = "../data"

TRAIN_FILE = "{0}/train.csv".format(DATA_DIR)
TEST_FILE = "{0}/test.csv".format(DATA_DIR)
SUBMISSION_FILE = "{0}/sample_submission.csv".format(DATA_DIR)

# ndarray copies of the frames; get_oof indexes these by integer position.
x_train = np.array(X_train)
x_test = np.array(X_test)
y_train = np.array(y_train)

ntrain, D = X_train.shape
ntest, _ = X_test.shape

# The positional `KFold(n, n_folds=...)` form belongs to the legacy
# sklearn.cross_validation module. Build the folds with the model_selection
# API instead and materialise them as a list of (train_idx, test_idx) pairs,
# so code that iterates `kf` directly keeps working and can do so repeatedly.
kf = list(_SplitKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(x_train))

In [5]:

class SklearnWrapper(object):
    """Adapter that gives an estimator class a uniform train/predict interface.

    Parameters
    ----------
    clf : callable
        Estimator class or factory; it is called as ``clf(**params)``.
    seed : int
        Value stored under ``params['random_state']`` before the estimator is
        built.
    params : dict or None
        Keyword arguments for ``clf``. ``None`` means no extra arguments.
        When a dict is passed it is updated in place with the
        ``random_state`` entry, so the caller's dict records the seed used.
    """
    def __init__(self, clf, seed=0, params=None):
        # The default used to crash with TypeError on item assignment.
        if params is None:
            params = {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and targets."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

def get_oof(clf):
    """Compute out-of-fold predictions for ``clf`` over the global folds ``kf``.

    For each (train, hold-out) index pair in ``kf`` the model is trained on
    the train part of the global ``x_train`` / ``y_train``, then predicts the
    hold-out rows and the whole of ``x_test``.

    Returns
    -------
    (ndarray, ndarray)
        Column vectors of shape ``(ntrain, 1)`` and ``(ntest, 1)``: the
        hold-out prediction for every training row, and the test predictions
        averaged over the folds.
    """
    holdout_preds = np.zeros((ntrain,))
    per_fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(kf):
        clf.train(x_train[fit_idx], y_train[fit_idx])

        # Each training row receives a prediction from the one model that
        # did not see it during fitting.
        holdout_preds[holdout_idx] = clf.predict(x_train[holdout_idx])
        per_fold_test_preds[fold, :] = clf.predict(x_test)

    test_preds = np.zeros((ntest,))
    test_preds[:] = per_fold_test_preds.mean(axis=0)
    return holdout_preds.reshape(-1, 1), test_preds.reshape(-1, 1)

In [17]:
def turing(clf=None, n_iter=2):
    """Random-search tree-ensemble hyper-parameters using out-of-fold MSE.

    Each iteration draws ``n_estimators``, ``max_features``, ``max_depth``
    and ``min_samples_leaf`` from NumPy's global random state, builds ``clf``
    through ``SklearnWrapper`` and scores it with ``get_oof`` against the
    global ``y_train``. All ``(params, error)`` pairs are pickled to
    ``"<repr of clf>.p"``.

    Parameters
    ----------
    clf : estimator class or None
        Class to tune. ``None`` (default) means ``ExtraTreesRegressor``.
    n_iter : int
        Number of random parameter draws (default 2).

    Returns
    -------
    (dict or None, float)
        Parameters with the lowest error and that error; ``(None, inf)`` when
        no iteration improved on the initial bound (e.g. ``n_iter == 0``).
    """
    if clf is None:
        clf = ExtraTreesRegressor

    best_err = float('inf')
    best_para = None
    results = []  # local list; no longer shadows the data() function
    for _ in range(n_iter):
        # Same draw as before, clamped so a draw below 1.0 cannot produce an
        # estimator with zero trees.
        n_estimators = max(1, int(np.random.uniform(0,900)))
        max_features = np.random.uniform(0,1)
        max_depth = int(np.random.uniform(1,10))
        min_samples_leaf = int(np.random.uniform(1,7))
        params = {
            'n_jobs': 16,
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
        }
        model = SklearnWrapper(clf=clf, seed=SEED, params=params)
        oof_train, oof_test = get_oof(model)
        err = mean_squared_error(y_train, oof_train)
        if best_err > err:
            best_err = err
            # Was `et_params`, an undefined name that raised NameError.
            best_para = params
        print(err,best_err, params)
        results += [(params,err)]

    # File name follows the class actually tuned (it was hard-wired to
    # ExtraTreesRegressor); the context manager closes the handle.
    with open("{}.p".format(clf), 'wb') as fh:
        pickle.dump(results, fh)
    return best_para, best_err

In [18]:
# Pass the estimator class explicitly: turing() requires it.
turing(ExtraTreesRegressor)

70.6114166261 70.6114166261 {'n_jobs': 16, 'n_estimators': 222, 'max_features': 0.37109603499761, 'max_depth': 8, 'min_samples_leaf': 3, 'random_state': 0}
70.2147482479 70.2147482479 {'n_jobs': 16, 'n_estimators': 737, 'max_features': 0.230193291236094, 'max_depth': 7, 'min_samples_leaf': 4, 'random_state': 0}


In [19]:
# Reload the saved (params, error) pairs; the context manager closes the file.
# Only unpickle files this notebook wrote itself - pickle can execute code.
with open("<class 'sklearn.ensemble.forest.ExtraTreesRegressor'>.p", "rb") as fh:
    tuning_log = pickle.load(fh)
tuning_log

[({'max_depth': 8,
   'max_features': 0.37109603499761,
   'min_samples_leaf': 3,
   'n_estimators': 222,
   'n_jobs': 16,
   'random_state': 0},
  70.611416626103335),
 ({'max_depth': 7,
   'max_features': 0.230193291236094,
   'min_samples_leaf': 4,
   'n_estimators': 737,
   'n_jobs': 16,
   'random_state': 0},
  70.214748247858381)]