In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor





In [4]:
xgb_data = pickle.load(open('xgb.p','rb'), encoding='latin1')
et_data = pickle.load(open('et.p','rb'), encoding='latin1')
rf_data = pickle.load(open('rf.p','rb'), encoding='latin1')

L = 1
N = 3
X = np.zeros([2525, L*N])
id_col = (i for i in range(L*N))
for dt in sorted(et_data, key =  lambda x: x[1])[:L]:
    X[:,next(id_col)] = np.squeeze(dt[2])
for dt in sorted(rf_data, key =  lambda x: x[1])[:L]:
    X[:,next(id_col)] = np.squeeze(dt[2])    
for dt in sorted(xgb_data, key =  lambda x: x[1])[:L]:
    X[:,next(id_col)] = np.squeeze(dt[2])

array([[ 103.04624868,  102.02963133,  100.99092102],
       [  94.29067739,   94.27512405,   93.31434631],
       [ 112.50139923,  113.18660559,  111.1175766 ],
       ..., 
       [  76.72775424,   76.96510044,   77.38584137],
       [  93.82145784,   93.75591305,   92.98980713],
       [  94.20093244,   95.40415694,   96.18862915]])

In [152]:
def data():
    print('generating data......')
    # read datasets
    X_train = pd.read_csv('../data/train.csv') ## Shape train: (4209, 378)
    X_test = pd.read_csv('../data/test.csv') ## Shape test: (4209, 377)

    # Shuffle data
    #l = [x for x in range(4209)]
    #np.random.shuffle(l)
    #X_train = X_train.iloc[l]

    y_train = X_train['y']
    X_train = X_train.drop('y', axis = 1)

    # process type
    for c in X_train.columns:
        if X_train[c].dtype == 'object':
            lbl = LabelEncoder() 
            lbl.fit(list(X_train[c].values) + list(X_test[c].values)) 
            X_train[c] = lbl.transform(list(X_train[c].values))
            X_test[c] = lbl.transform(list(X_test[c].values))

    # shape        
    print('Shape X_train:', X_train.shape)
    print('Shape X_test:', X_test.shape)
    return X_train, y_train, X_test

In [153]:
_,y,_ = data()

generating data......
Shape X_train: (4209, 377)
Shape X_test: (4209, 377)


In [92]:
print(X.shape,y.shape)

ntrain, D = X.shape
ntest = ntrain
NFOLDS = 4
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

(4209, 15) (4209,)


In [95]:

class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    

def get_oof(clf,x_train,y_train):
    oof_train = np.zeros((ntrain,))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)
    return oof_train.reshape(-1, 1)

In [119]:
def turing(clf,N,name):
    best_err = 100000000
    data = []
    for _ in range(N):
        n_estimators = int(np.random.uniform(1,5))
        max_features = np.random.uniform(.3,1)
        max_depth = int(np.random.uniform(1,4))
        min_samples_leaf = int(np.random.uniform(1,4))
        params = {
            'n_jobs': 8,
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
        }
        model = SklearnWrapper(clf=clf, seed=SEED, params=params)
        oof_train = get_oof(model,X,y)
        err = mean_squared_error(y, oof_train)
        if best_err > err:
            best_err = err
            best_para = params
        print(err,best_err, params)
        data += [(params,err,oof_train)]

    pickle.dump(data,open("{}.p".format(name),'wb'))    

In [120]:
N = 40
SEED = 0
turing(ExtraTreesRegressor,N,'stacking')

160.713749202 160.713749202 {'n_jobs': 8, 'n_estimators': 2, 'max_features': 0.7609474949924904, 'max_depth': 2, 'min_samples_leaf': 1, 'random_state': 0}
161.025558534 160.713749202 {'n_jobs': 8, 'n_estimators': 1, 'max_features': 0.46777239047970487, 'max_depth': 2, 'min_samples_leaf': 1, 'random_state': 0}
161.054009083 160.713749202 {'n_jobs': 8, 'n_estimators': 1, 'max_features': 0.9315722168678309, 'max_depth': 3, 'min_samples_leaf': 3, 'random_state': 0}
160.840286741 160.713749202 {'n_jobs': 8, 'n_estimators': 3, 'max_features': 0.8792262513195204, 'max_depth': 1, 'min_samples_leaf': 1, 'random_state': 0}
161.032906289 160.713749202 {'n_jobs': 8, 'n_estimators': 3, 'max_features': 0.8305348515374193, 'max_depth': 3, 'min_samples_leaf': 3, 'random_state': 0}
160.818734901 160.713749202 {'n_jobs': 8, 'n_estimators': 3, 'max_features': 0.9062840305129642, 'max_depth': 2, 'min_samples_leaf': 2, 'random_state': 0}
160.885766209 160.713749202 {'n_jobs': 8, 'n_estimators': 1, 'max_fea

In [114]:
mean_squared_error(y, np.mean(X, axis=1))

252.9747323721036

In [124]:
ET_para = sorted(pickle.load(open('stacking.p','rb')),key = lambda x: x[1])[0][0]

In [126]:
ET = SklearnWrapper(ExtraTreesRegressor,params=ET_para)

In [127]:
ET.train(X,y)

In [129]:
y

3996    108.82
2722    108.54
1285    111.58
304      89.03
3249     87.83
65       84.15
2423    117.14
2944     99.26
938     117.89
1010    105.43
1186    116.00
1280     93.93
1374     91.62
1042     90.59
3755     91.77
3786    109.11
2068    111.08
2952     93.19
8       108.67
2111    107.11
1739    105.79
1270    106.09
4063    115.88
2050    112.07
3233    116.65
567      88.20
999     110.24
2039    124.64
793      96.51
2410    105.13
         ...  
2125     98.32
1962    114.80
2187     78.88
992     101.33
849     112.54
1271    111.31
1781     90.68
4031     99.38
354     115.50
1486     91.53
754     111.48
3795    107.37
1667    132.86
1544    111.42
19       90.81
1817     99.99
2146    109.10
2573     97.28
545      92.83
663     105.87
3498    109.13
1776    105.56
1301     76.18
1527     91.07
3120     92.13
4083     89.60
2693     94.85
2333     92.76
3427     89.28
3750     98.85
Name: y, Length: 4209, dtype: float64

In [140]:
mean_squared_error(X[:,1],y)

251.4159135534282

In [128]:
ET.predict(X)

array([ 101.03319275,  100.68383471,  100.45419462, ...,  100.40399861,
        101.03319275,  101.03319275])