In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt



In [2]:
def data_with_val(ratio=0.5, seed=0):
    """Load the train/test CSVs and split the training rows into train/validation parts.

    The training rows are shuffled with a fixed seed, the first
    ``int(ratio * n_rows)`` shuffled rows become the training part and the
    remaining rows become the validation part. Every object-dtype column is
    label-encoded with an encoder fitted on the union of the train and test
    values, so all three frames share one integer coding.

    Parameters
    ----------
    ratio : float, default 0.5
        Fraction of the training file assigned to the training part.
    seed : int, default 0
        Seed passed to ``np.random.seed``. This reseeds numpy's *global*
        generator, so it also fixes every later ``np.random`` draw made in
        the same session.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test
        Feature frames (without the ``'y'`` column) and target series.
    """
    print('generating data......')
    # read datasets
    train_total = pd.read_csv('../data/train.csv') ## expected shape: (4209, 378)
    X_test = pd.read_csv('../data/test.csv') ## expected shape: (4209, 377)

    # Shuffle data. The row count is taken from the frame itself instead of
    # being hard-coded, so a file of a different length is handled correctly.
    n_rows = len(train_total)
    np.random.seed(seed)
    order = list(range(n_rows))
    np.random.shuffle(order)
    train_total = train_total.iloc[order]

    # split data
    threshold = int(ratio * n_rows)
    train_part = train_total.iloc[:threshold]
    val_part = train_total.iloc[threshold:]
    y_train = train_part['y']
    X_train = train_part.drop('y', axis = 1)
    y_val = val_part['y']
    X_val = val_part.drop('y', axis = 1)

    # process type: encode categorical (object) columns as integers
    for c in train_total.columns:
        if train_total[c].dtype == 'object':
            lbl = LabelEncoder()
            # Fit on train + test so labels present only in the test file
            # still receive a code.
            lbl.fit(list(train_total[c].values) + list(X_test[c].values))
            X_train[c] = lbl.transform(list(X_train[c].values))
            X_val[c] = lbl.transform(list(X_val[c].values))
            X_test[c] = lbl.transform(list(X_test[c].values))

    # shape
    print('Shape X_train:', X_train.shape)
    print('Shape X_test:', X_test.shape)
    print('Shape X_val:', X_val.shape )
    return X_train, y_train, X_val, y_val, X_test

def data():
    """Load the train/test CSVs without a validation split.

    Rows keep their file order (no shuffling is applied here). The ``'y'``
    column is separated from the training features, and every object-dtype
    feature column is label-encoded with an encoder fitted on the combined
    train and test values.

    Returns
    -------
    X_train, y_train, X_test
    """
    print('generating data......')
    # read datasets
    X_train = pd.read_csv('../data/train.csv')  # expected shape: (4209, 378)
    X_test = pd.read_csv('../data/test.csv')  # expected shape: (4209, 377)

    # separate the target from the features
    y_train = X_train['y']
    X_train = X_train.drop('y', axis = 1)

    # integer-encode the categorical columns
    categorical = [col for col in X_train.columns if X_train[col].dtype == 'object']
    for col in categorical:
        encoder = LabelEncoder()
        train_values = list(X_train[col].values)
        test_values = list(X_test[col].values)
        encoder.fit(train_values + test_values)
        X_train[col] = encoder.transform(train_values)
        X_test[col] = encoder.transform(test_values)

    # report shapes
    print('Shape X_train:', X_train.shape)
    print('Shape X_test:', X_test.shape)
    return X_train, y_train, X_test

In [3]:
# Build the train / validation / test frames used by the rest of the notebook.
(X_train, y_train,
 X_val, y_val,
 X_test) = data_with_val()
# Alternative without a held-out split (validation aliases the training data):
#X_train,y_train, X_test = data()
#X_val,y_val = X_train[:],y_train[:]

generating data......
('Shape X_train:', (2104, 377))
('Shape X_test:', (4209, 377))
('Shape X_val:', (2105, 377))


In [4]:
# Global data
#
# Each constant is defined exactly once. The values below are the ones that
# were effectively in force before (the later of the duplicated assignments).

ID = 'ID'
TARGET = 'y'
NFOLDS = 4
SEED = 0
NROWS = None
DATA_DIR = "../data"
SUBMISSION_FILE = "{0}/sample_submission.csv".format(DATA_DIR)

# Convert the frames/series to plain arrays so the fold indices produced by
# KFold can be used for positional indexing.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)
X_test = np.array(X_test)

ntrain, D = X_train.shape
nval = X_val.shape[0]
ntest = X_test.shape[0]

# NOTE(review): this is the legacy sklearn.cross_validation.KFold signature
# (sample count first, n_folds=...); the resulting object is iterated directly
# to obtain (train_index, test_index) pairs.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

In [5]:

class SklearnWrapper(object):
    """Thin adapter giving an estimator class a ``train``/``predict`` interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) called as ``clf(**params)``; the result
        must provide ``fit(X, y)`` and ``predict(X)``.
    seed : int, default 0
        Value passed to the estimator as ``random_state``.
    params : dict or None, default None
        Keyword arguments for the estimator. ``None`` means "no extra
        arguments". The dict is copied, so the caller's object is never
        modified (``random_state`` is added to the copy only).
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy first: writing 'random_state' into the caller's dict would leak
        # state between runs, and item assignment on the None default raises.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, X_train, y_train):
        """Fit the wrapped estimator on ``X_train`` / ``y_train``."""
        self.clf.fit(X_train, y_train)

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.clf.predict(X)
    

class XgbWrapper(object):
    """Thin adapter giving ``xgb.train`` a ``train``/``predict`` interface.

    Parameters
    ----------
    seed : int, default 0
        Stored in the booster parameters under ``'seed'``.
    params : dict or None, default None
        Booster parameters. An optional ``'nrounds'`` entry (default 250) is
        removed from the parameters and used as the number of boosting rounds.
        The dict is copied, so the caller's object is never modified.
    """

    def __init__(self, seed=0, params=None):
        # Copy BEFORE touching anything: previously self.param aliased the
        # caller's dict, so 'nrounds' was popped from an unrelated copy and
        # stayed in the parameters handed to xgb.train, and 'seed' was written
        # into the caller's dict.
        params = dict(params) if params is not None else {}
        self.nrounds = params.pop('nrounds', 250)
        params['seed'] = seed
        self.param = params

    def train(self, X_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(X_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, X):
        """Return predictions of the trained booster for ``X``."""
        return self.gbdt.predict(xgb.DMatrix(X))
    

def get_oof(clf):
    """Produce out-of-fold predictions for the first stacking layer.

    For every fold of the module-level ``kf`` the model is trained on the
    fold's training rows of ``X_train``/``y_train``; it then predicts the
    held-out rows, the whole of ``X_test`` and the whole of ``X_val``.
    Test and validation predictions are averaged over the folds.

    Parameters
    ----------
    clf : object
        Model exposing ``train(X, y)`` and ``predict(X)``.

    Returns
    -------
    (oof_train, oof_val, oof_test)
        Column vectors of shape ``(ntrain, 1)``, ``(nval, 1)`` and
        ``(ntest, 1)``.
    """
    train_preds = np.zeros((ntrain,))
    test_fold_preds = np.empty((NFOLDS, ntest))
    val_fold_preds = np.empty((NFOLDS, nval))

    for fold, (fit_idx, holdout_idx) in enumerate(kf):
        X_holdout = X_train[holdout_idx]

        clf.train(X_train[fit_idx], y_train[fit_idx])

        train_preds[holdout_idx] = clf.predict(X_holdout)
        test_fold_preds[fold, :] = clf.predict(X_test)
        val_fold_preds[fold, :] = clf.predict(X_val)

    # average the per-fold predictions on the fixed test / validation sets
    test_preds = test_fold_preds.mean(axis=0)
    val_preds = val_fold_preds.mean(axis=0)
    return (train_preds.reshape(-1, 1),
            val_preds.reshape(-1, 1),
            test_preds.reshape(-1, 1))

In [6]:
def turing_xgb(N):
    """Random-search ``N`` XGBoost configurations and pickle every run to ``xgb.p``.

    Each iteration draws a parameter set from numpy's global generator, builds
    an ``XgbWrapper``, computes out-of-fold predictions with ``get_oof`` and
    scores the training out-of-fold predictions with mean squared error.

    The pickled object is a list of tuples
    ``(params, err, oof_train, oof_val, oof_test)``, one per iteration.

    Parameters
    ----------
    N : int
        Number of random configurations to evaluate.
    """
    best_err = 100000000
    results = []
    for _ in range(N):
        # The draw order below determines the random stream; do not reorder.
        params = {
            'colsample_bytree': np.random.uniform(0.01,1),
            'subsample': np.random.uniform(0.01,1),
            # log-uniform between 0.001 and 0.1
            'learning_rate': np.exp(np.random.uniform(np.log(0.001),np.log(0.1))),
            'objective': 'reg:linear',
            'max_depth': int(np.random.uniform(1,10)),
            'num_parallel_tree': int(np.random.uniform(1,3)),
            'min_child_weight': int(np.random.uniform(1,5)),
            'nrounds': int(np.random.uniform(300,800))
        }
        model = XgbWrapper(seed=SEED, params=params)
        oof_train, oof_val ,oof_test = get_oof(model)
        err = mean_squared_error(y_train, oof_train)
        if best_err > err:
            best_err = err
        print(err,best_err, params)
        results.append((params, err, oof_train, oof_val, oof_test))

    # Use a context manager so the file is flushed and closed deterministically.
    with open("xgb.p", 'wb') as fh:
        pickle.dump(results, fh)
    
def turing(clf,N,name):
    """Random-search ``N`` tree-ensemble configurations and pickle every run.

    Each iteration draws ``n_estimators``, ``max_features``, ``max_depth`` and
    ``min_samples_leaf`` from numpy's global generator, wraps ``clf`` in a
    ``SklearnWrapper``, computes out-of-fold predictions with ``get_oof`` and
    scores the training out-of-fold predictions with mean squared error.

    The results are written to ``"<name>.p"`` as a list of tuples
    ``(params, err, oof_train, oof_val, oof_test)``, one per iteration.

    Parameters
    ----------
    clf : callable
        Estimator class passed on to ``SklearnWrapper``.
    N : int
        Number of random configurations to evaluate.
    name : str
        Basename (without extension) of the output pickle file.
    """
    best_err = 100000000
    results = []
    for _ in range(N):
        # The draw order below determines the random stream; do not reorder.
        n_estimators = int(np.random.uniform(1,900))
        max_features = np.random.uniform(0.01,1)
        max_depth = int(np.random.uniform(1,10))
        min_samples_leaf = int(np.random.uniform(1,7))
        params = {
            'n_jobs': 8,
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
        }
        model = SklearnWrapper(clf=clf, seed=SEED, params=params)
        oof_train, oof_val ,oof_test = get_oof(model)
        err = mean_squared_error(y_train, oof_train)
        if best_err > err:
            best_err = err
        print(err,best_err, params)
        results.append((params, err, oof_train, oof_val, oof_test))

    # Use a context manager so the file is flushed and closed deterministically.
    with open("{}.p".format(name), 'wb') as fh:
        pickle.dump(results, fh)

In [7]:
# First layer: evaluate N random configurations per model family.
# Each call writes its runs to a pickle file (rf.p, et.p, xgb.p).
N = 100
turing(clf=RandomForestRegressor, N=N, name='rf')
turing(clf=ExtraTreesRegressor, N=N, name='et')
turing_xgb(N=N)

(100.21713377336098, 100.21713377336098, {'n_jobs': 8, 'min_samples_leaf': 5, 'n_estimators': 286, 'random_state': 0, 'max_features': 0.024499432235574173, 'max_depth': 5})
(76.199222999935003, 76.199222999935003, {'n_jobs': 8, 'min_samples_leaf': 3, 'n_estimators': 811, 'random_state': 0, 'max_features': 0.7649142272517786, 'max_depth': 5})
(81.929948152242801, 76.199222999935003, {'n_jobs': 8, 'min_samples_leaf': 1, 'n_estimators': 502, 'random_state': 0, 'max_features': 0.8861720323891933, 'max_depth': 8})
(77.244681393984877, 76.199222999935003, {'n_jobs': 8, 'min_samples_leaf': 6, 'n_estimators': 606, 'random_state': 0, 'max_features': 0.16160041019820734, 'max_depth': 8})
(77.271056158471509, 76.199222999935003, {'n_jobs': 8, 'min_samples_leaf': 1, 'n_estimators': 777, 'random_state': 0, 'max_features': 0.2976875139014636, 'max_depth': 6})
(76.10457496427118, 76.10457496427118, {'n_jobs': 8, 'min_samples_leaf': 4, 'n_estimators': 877, 'random_state': 0, 'max_features': 0.41532711

# Second Layer

In [8]:
# Load the first-layer runs. Each entry is a tuple
# (params, err, oof_train, oof_val, oof_test) as written by turing()/turing_xgb().
# NOTE: pickle.load can execute arbitrary code; only load the "*.p" files
# produced by this notebook, never files from an untrusted source.
with open('xgb.p', 'rb') as fh:
    xgb_data = pickle.load(fh)
with open('et.p', 'rb') as fh:
    et_data = pickle.load(fh)
with open('rf.p', 'rb') as fh:
    rf_data = pickle.load(fh)

L = 30  # number of best (lowest-error) runs kept per model family
N = 3   # number of model families: et, rf, xgb

# One column per selected run. Row counts come from the data loaded earlier
# (nval / ntest) instead of being hard-coded.
X_val_bar = np.zeros([nval, L*N])
X_test_bar = np.zeros([ntest, L*N])

# Fill the columns consecutively: best L of et, then rf, then xgb.
j = 0
for runs in (et_data, rf_data, xgb_data):
    for dt in sorted(runs, key=lambda x: x[1])[:L]:
        X_val_bar[:, j] = np.squeeze(dt[3])
        X_test_bar[:, j] = np.squeeze(dt[4])
        j += 1

In [9]:
def get_oof_2nd_layer(clf,X_train,y_train, X_val):
    """Produce out-of-fold predictions for the second stacking layer.

    A fresh shuffled K-fold split (``NFOLDS`` folds, seeded with ``SEED``) is
    built over the rows of ``X_train``. For each fold the model is trained on
    the fold's training rows, predicts the held-out rows and predicts all of
    ``X_val``; the ``X_val`` predictions are averaged over the folds.

    Parameters
    ----------
    clf : object
        Model exposing ``train(X, y)`` and ``predict(X)``.
    X_train, y_train : array-like
        Features and targets the folds are drawn from.
    X_val : array-like
        Additional feature matrix predicted by every fold model.

    Returns
    -------
    (oof_train, oof_val)
        Column vectors of shape ``(len(X_train), 1)`` and ``(len(X_val), 1)``.
    """
    n_rows = X_train.shape[0]
    n_extra = X_val.shape[0]
    holdout_preds = np.zeros((n_rows,))
    extra_fold_preds = np.empty((NFOLDS, n_extra))

    folds = KFold(n_rows, n_folds=NFOLDS, shuffle=True, random_state=SEED)
    for fold, (fit_idx, holdout_idx) in enumerate(folds):
        X_holdout = X_train[holdout_idx]
        clf.train(X_train[fit_idx], y_train[fit_idx])
        holdout_preds[holdout_idx] = clf.predict(X_holdout)
        extra_fold_preds[fold, :] = clf.predict(X_val)

    extra_preds = extra_fold_preds.mean(axis=0)
    return holdout_preds.reshape(-1, 1), extra_preds.reshape(-1, 1)

def turing_2nd_layer(clf,N):
    """Random-search ``N`` second-layer configurations and return the best test prediction.

    Each iteration draws a tree-ensemble parameter set from numpy's global
    generator, fits it on the stacked validation features (``X_val_bar`` /
    ``y_val``) via ``get_oof_2nd_layer`` and scores the out-of-fold
    predictions with mean squared error. The fold-averaged predictions on
    ``X_test_bar`` of the lowest-error configuration are returned.

    Parameters
    ----------
    clf : callable
        Estimator class passed on to ``SklearnWrapper``.
    N : int
        Number of random configurations to evaluate.

    Returns
    -------
    numpy.ndarray
        Column vector of test predictions from the best configuration.

    Raises
    ------
    ValueError
        If no configuration produced a usable error (for example ``N <= 0``).
        Previously this surfaced as an UnboundLocalError on ``best_y``.
    """
    best_err = float('inf')
    best_y = None
    for _ in range(N):
        # The draw order below determines the random stream; do not reorder.
        n_estimators = int(np.random.uniform(1,900))
        max_features = np.random.uniform(0.01,1)
        max_depth = int(np.random.uniform(1,10))
        min_samples_leaf = int(np.random.uniform(1,7))
        params = {
            'n_jobs': 8,
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
        }
        model = SklearnWrapper(clf=clf, seed=SEED, params=params)
        # Here the validation set plays the role of the training data, and
        # the test set the role of the extra matrix predicted by each fold.
        oof_val, oof_test = get_oof_2nd_layer(model, X_val_bar, y_val, X_test_bar)
        err = mean_squared_error(y_val, oof_val)
        if best_err > err:
            best_err = err
            best_y = oof_test
        print(err,best_err, params)

    if best_y is None:
        raise ValueError("turing_2nd_layer: no configuration was evaluated successfully (N=%r)" % (N,))
    return best_y

In [10]:
# Second layer: try 40 random forest configurations on the stacked features
# and keep the test predictions of the best one.
best_y = turing_2nd_layer(clf=RandomForestRegressor, N=40)

(63.237000186570711, 63.237000186570711, {'n_jobs': 8, 'min_samples_leaf': 3, 'n_estimators': 236, 'random_state': 0, 'max_features': 0.05511468941217113, 'max_depth': 6})
(63.809119275198931, 63.237000186570711, {'n_jobs': 8, 'min_samples_leaf': 3, 'n_estimators': 274, 'random_state': 0, 'max_features': 0.3270563169499595, 'max_depth': 7})
(63.674965301769006, 63.237000186570711, {'n_jobs': 8, 'min_samples_leaf': 4, 'n_estimators': 447, 'random_state': 0, 'max_features': 0.06155342550859197, 'max_depth': 7})
(63.328195045889267, 63.237000186570711, {'n_jobs': 8, 'min_samples_leaf': 1, 'n_estimators': 716, 'random_state': 0, 'max_features': 0.5476909021001328, 'max_depth': 3})
(63.956392400755909, 63.237000186570711, {'n_jobs': 8, 'min_samples_leaf': 4, 'n_estimators': 461, 'random_state': 0, 'max_features': 0.9548000436047506, 'max_depth': 2})
(64.114115347578846, 63.237000186570711, {'n_jobs': 8, 'min_samples_leaf': 5, 'n_estimators': 519, 'random_state': 0, 'max_features': 0.2837422

In [11]:
# Write the stacked test predictions into the sample-submission layout.
submission = pd.read_csv(SUBMISSION_FILE)
# best_y is an (n, 1) column vector (get_oof_2nd_layer returns reshape(-1, 1));
# flatten it explicitly instead of relying on pandas to coerce a 2-D array
# into a single column.
submission.iloc[:, 1] = np.ravel(best_y)
submission.to_csv('stacking_sub.csv', index=False)