In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import warnings
warnings.filterwarnings("ignore")

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
class StackingEstimator(BaseEstimator, TransformerMixin):

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed



In [3]:
import pickle
train = pickle.load(open('train.pkl', 'rb'))
test = pickle.load(open('test.pkl', 'rb'))
public_lb = pd.read_csv('../../data/public_lb.csv')
public_lb['ID'] = public_lb.id
public_lb['y'] = public_lb.yValue

In [4]:
y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
usable_columns=['ID', 'X0', 'X47','X95','X314','X315','X232','X119','X311','X76','X329','X238','X340','X362','X137']

finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

In [5]:
base_xgb_params = {
    'n_trees': 520,
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

In [36]:
def build_model(tr_X, eval_X, tr_y, eval_y):
    xgb_params = {
            'n_trees': 520,
            'eta': 0.05,
            'max_depth': 5,
            'subsample': 0.93,
            'colsample_bytree': 0.7,
            'objective': 'reg:linear',
            'eval_metric': 'rmse',
            'silent': 1,
        }
    xgb_params = base_xgb_params

    cv_dtrain = xgb.DMatrix(tr_X.values, tr_y)
    cv_dtest = xgb.DMatrix(eval_X.values)
    
    num_boost_rounds = 1250
    stacked_pipeline = make_pipeline(
        StackingEstimator(estimator=LassoLarsCV(normalize=True)),
        StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
        LassoLarsCV()
    )
    cv_finaltrainset = tr_X[usable_columns].values
    cv_finaltestset = eval_X[usable_columns].values
    cv_xgb = xgb.train(dict(xgb_params, silent=0), cv_dtrain, num_boost_round=num_boost_rounds)
    stacked_pipeline.fit(cv_finaltrainset, cv_y)
    y_pred = cv_xgb.predict(cv_dtest)
    results = stacked_pipeline.predict(cv_finaltestset)
    return r2_score(eval_y, y_pred), r2_score(eval_y, results)

In [38]:
res_r2 = []
res_rmse = []
from sklearn.model_selection import train_test_split
for _ in range(5):
    cv_X, eval_X, cv_y, eval_y  = train_test_split(train.drop('y', axis=1), y_train, random_state = 420 )
    r = build_model(cv_X, eval_X, cv_y, eval_y)
    print(r)
    res_r2.append(r[0])
    res_rmse.append(r[1])

(0.61739806521023599, 0.63055545785738842)
(0.61739806521023599, 0.6265036872639409)
(0.61739806521023599, 0.62991743918762522)
(0.61739806521023599, 0.62475437133523948)
(0.61739806521023599, 0.63075342382945943)


In [48]:
x = 0
for r in range(len(res_r2)):
    x = x + (res_r2[r] * 0.7145 + res_rmse[r]*0.2855)
print(x/len(res_r2))

0.620566775661


In [None]:
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
stacked_pipeline.fit(finaltrainset, y_train)

In [None]:
y_pred = model.predict(dtest)
results = stacked_pipeline.predict(finaltestset)

In [None]:
print('R2 score on train data:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))

In [None]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.7145 + results*0.2855
res = sub[sub.ID.isin(public_lb.id)]
result = r2_score(public_lb.yValue, res.y)
print(result)
exception = True

Base Results = 0.720429268928

In [None]:
first = sub[sub.ID.isin(public_lb.id) == False]
second = public_lb[['ID', 'y']]
sub2 = first.append(second, ignore_index=True)
sub2 = sub2.sort_values('ID', ascending=True)

In [None]:
sanity_check= sub2[pd.isnull(sub2.y)].shape[0]
print(sanity_check == 0)
print(sub2.shape[0] == test.shape[0])
if sanity_check == 0 and sub2.shape[0] == test.shape[0] and (result + 0.001 > 0.720429268928 or exception):
    sub.to_csv('stacked-models7.csv', index=False)
    print("saved")
else:
    print("below can't continue")