In [51]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV, RANSACRegressor, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split




In [52]:

class StackingEstimator(BaseEstimator, TransformerMixin):

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed


In [93]:


train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
best_results = pd.read_csv('best_results.csv')

for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))



n_comp = 12
random_state=42

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=random_state)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=random_state)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=random_state)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=random_state)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=random_state)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

#save columns list before adding the decomposition components

usable_columns = list(set(train.columns) - set(['y']))

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

#usable_columns = list(set(train.columns) - set(['y']))

y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays)
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

In [135]:


'''Train the xgb model then predict the test data'''

xgb_params = {
    'n_trees': 520,
    'eta': 0.0045,
    'max_depth': 7,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
}
# NOTE: Make sure that the class is labeled 'class' in the data file

num_boost_rounds = 1250
# train model

'''Train the stacked models then predict the test data'''

stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),    
#     StackingEstimator(RANSACRegressor(random_state=42)),
#     StackingEstimator(LinearRegression()),
    LassoLarsCV(),
)



In [136]:
from sklearn.svm import OneClassSVM
onv = OneClassSVM()
onv.fit(finaltrainset)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.5, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [137]:
pred = onv.predict(finaltrainset)
pred
finaltrainset2 = finaltrainset[pred == 1]
y_train2 = y_train[pred == 1]

In [138]:
res = []
for _ in range(5):
    print(_)
    train_X, eval_X, train_y, eval_y = train_test_split(finaltrainset2, y_train2, random_state=42)

    dtrain = xgb.DMatrix(train_X, train_y)
    dtest = xgb.DMatrix(eval_X)
    stacked_pipeline.fit(train_X, train_y)
    results = np.exp(stacked_pipeline.predict(train_X))

    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtrain)
    '''R2 Score on the entire Train data when cross validation'''

    print('R2 score on train data:')
    print("train : ", r2_score(train_y,stacked_pipeline.predict(train_X)*0.2855 + model.predict(dtrain)*0.7145))
    r = r2_score(eval_y,stacked_pipeline.predict(eval_X)*0.2855 + model.predict(dtest)*0.7145)
#     r2s = r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145)
    print("test : ", r)
    res.append(r)
    print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-")
print(np.array(r).mean())

0
R2 score on train data:
train :  0.787595012822
test :  0.559587150016
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
1
R2 score on train data:
train :  0.787645699884
test :  0.559665293469
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
2
R2 score on train data:
train :  0.787503641054
test :  0.55966444972
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
3
R2 score on train data:
train :  0.787632946334
test :  0.559379566467
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
4
R2 score on train data:
train :  0.787575729956
test :  0.559794368846
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
0.559794368846


* 5 CV without outlier :  0.547635540729
* 5 CV with outlier : 0.540110524491
* 5 CV with less model and using outlier : 0.547530554253
* 5 CV with less model and without outlier : 0.56446305843

In [127]:
dtrain = xgb.DMatrix(train.drop('y', axis=1)[pred == 1], y_train[pred == 1])
dtest = xgb.DMatrix(test)

model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
stacked_pipeline.fit(finaltrainset[pred == 1], y_train[pred == 1])

Pipeline(steps=[('stackingestimator-1', StackingEstimator(estimator=LassoLarsCV(copy_X=True, cv=None, eps=2.2204460492503131e-16,
      fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=1,
      normalize=True, positive=False, precompute='auto', verbose=False))), ('stackingestimator-2', StackingEst...x_n_alphas=1000, n_jobs=1,
      normalize=True, positive=False, precompute='auto', verbose=False))])

In [133]:
y_pred = model.predict(dtrain)
results = stacked_pipeline.predict(finaltrainset[pred == 1])
print("train : ", r2_score(y_train[pred == 1],results*0.2855 + y_pred*0.7145))

train :  0.735429086847


* train without outlier :  0.747116600835
* train with outlier :  0.706698820141
* train with less model and without outlier :  0.735429086847

In [134]:
'''Average the preditionon test data  of both models then save it on a csv file'''
y_pred = model.predict(dtest)
results = stacked_pipeline.predict(finaltestset)
print(r2_score(best_results.y, y_pred*0.75 + results*0.25))

0.963386816037


* r2 without outlier model : 0.95291727524
* r2 with outlier model : 0.992047644027
* r2 without outlier and less model : 0.963386816037

In [64]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models2.csv', index=False)