In [35]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array

In [25]:
import warnings
warnings.filterwarnings("ignore")

## 6. Other models

In [2]:
# data
dt_model = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_preprocess.csv")

In [3]:
# remove ohe
dt_model_features = dt_model.drop(dt_model.filter(regex = "Encode_ohe").columns, axis = 1)
dt_model_features.shape

(8418, 1364)

In [4]:
# X, y, ID
X_train_all = dt_model_features.loc[dt_model_features["IsTrainTest"] == "train"].drop(["y", "IsTrainTest"], axis = 1)
X_test = dt_model_features.loc[dt_model_features["IsTrainTest"] == "test"].drop(["y", "IsTrainTest"], axis = 1)
y_train_all = dt_model_features.loc[dt_model_features["IsTrainTest"] == "train"].y.values
y_test = dt_model_features.loc[dt_model_features["IsTrainTest"] == "test"].y.values
ID_train_all = dt_model_features.loc[dt_model_features["IsTrainTest"] == "train"].ID.values
ID_test = dt_model_features.loc[dt_model_features["IsTrainTest"] == "test"].ID.values

In [10]:
# stack
class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed

### 6.1 Linear

In [44]:
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator = LassoLarsCV(normalize = True, cv = 5))
    , StackingEstimator(estimator = GradientBoostingRegressor(learning_rate = 0.001, loss = "huber", max_depth = 3, max_features = 0.55
                                                              , min_samples_leaf = 18, min_samples_split = 14, subsample = 0.7))
    , LassoLarsCV(cv = 5)

)

In [45]:
k = 10
# bin
bin_y = pd.qcut(y_train_all, k, labels = [i for i in range(1, k + 1)]).astype("int64")
# stratified kfold
skf = StratifiedKFold(n_splits = k, shuffle = True, random_state = 888)

score_skf_valid = 0
score_skf = 0
preds_skf_test = []
for i, (ind_train, ind_valid) in enumerate(skf.split(X_train_all, bin_y)):
    # X, y
    X_train, X_valid = X_train_all.iloc[ind_train], X_train_all.iloc[ind_valid]
    y_train, y_valid = y_train_all[ind_train], y_train_all[ind_valid]
    
    stacked_pipeline.fit(X_train, y_train)
    
    preds_valid = stacked_pipeline.predict(X_valid)
    score_skf_valid = r2_score(y_valid, preds_valid)
    print('Fold %d: Score %f'%(i, score_skf_valid))

    score_skf += score_skf_valid

score_skf /= k
print('=====================')

print( 'Final Score %f'%score_skf)

print('=====================')

Fold 0: Score 0.522337
Fold 1: Score 0.634774
Fold 2: Score 0.633094
Fold 3: Score 0.601006
Fold 4: Score 0.554288
Fold 5: Score 0.395096
Fold 6: Score 0.552692
Fold 7: Score 0.582648
Fold 8: Score 0.608986
Fold 9: Score 0.610909
Final Score 0.569583


### Submit

In [47]:
# predict
y_test = stacked_pipeline.predict(X_test)
y_test[:10]

array([  79.90089004,   94.10677142,   79.28796135,   79.37421186,
        111.68541258,   94.06358613,  111.68541258,   94.09354929,
        115.27120233,   94.23010788])

In [48]:
# submit
dt_submit = pd.DataFrame({"ID": ID_test, "y": y_test})

In [49]:
dt_submit.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/submission/16_Lasso_GBR_Lasso.csv", index = False)