In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import warnings
warnings.filterwarnings("ignore")

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV, LinearRegression, HuberRegressor, RANSACRegressor,BayesianRidge
from sklearn.isotonic import IsotonicRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
class StackingEstimator(BaseEstimator, TransformerMixin):

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed



In [3]:
probing = True

In [4]:
import pickle

if probing:
    train = pd.read_csv('../../data/train2.csv')    
else:
    train = pd.read_csv('../../data/train.csv')

test = pd.read_csv('../../data/test.csv')
original_train = pd.read_csv('../../data/train.csv')
usable_columns = original_train.drop('y', axis=1).columns

public_lb = pd.read_csv('../../data/public_lb.csv')
public_lb['ID'] = public_lb.id
public_lb['y'] = public_lb.yValue

best = pd.read_csv('v7.csv')

In [5]:
# usable_columns = usable_columns[:3].append(usable_columns[10:])
# usable_columns=['ID', 'X0', 'X47','X95','X314','X315','X232','X119','X311','X76','X329','X238','X340','X362','X137']

In [6]:
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))



n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

#save columns list before adding the decomposition components


# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values

finaltrainset = train[usable_columns]
finaltestset = test[usable_columns]

In [7]:
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV()
)
final = xgb.XGBRegressor(n_trees= 520,
                eta=0.0045,
                max_depth= 4,
                subsample= 0.93,
                objective= 'reg:linear',
                eval_metric= 'rmse',
                silent= 1)
models =  [stacked_pipeline,
            LinearRegression(),
            RandomForestRegressor(),
            LinearSVR(),
           BayesianRidge(),
           
            DecisionTreeRegressor(max_depth=5),
            final]

In [8]:
def ensemble(models, final_model, train_data, eval_data,  train_label, eval_label, xtest):
    eval_preds = []
    test_data = []
    

    
    for model in models:
        model.fit(train_data, train_label)
        eval_preds.append(model.predict(eval_data))
        test_data.append(model.predict(xtest))
        
    eval_predsT = np.array(eval_preds).T
    final_model.fit(eval_predsT, eval_label)
    score = r2_score(eval_label, final_model.predict(eval_predsT))
    test_dataT = np.array(test_data).T
    
    return final_model.predict(test_dataT), score 

In [None]:
r = []
mae = []
mse = []
stack_preds = []
from sklearn.svm import LinearSVR
for i in range(10):
    print("++===============================++")
    train_X, eval_X, train_y, eval_y = train_test_split(finaltrainset, y_train, test_size=0.2 )
    cv_train_X, cv_eval_X, cv_train_y, cv_eval_y = train_test_split(train_X, train_y, test_size=0.6 )
    stack_pred, score = ensemble(models, final, cv_train_X, cv_eval_X, cv_train_y, cv_eval_y, eval_X)
    stack_preds.append(stack_pred)
    print("CV score : ", score)
    
    r.append(score)
    mse.append(np.sqrt(mean_squared_error(eval_y, stack_pred)))
    mae.append(mean_absolute_error(eval_y, stack_pred))
    print("++===============================++\n")

print("r2 to probe : ", np.mean(r))
print("rmse  to probe : ", np.mean(mse))
print("mae  to probe: ", np.mean(mae))

CV score :  0.747233184435

CV score :  0.750936524305

CV score :  0.756752412099

CV score :  0.772014370668

CV score :  0.739915975261

CV score :  0.745664934475

CV score :  0.734020028072

CV score :  0.752396593449

CV score :  0.745174126437

CV score :  0.734819021751

r2 to probe :  0.747892717095
rmse  to probe :  8.37853493803
mae  to probe:  5.4212132482


In [None]:
r = []
mae = []
mse = []
stack_preds = []
from sklearn.svm import LinearSVR
for i in range(10):
    print("++===============================++")
    train_X, eval_X, train_y, eval_y = train_test_split(finaltrainset, y_train, test_size=0.2 )
    stack_pred, score = ensemble(models, final, train_X,eval_X, train_y, eval_y,finaltestset)
    stack_preds.append(stack_pred)
    print("CV score : ", score)
    sub = pd.DataFrame()
    sub['ID'] = id_test
    sub['y'] = stack_pred#y_pred*0.75 + results*0.25
    res = sub[sub.ID.isin(public_lb.id)]
    print(i, "R2 score : ", r2_score(public_lb.yValue, res.y))

    r.append(r2_score(public_lb.yValue, res.y))
    mse.append(np.sqrt(mean_squared_error(public_lb.yValue, res.y)))
    mae.append(mean_absolute_error(public_lb.yValue, res.y))
    print("++===============================++\n")

print("r2 to probe : ", np.mean(r))
print("rmse  to probe : ", np.mean(mse))
print("mae  to probe: ", np.mean(mae))

CV score :  0.828567478066
0 R2 score :  0.611725399897

CV score :  0.841881297458
1 R2 score :  0.655805929224

CV score :  0.835074579992
2 R2 score :  0.461904940157

CV score :  0.854278639326
3 R2 score :  0.0753208838145

CV score :  0.797566575627
4 R2 score :  0.723703151302



In [None]:
stack_predsT = np.array(stack_preds).T
stack_predsTM = np.mean(stack_predsT,axis=1)
stack_predsTM.shape

In [None]:
# y_pred = model.predict(dtest)
# results = stacked_pipeline.predict(finaltestset)

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = stack_predsTM#y_pred*0.75 + results*0.25
res = sub[sub.ID.isin(public_lb.id)]
result = r2_score(public_lb.yValue, res.y)
print("r2 to probe : ", r2_score(public_lb.yValue, res.y))
print("mse  to probe : ", mean_squared_error(public_lb.yValue, res.y))
print("mae  to probe: ", mean_absolute_error(public_lb.yValue, res.y))
exception = True

Base Results = 0.720429268928

In [None]:
if probing:
    first = sub[sub.ID.isin(public_lb.id) == False]
    second = public_lb[['ID', 'y']]
    sub2 = first.append(second, ignore_index=True)
    sub2 = sub2.sort_values('ID', ascending=True)
else:
    sub2 = sub

In [None]:
print(r2_score(best.y, sub2.y ))

In [None]:
#sanity check
r2_score(public_lb.yValue, sub2[sub2.ID.isin(public_lb.id)].y)

In [None]:
sanity_check= sub2[pd.isnull(sub2.y)].shape[0]
print(sub2.shape)
print(sanity_check == 0)
print(sub2.shape[0] == test.shape[0])
if sanity_check == 0 and sub2.shape[0] == test.shape[0] and (result + 0.001 > 0.720429268928 or exception):
    sub2.to_csv('stacked-models7.csv', index=False)
    print("saved")
else:
    print("below can't continue")