In [2]:
%store -r train_test_concat_norm
%store -r ntrain
%store -r y_train

In [3]:
train = train_test_concat_norm[:ntrain]
print("train shape: ", train.shape)
test = train_test_concat_norm[ntrain:]
print("test shape: ", test.shape)

train shape:  (1460, 214)
test shape:  (1459, 214)


In [86]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import numpy as np 

scaler = MinMaxScaler()
scaler.fit(train)                
t_train = scaler.transform(train)
pca_hp = PCA(30)
train_fit = pca_hp.fit_transform(t_train)
print(pca_hp.explained_variance_ratio_)  

[0.13308861 0.07629765 0.05018999 0.04114579 0.0366458  0.03123351
 0.02825337 0.02794339 0.02694442 0.02370181 0.02251647 0.02086496
 0.01970315 0.01757088 0.01670338 0.01597611 0.01428365 0.01383827
 0.01288473 0.01280032 0.01195665 0.01082842 0.01066339 0.01015467
 0.00999883 0.00930671 0.0089928  0.00862306 0.00835598 0.00812948]


In [87]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(test)                
t_test = scaler.transform(test)
pca_hp = PCA(30)
test_fit = pca_hp.fit_transform(t_test)
print(pca_hp.explained_variance_ratio_)  

[0.13822218 0.08049612 0.04804562 0.04274775 0.03534637 0.0313161
 0.02996787 0.02634788 0.02586918 0.02302508 0.02168758 0.02097429
 0.02042911 0.01758363 0.01700897 0.01506977 0.01456104 0.01323929
 0.01305567 0.011799   0.01147923 0.01099419 0.01061986 0.01000103
 0.00964619 0.0095064  0.00905619 0.00863647 0.00827032 0.00805976]


### Split training data 

In [125]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
                                   train, y_train, random_state=42, test_size=.33)

### Cross validation

In [126]:
#Try with k-fold cross validation
from sklearn.model_selection import *
from sklearn import linear_model
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.metrics import *
import pandas as pd

In [127]:
kf = KFold(n_splits=10, random_state=None)
kf.get_n_splits(X_train)

10

### Elastic

In [91]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# Elastic Net Regression 
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [9]:
score = cross_val_score(ENet,X_train,Y_train, cv=kf )
print("score %i:  " %i, score)
print ("Folds: %i, mean_score: %.2f, std:%.2f" %(len(score),np.mean(np.abs(score)),np.std(score)))

NameError: name 'X_train2' is not defined

In [None]:
predict = cross_val_predict(ENet,X_train,Y_train, cv=kf)
print("Mean squared Error : " + str(mean_squared_error(Y_train,predict)))

### Gradient boosting regression

In [92]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

 
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [None]:
score = cross_val_score(GBoost,X_train,Y_train, cv=kf )
print("score %i:  " %i, score)
print ("Folds: %i, mean_score: %.2f, std:%.2f" %(len(score),np.mean(np.abs(score)),np.std(score)))

In [None]:
predict = cross_val_predict(GBoost,X_train,Y_train, cv=kf)
print("Mean squared Error : " + str(mean_squared_error(Y_train,predict)))

### LightGBM

In [93]:
# LightGBM
import lightgbm as lgb
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [None]:
score = cross_val_score(model_lgb,X_train,Y_train, cv=kf )
print("score %i:  " %i, score)
print ("Folds: %i, mean_score: %.2f, std:%.2f" %(len(score),np.mean(np.abs(score)),np.std(score)))

In [None]:
predict = cross_val_predict(model_lgb,X_train,Y_train, cv=kf)
print("Mean squared Error : " + str(mean_squared_error(Y_train,predict)))

### Average model


In [119]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)

In [120]:
averaged_models = AveragingModels(models = (ENet, GBoost, model_lgb))

- Fit with train test split

In [128]:
averaged_models.fit(X_train, Y_train)

AveragingModels(models=(Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('elasticnet', ElasticNet(alpha=0.0005, copy_X=True, fit_intercept=True, l1_ratio=0.9,
      max_iter=1000, normalize=False, positive=False...0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)))

In [129]:
print ("Test Score: \n", averaged_models.score(X_test, Y_test))

Test Score: 
 0.9140506887078994


In [130]:
predictions = averaged_models.predict(X_test)
print("Mean squared Error : " + str(mean_squared_error(Y_test,predictions)))

Mean squared Error : 0.014648460018473045


- Fit with cross validation

In [109]:
score = cross_val_score(averaged_models,train,y_train, cv=kf )

# Score model
print("score: ", score)
print ("Folds: %i, mean_score: %.2f, std:%.2f" %(len(score),np.mean(np.abs(score)),np.std(score)))

score:  [0.91320345 0.94334892 0.94734494 0.86212771 0.90542481 0.91819866
 0.91701477 0.93326719 0.87030511 0.90461363]
Folds: 10, mean_score: 0.91, std:0.03


In [None]:
predict = cross_val_predict(averaged_models,train,y_train, cv=kf)
print("Mean squared Error : " + str(mean_squared_error(y_train,predict)))

### Test with Neuron network instead of average model 

### Make a submission

In [131]:
%store -r test_Id
submission = pd.DataFrame()
submission["Id"] = test_Id

In [132]:
# Apply PCA to the test dataset 
prediction = averaged_models.predict(test)

In [100]:
# Return normal value after log transform the target in training
final_predictions = np.exp(prediction)

In [101]:
submission['SalePrice'] = final_predictions

In [102]:
print(final_predictions[:10,])

[140046.45230726 229895.37260357 165598.64991275 186468.99959039
 196791.75398933 198502.63309164 143251.53373982 177213.7351576
 158858.63903032 136524.7610255 ]


In [103]:
submission.to_csv('submission_averged_models_Averaging_CrossV.csv', index=False)