In [1]:
###############################################################################
############################### Packages ######################################
import numpy as np
from scipy.stats import norm, skew, skewtest
import pandas as pd
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVR
%matplotlib inline
# display/HTML are imported from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
### Load the train/test tables and keep only the non-object columns.
train = pd.read_csv('new_train2.csv')
test = pd.read_csv('new_test2.csv')

# Object-typed columns are detected on the training frame only; the same
# column names are then removed from both frames.
obj = list(train.select_dtypes(include=["object"]).columns)
train = train.drop(obj, axis=1)
test = test.drop(obj, axis=1)

In [3]:
# Report (rows, columns) of the training frame after the object-typed columns were dropped.
print("Train shape is:", train.shape)

Train shape is: (1451, 228)


In [4]:
# Report (rows, columns) of the test frame after the object-typed columns were dropped.
print("Test shape is:", test.shape)

Test shape is: (1459, 228)


In [5]:
### Separate the target column from the predictors.
y_train = train.loc[:, 'SalePrice']
x_train = train.drop(["SalePrice"], axis=1)
# The test frame carries a "SalePrice" column as well; it is dropped so that
# x_test has the same columns removed as x_train.
x_test = test.drop(["SalePrice"], axis=1)

In [6]:
# Sanity check: dimensions of the training predictors once "SalePrice" is removed.
x_train.shape

(1451, 227)

In [7]:
# Sanity check: the target is a single column with one entry per training row.
y_train.shape

(1451,)

In [8]:
# Sanity check: dimensions of the test predictors once "SalePrice" is removed.
x_test.shape

(1459, 227)

# Base Models and Cross Validation

Some of the code below is adapted from https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

In [9]:
#### Cross-validation helper
n_folds = 5

def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training data.

    The model is scored with ``cross_val_score`` on the notebook-level
    ``x_train`` / ``y_train`` using ``n_folds`` folds and the
    ``neg_mean_squared_error`` scorer. The negated scores are flipped back to
    MSE and square-rooted.

    NOTE(review): this is a plain RMSE on whatever scale ``y_train`` is on. It
    is an RMSLE only if ``SalePrice`` was log-transformed upstream, which the
    ``np.exp(...) - 1`` back-transform later in the notebook suggests --
    confirm against the preprocessing step.

    Parameters
    ----------
    model : estimator
        Any object accepted by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        x_train.values,
        y_train,
        scoring="neg_mean_squared_error",
        cv=n_folds,
    )
    return np.sqrt(-neg_mse_per_fold)

In [10]:
### Lasso Regression
# An alternative wrapping the model as
# make_pipeline(RobustScaler(), Lasso(alpha=0.0005)) is currently disabled.
lasso = Lasso(alpha=0.0005)
score_lasso = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(np.mean(score_lasso), np.std(score_lasso)))


Lasso score: 0.1109 (0.0051)



In [11]:
### Kernel Ridge Regression (polynomial kernel, degree 2)
KRR = KernelRidge(kernel='polynomial', degree=2, coef0=2, alpha=0.5)
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

Kernel Ridge score: 0.2327 (0.0277)



In [12]:
### Elastic Net
# A grid search over
#   alpha    in [0.001, 0.00025, 0.0005, 0.00075, 0.0001]
#   l1_ratio in [0.1, 0.5, 0.9, 0.99]
# was sketched here with GridSearchCV(ElasticNet(), param_grid=param_grid, cv=5)
# but is disabled.
# NOTE(review): the disabled snippet never called grid.fit(...), so reading
# grid.best_params_ would have failed if it were re-enabled as written.
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.99),
)
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

ElasticNet score: 0.1105 (0.0053)



In [14]:
### LightGBM
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    # boosting schedule
    n_estimators=720,
    learning_rate=0.05,
    # tree / histogram settings
    num_leaves=5,
    max_bin=55,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
    # bagging and feature-fraction settings, with fixed seeds
    bagging_fraction=0.8,
    bagging_freq=5,
    bagging_seed=9,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
)
score_lgb = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n".format(np.mean(score_lgb), np.std(score_lgb)))

LGBM score: 0.1152 (0.0050)



In [None]:
### Gradient Boosting
GBoost = GradientBoostingRegressor(
    loss='huber',
    # boosting schedule
    n_estimators=3000,
    learning_rate=0.05,
    # tree settings
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    # fixed seed for the estimator
    random_state=5,
)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

In [None]:
### XGBoost
# `silent` and `nthread` are legacy keyword spellings in the scikit-learn
# wrapper; `verbosity=0` (no messages) and `n_jobs=-1` (all cores) are used
# here as their current equivalents.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state=7, n_jobs=-1)
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# Stacking Base Models

In [18]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor whose prediction is the unweighted mean of its base models.

    Parameters
    ----------
    models : sequence of estimators
        Base regressors to average. They are never fitted directly; ``fit``
        trains clones, so the objects passed in stay untouched.

    Attributes
    ----------
    models_ : list
        Fitted clones of ``models``; created by ``fit``.

    NOTE(review): ``TransformerMixin`` is inherited but no ``transform`` method
    is defined on this class. The base class is kept only so existing
    ``isinstance`` checks keep working.
    """

    def __init__(self, models):
        # Stored as given, without validation; checks happen in ``fit``.
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on ``(X, y)``.

        Raises
        ------
        ValueError
            If ``models`` is empty. Without this check the failure would only
            surface later, inside ``np.column_stack`` during ``predict``.

        Returns
        -------
        self
        """
        if len(self.models) == 0:
            raise ValueError("AveragingModels requires at least one base model.")

        # Clone first so repeated fits always start from unfitted estimators.
        self.models_ = [clone(x) for x in self.models]

        # Train the cloned base models.
        for model in self.models_:
            model.fit(X, y)

        return self

    def predict(self, X):
        """Return the mean of the fitted base models' predictions for ``X``."""
        # One column per base model, then average across columns.
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [19]:
def rmsle(y, y_pred):
    """Return the square root of ``mean_squared_error(y, y_pred)``.

    NOTE(review): despite the name, no logarithm is applied here. The result
    is an RMSLE only when both arguments are already on a log scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# Prediction

In [25]:
# Read test.csv to obtain the "Id" column that goes into the submission frame.
dftest = pd.read_csv('test.csv')
ID_test = dftest.loc[:, 'Id']

In [26]:
### Blend the Lasso and LightGBM base models by averaging, then cross-validate the blend
averaged_models = AveragingModels(models=(lasso, model_lgb))
score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score)))

 Averaged base models score: 0.1089 (0.0053)



In [27]:
### In-sample error: root mean squared error on the training set
# fit() returns the estimator itself, so fitting and predicting can be chained.
averaged_train_pred = averaged_models.fit(x_train, y_train).predict(x_train)
print(rmsle(y_train, averaged_train_pred))

0.0826726630284


In [28]:
# Predict on the test predictors with the fitted averaging ensemble.
averaged_test_pred = averaged_models.predict(x_test)
# Undo the log transform: expm1(x) is exp(x) - 1 computed without the explicit
# subtraction, which loses precision when exp(x) is close to 1.
# NOTE(review): presumably the target was log1p-transformed upstream -- confirm.
y_pred = np.expm1(averaged_test_pred)

# Create a Submission CSV (scored 0.11688 on Kaggle)

In [29]:
# Pair each test Id with its predicted sale price.
sub = pd.DataFrame({"Id": ID_test, "SalePrice": y_pred})

In [30]:
# Preview the first five submission rows (head() defaults to 5).
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,117742.629277
1,1462,157721.761903
2,1463,183006.399993
3,1464,196926.147708
4,1465,195862.431726


In [31]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub.to_csv('submission_final.csv',index=False)